From cad1eb55f165768f79acc319dfd7e22952ef7878 Mon Sep 17 00:00:00 2001 From: rlorenzo Date: Tue, 17 Dec 2024 13:37:58 +0100 Subject: [PATCH] Add code without changes --- .gitignore | 7 + Dockerfile | 58 + README.md | 25 +- configs/_base_/datasets/300w.py | 384 ++++ configs/_base_/datasets/aflw.py | 83 + configs/_base_/datasets/aic.py | 140 ++ configs/_base_/datasets/aic_info.py | 140 ++ configs/_base_/datasets/animalpose.py | 166 ++ configs/_base_/datasets/ap10k.py | 142 ++ configs/_base_/datasets/ap10k_info.py | 142 ++ configs/_base_/datasets/atrw.py | 144 ++ configs/_base_/datasets/coco.py | 181 ++ configs/_base_/datasets/coco_wholebody.py | 1154 ++++++++++ .../_base_/datasets/coco_wholebody_face.py | 448 ++++ .../_base_/datasets/coco_wholebody_hand.py | 147 ++ .../_base_/datasets/coco_wholebody_info.py | 1154 ++++++++++ configs/_base_/datasets/cofw.py | 134 ++ configs/_base_/datasets/crowdpose.py | 147 ++ configs/_base_/datasets/deepfashion_full.py | 74 + configs/_base_/datasets/deepfashion_lower.py | 46 + configs/_base_/datasets/deepfashion_upper.py | 60 + configs/_base_/datasets/fly.py | 237 ++ configs/_base_/datasets/freihand2d.py | 144 ++ configs/_base_/datasets/h36m.py | 152 ++ configs/_base_/datasets/halpe.py | 1157 ++++++++++ configs/_base_/datasets/horse10.py | 201 ++ configs/_base_/datasets/interhand2d.py | 142 ++ configs/_base_/datasets/interhand3d.py | 487 ++++ configs/_base_/datasets/jhmdb.py | 129 ++ configs/_base_/datasets/locust.py | 263 +++ configs/_base_/datasets/macaque.py | 183 ++ configs/_base_/datasets/mhp.py | 156 ++ configs/_base_/datasets/mpi_inf_3dhp.py | 132 ++ configs/_base_/datasets/mpii.py | 155 ++ configs/_base_/datasets/mpii_info.py | 155 ++ configs/_base_/datasets/mpii_trb.py | 380 ++++ configs/_base_/datasets/ochuman.py | 181 ++ configs/_base_/datasets/onehand10k.py | 142 ++ configs/_base_/datasets/panoptic_body3d.py | 160 ++ configs/_base_/datasets/panoptic_hand2d.py | 143 ++ configs/_base_/datasets/posetrack18.py | 176 ++ configs/_base_/datasets/rhd2d.py | 141 ++ configs/_base_/datasets/wflw.py | 582 +++++ configs/_base_/datasets/zebra.py | 64 + configs/_base_/default_runtime.py | 19 + configs/_base_/filters/gausian_filter.py | 0 configs/detection/yolo_classes.py | 84 + configs/detection/yolov3_d53_320_273e_coco.py | 140 ++ configs/pose/ViTPose_base_coco_256x192.py | 170 ++ .../pose/ViTPose_base_simple_coco_256x192.py | 171 ++ configs/pose/ViTPose_small_coco_256x192.py | 170 ++ configs/pose3d/MB_ft_h36m.yaml | 50 + gafa_utils.py | 448 ++++ launch.sh | 29 + lib/data/augmentation.py | 99 + lib/data/datareader_h36m.py | 136 ++ lib/data/datareader_mesh.py | 59 + lib/data/dataset_action.py | 206 ++ lib/data/dataset_mesh.py | 97 + lib/data/dataset_motion_2d.py | 148 ++ lib/data/dataset_motion_3d.py | 68 + lib/data/dataset_wild.py | 185 ++ lib/model/DSTformer.py | 362 +++ lib/model/drop.py | 43 + lib/model/loss.py | 204 ++ lib/model/loss_mesh.py | 68 + lib/model/loss_supcon.py | 98 + lib/model/model_action.py | 71 + lib/model/model_mesh.py | 101 + lib/utils/learning.py | 102 + lib/utils/tools.py | 69 + lib/utils/utils_data.py | 112 + lib/utils/utils_mesh.py | 521 +++++ lib/utils/utils_smpl.py | 88 + lib/utils/vismo.py | 347 +++ mmpose/.mim/configs | 1 + mmpose/.mim/demo | 1 + mmpose/.mim/model-index.yml | 1 + mmpose/.mim/tools | 1 + mmpose/__init__.py | 29 + mmpose/apis/__init__.py | 20 + mmpose/apis/inference.py | 833 +++++++ mmpose/apis/inference_3d.py | 791 +++++++ mmpose/apis/inference_tracking.py | 347 +++ mmpose/apis/test.py | 191 ++ 
mmpose/apis/train.py | 200 ++ mmpose/core/__init__.py | 8 + mmpose/core/camera/__init__.py | 6 + mmpose/core/camera/camera_base.py | 45 + mmpose/core/camera/single_camera.py | 123 + mmpose/core/camera/single_camera_torch.py | 118 + mmpose/core/distributed_wrapper.py | 143 ++ mmpose/core/evaluation/__init__.py | 22 + mmpose/core/evaluation/bottom_up_eval.py | 333 +++ mmpose/core/evaluation/eval_hooks.py | 98 + mmpose/core/evaluation/mesh_eval.py | 66 + mmpose/core/evaluation/pose3d_eval.py | 171 ++ mmpose/core/evaluation/top_down_eval.py | 684 ++++++ mmpose/core/fp16/__init__.py | 9 + mmpose/core/fp16/decorators.py | 175 ++ mmpose/core/fp16/hooks.py | 167 ++ mmpose/core/fp16/utils.py | 34 + mmpose/core/optimizer/__init__.py | 4 + mmpose/core/optimizer/builder.py | 56 + mmpose/core/post_processing/__init__.py | 14 + mmpose/core/post_processing/group.py | 410 ++++ mmpose/core/post_processing/nms.py | 207 ++ .../core/post_processing/one_euro_filter.py | 102 + .../core/post_processing/post_transforms.py | 366 +++ mmpose/core/utils/__init__.py | 5 + mmpose/core/utils/dist_utils.py | 51 + mmpose/core/utils/regularizations.py | 86 + mmpose/core/visualization/__init__.py | 13 + mmpose/core/visualization/effects.py | 111 + mmpose/core/visualization/image.py | 442 ++++ mmpose/datasets/__init__.py | 42 + mmpose/datasets/builder.py | 162 ++ mmpose/datasets/dataset_info.py | 104 + mmpose/datasets/dataset_wrappers.py | 31 + mmpose/datasets/datasets/__init__.py | 45 + mmpose/datasets/datasets/animal/__init__.py | 15 + .../datasets/animal/animal_ap10k_dataset.py | 367 +++ .../datasets/animal/animal_atrw_dataset.py | 353 +++ .../datasets/animal/animal_base_dataset.py | 16 + .../datasets/animal/animal_fly_dataset.py | 215 ++ .../datasets/animal/animal_horse10_dataset.py | 220 ++ .../datasets/animal/animal_locust_dataset.py | 218 ++ .../datasets/animal/animal_macaque_dataset.py | 355 +++ .../datasets/animal/animal_pose_dataset.py | 359 +++ .../datasets/animal/animal_zebra_dataset.py | 193 ++ mmpose/datasets/datasets/base/__init__.py | 17 + .../kpt_2d_sview_rgb_img_bottom_up_dataset.py | 188 ++ .../kpt_2d_sview_rgb_img_top_down_dataset.py | 287 +++ .../kpt_2d_sview_rgb_vid_top_down_dataset.py | 200 ++ .../kpt_3d_mview_rgb_img_direct_dataset.py | 143 ++ .../base/kpt_3d_sview_kpt_2d_dataset.py | 226 ++ .../kpt_3d_sview_rgb_img_top_down_dataset.py | 256 +++ mmpose/datasets/datasets/body3d/__init__.py | 11 + .../datasets/body3d/body3d_base_dataset.py | 16 + .../datasets/body3d/body3d_h36m_dataset.py | 343 +++ .../body3d/body3d_mpi_inf_3dhp_dataset.py | 417 ++++ .../body3d_mview_direct_panoptic_dataset.py | 493 ++++ .../body3d/body3d_semi_supervision_dataset.py | 41 + .../datasets/datasets/bottom_up/__init__.py | 11 + .../datasets/bottom_up/bottom_up_aic.py | 105 + .../bottom_up/bottom_up_base_dataset.py | 14 + .../datasets/bottom_up/bottom_up_coco.py | 305 +++ .../bottom_up/bottom_up_coco_wholebody.py | 238 ++ .../datasets/bottom_up/bottom_up_crowdpose.py | 109 + .../datasets/bottom_up/bottom_up_mhp.py | 108 + mmpose/datasets/datasets/face/__init__.py | 11 + .../datasets/face/face_300w_dataset.py | 199 ++ .../datasets/face/face_aflw_dataset.py | 205 ++ .../datasets/face/face_base_dataset.py | 16 + .../face/face_coco_wholebody_dataset.py | 198 ++ .../datasets/face/face_cofw_dataset.py | 198 ++ .../datasets/face/face_wflw_dataset.py | 199 ++ mmpose/datasets/datasets/fashion/__init__.py | 4 + .../datasets/fashion/deepfashion_dataset.py | 225 ++ .../datasets/fashion/fashion_base_dataset.py | 16 + 
mmpose/datasets/datasets/hand/__init__.py | 14 + .../datasets/hand/freihand_dataset.py | 205 ++ .../datasets/hand/hand_base_dataset.py | 16 + .../hand/hand_coco_wholebody_dataset.py | 211 ++ .../datasets/hand/interhand2d_dataset.py | 306 +++ .../datasets/hand/interhand3d_dataset.py | 505 ++++ .../datasets/hand/onehand10k_dataset.py | 205 ++ .../datasets/hand/panoptic_hand2d_dataset.py | 208 ++ .../datasets/datasets/hand/rhd2d_dataset.py | 205 ++ mmpose/datasets/datasets/mesh/__init__.py | 10 + .../datasets/mesh/mesh_adv_dataset.py | 43 + .../datasets/mesh/mesh_base_dataset.py | 155 ++ .../datasets/mesh/mesh_h36m_dataset.py | 101 + .../datasets/mesh/mesh_mix_dataset.py | 73 + mmpose/datasets/datasets/mesh/mosh_dataset.py | 68 + mmpose/datasets/datasets/top_down/__init__.py | 30 + .../datasets/top_down/topdown_aic_dataset.py | 112 + .../datasets/top_down/topdown_base_dataset.py | 16 + .../datasets/top_down/topdown_coco_dataset.py | 405 ++++ .../topdown_coco_wholebody_dataset.py | 274 +++ .../top_down/topdown_crowdpose_dataset.py | 110 + .../datasets/top_down/topdown_h36m_dataset.py | 206 ++ .../top_down/topdown_halpe_dataset.py | 77 + .../top_down/topdown_jhmdb_dataset.py | 361 +++ .../datasets/top_down/topdown_mhp_dataset.py | 125 + .../datasets/top_down/topdown_mpii_dataset.py | 275 +++ .../top_down/topdown_mpii_trb_dataset.py | 310 +++ .../top_down/topdown_ochuman_dataset.py | 97 + .../top_down/topdown_posetrack18_dataset.py | 312 +++ .../topdown_posetrack18_video_dataset.py | 549 +++++ mmpose/datasets/pipelines/__init__.py | 8 + .../datasets/pipelines/bottom_up_transform.py | 816 +++++++ mmpose/datasets/pipelines/hand_transform.py | 63 + mmpose/datasets/pipelines/loading.py | 91 + mmpose/datasets/pipelines/mesh_transform.py | 399 ++++ mmpose/datasets/pipelines/pose3d_transform.py | 643 ++++++ mmpose/datasets/pipelines/shared_transform.py | 527 +++++ .../datasets/pipelines/top_down_transform.py | 736 ++++++ mmpose/datasets/registry.py | 13 + mmpose/datasets/samplers/__init__.py | 4 + .../datasets/samplers/distributed_sampler.py | 41 + mmpose/deprecated.py | 199 ++ mmpose/models/__init__.py | 16 + mmpose/models/backbones/__init__.py | 36 + mmpose/models/backbones/alexnet.py | 56 + mmpose/models/backbones/base_backbone.py | 43 + mmpose/models/backbones/cpm.py | 186 ++ mmpose/models/backbones/hourglass.py | 212 ++ mmpose/models/backbones/hourglass_ae.py | 212 ++ mmpose/models/backbones/hrformer.py | 746 ++++++ mmpose/models/backbones/hrnet.py | 604 +++++ mmpose/models/backbones/litehrnet.py | 984 ++++++++ mmpose/models/backbones/mobilenet_v2.py | 275 +++ mmpose/models/backbones/mobilenet_v3.py | 188 ++ mmpose/models/backbones/mspn.py | 513 +++++ mmpose/models/backbones/regnet.py | 317 +++ mmpose/models/backbones/resnest.py | 338 +++ mmpose/models/backbones/resnet.py | 701 ++++++ mmpose/models/backbones/resnext.py | 162 ++ mmpose/models/backbones/rsn.py | 616 +++++ mmpose/models/backbones/scnet.py | 248 ++ mmpose/models/backbones/seresnet.py | 125 + mmpose/models/backbones/seresnext.py | 168 ++ mmpose/models/backbones/shufflenet_v1.py | 329 +++ mmpose/models/backbones/shufflenet_v2.py | 302 +++ mmpose/models/backbones/tcn.py | 267 +++ mmpose/models/backbones/utils/__init__.py | 11 + .../models/backbones/utils/channel_shuffle.py | 29 + .../backbones/utils/inverted_residual.py | 128 ++ .../models/backbones/utils/make_divisible.py | 25 + mmpose/models/backbones/utils/se_layer.py | 54 + mmpose/models/backbones/utils/utils.py | 87 + mmpose/models/backbones/v2v_net.py | 257 +++ 
mmpose/models/backbones/vgg.py | 193 ++ mmpose/models/backbones/vipnas_mbv3.py | 179 ++ mmpose/models/backbones/vipnas_resnet.py | 589 +++++ mmpose/models/backbones/vit.py | 341 +++ mmpose/models/backbones/vit_moe.py | 385 ++++ mmpose/models/builder.py | 44 + mmpose/models/detectors/__init__.py | 17 + .../models/detectors/associative_embedding.py | 420 ++++ mmpose/models/detectors/base.py | 131 ++ mmpose/models/detectors/interhand_3d.py | 227 ++ mmpose/models/detectors/mesh.py | 438 ++++ mmpose/models/detectors/multi_task.py | 187 ++ mmpose/models/detectors/multiview_pose.py | 889 ++++++++ mmpose/models/detectors/pose_lifter.py | 392 ++++ mmpose/models/detectors/posewarper.py | 244 ++ mmpose/models/detectors/top_down.py | 307 +++ mmpose/models/detectors/top_down_moe.py | 351 +++ mmpose/models/heads/__init__.py | 24 + .../models/heads/ae_higher_resolution_head.py | 249 ++ mmpose/models/heads/ae_multi_stage_head.py | 222 ++ mmpose/models/heads/ae_simple_head.py | 99 + mmpose/models/heads/deconv_head.py | 295 +++ .../models/heads/deeppose_regression_head.py | 176 ++ mmpose/models/heads/hmr_head.py | 94 + mmpose/models/heads/interhand_3d_head.py | 521 +++++ .../models/heads/temporal_regression_head.py | 319 +++ .../models/heads/topdown_heatmap_base_head.py | 120 + .../heads/topdown_heatmap_multi_stage_head.py | 572 +++++ .../heads/topdown_heatmap_simple_head.py | 350 +++ .../heads/vipnas_heatmap_simple_head.py | 349 +++ mmpose/models/heads/voxelpose_head.py | 167 ++ mmpose/models/losses/__init__.py | 16 + mmpose/models/losses/classfication_loss.py | 41 + mmpose/models/losses/heatmap_loss.py | 86 + mmpose/models/losses/mesh_loss.py | 340 +++ mmpose/models/losses/mse_loss.py | 153 ++ mmpose/models/losses/multi_loss_factory.py | 281 +++ mmpose/models/losses/regression_loss.py | 448 ++++ mmpose/models/misc/__init__.py | 1 + mmpose/models/misc/discriminator.py | 307 +++ mmpose/models/necks/__init__.py | 5 + mmpose/models/necks/gap_neck.py | 37 + mmpose/models/necks/posewarper_neck.py | 329 +++ mmpose/models/registry.py | 13 + mmpose/models/utils/__init__.py | 4 + mmpose/models/utils/geometry.py | 68 + mmpose/models/utils/ops.py | 29 + mmpose/models/utils/smpl.py | 184 ++ mmpose/utils/__init__.py | 9 + mmpose/utils/collect_env.py | 16 + mmpose/utils/hooks.py | 60 + mmpose/utils/logger.py | 25 + mmpose/utils/setup_env.py | 47 + mmpose/utils/timer.py | 117 + mmpose/version.py | 19 + nets/nn.py | 278 +++ rgbd_3d.py | 765 +++++++ rgbd_detect.py | 1011 ++++++++ rgbd_detect_3d_dir.py | 2025 +++++++++++++++++ run.sh | 1 + sixdrep/util.py | 442 ++++ sixdrep/utils.py | 8 + utils.py | 501 ++++ visualizer.py | 279 +++ 297 files changed, 63275 insertions(+), 1 deletion(-) create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 configs/_base_/datasets/300w.py create mode 100644 configs/_base_/datasets/aflw.py create mode 100644 configs/_base_/datasets/aic.py create mode 100644 configs/_base_/datasets/aic_info.py create mode 100644 configs/_base_/datasets/animalpose.py create mode 100644 configs/_base_/datasets/ap10k.py create mode 100644 configs/_base_/datasets/ap10k_info.py create mode 100644 configs/_base_/datasets/atrw.py create mode 100644 configs/_base_/datasets/coco.py create mode 100644 configs/_base_/datasets/coco_wholebody.py create mode 100644 configs/_base_/datasets/coco_wholebody_face.py create mode 100644 configs/_base_/datasets/coco_wholebody_hand.py create mode 100644 configs/_base_/datasets/coco_wholebody_info.py create mode 100644 configs/_base_/datasets/cofw.py create 
mode 100644 configs/_base_/datasets/crowdpose.py create mode 100644 configs/_base_/datasets/deepfashion_full.py create mode 100644 configs/_base_/datasets/deepfashion_lower.py create mode 100644 configs/_base_/datasets/deepfashion_upper.py create mode 100644 configs/_base_/datasets/fly.py create mode 100644 configs/_base_/datasets/freihand2d.py create mode 100644 configs/_base_/datasets/h36m.py create mode 100644 configs/_base_/datasets/halpe.py create mode 100644 configs/_base_/datasets/horse10.py create mode 100644 configs/_base_/datasets/interhand2d.py create mode 100644 configs/_base_/datasets/interhand3d.py create mode 100644 configs/_base_/datasets/jhmdb.py create mode 100644 configs/_base_/datasets/locust.py create mode 100644 configs/_base_/datasets/macaque.py create mode 100644 configs/_base_/datasets/mhp.py create mode 100644 configs/_base_/datasets/mpi_inf_3dhp.py create mode 100644 configs/_base_/datasets/mpii.py create mode 100644 configs/_base_/datasets/mpii_info.py create mode 100644 configs/_base_/datasets/mpii_trb.py create mode 100644 configs/_base_/datasets/ochuman.py create mode 100644 configs/_base_/datasets/onehand10k.py create mode 100644 configs/_base_/datasets/panoptic_body3d.py create mode 100644 configs/_base_/datasets/panoptic_hand2d.py create mode 100644 configs/_base_/datasets/posetrack18.py create mode 100644 configs/_base_/datasets/rhd2d.py create mode 100644 configs/_base_/datasets/wflw.py create mode 100644 configs/_base_/datasets/zebra.py create mode 100644 configs/_base_/default_runtime.py create mode 100644 configs/_base_/filters/gausian_filter.py create mode 100644 configs/detection/yolo_classes.py create mode 100644 configs/detection/yolov3_d53_320_273e_coco.py create mode 100644 configs/pose/ViTPose_base_coco_256x192.py create mode 100644 configs/pose/ViTPose_base_simple_coco_256x192.py create mode 100644 configs/pose/ViTPose_small_coco_256x192.py create mode 100644 configs/pose3d/MB_ft_h36m.yaml create mode 100644 gafa_utils.py create mode 100644 launch.sh create mode 100644 lib/data/augmentation.py create mode 100644 lib/data/datareader_h36m.py create mode 100644 lib/data/datareader_mesh.py create mode 100644 lib/data/dataset_action.py create mode 100644 lib/data/dataset_mesh.py create mode 100644 lib/data/dataset_motion_2d.py create mode 100644 lib/data/dataset_motion_3d.py create mode 100644 lib/data/dataset_wild.py create mode 100644 lib/model/DSTformer.py create mode 100644 lib/model/drop.py create mode 100644 lib/model/loss.py create mode 100644 lib/model/loss_mesh.py create mode 100644 lib/model/loss_supcon.py create mode 100644 lib/model/model_action.py create mode 100644 lib/model/model_mesh.py create mode 100644 lib/utils/learning.py create mode 100644 lib/utils/tools.py create mode 100644 lib/utils/utils_data.py create mode 100644 lib/utils/utils_mesh.py create mode 100644 lib/utils/utils_smpl.py create mode 100644 lib/utils/vismo.py create mode 120000 mmpose/.mim/configs create mode 120000 mmpose/.mim/demo create mode 120000 mmpose/.mim/model-index.yml create mode 120000 mmpose/.mim/tools create mode 100644 mmpose/__init__.py create mode 100644 mmpose/apis/__init__.py create mode 100644 mmpose/apis/inference.py create mode 100644 mmpose/apis/inference_3d.py create mode 100644 mmpose/apis/inference_tracking.py create mode 100644 mmpose/apis/test.py create mode 100644 mmpose/apis/train.py create mode 100644 mmpose/core/__init__.py create mode 100644 mmpose/core/camera/__init__.py create mode 100644 mmpose/core/camera/camera_base.py create 
mode 100644 mmpose/core/camera/single_camera.py create mode 100644 mmpose/core/camera/single_camera_torch.py create mode 100644 mmpose/core/distributed_wrapper.py create mode 100644 mmpose/core/evaluation/__init__.py create mode 100644 mmpose/core/evaluation/bottom_up_eval.py create mode 100644 mmpose/core/evaluation/eval_hooks.py create mode 100644 mmpose/core/evaluation/mesh_eval.py create mode 100644 mmpose/core/evaluation/pose3d_eval.py create mode 100644 mmpose/core/evaluation/top_down_eval.py create mode 100644 mmpose/core/fp16/__init__.py create mode 100644 mmpose/core/fp16/decorators.py create mode 100644 mmpose/core/fp16/hooks.py create mode 100644 mmpose/core/fp16/utils.py create mode 100644 mmpose/core/optimizer/__init__.py create mode 100644 mmpose/core/optimizer/builder.py create mode 100644 mmpose/core/post_processing/__init__.py create mode 100644 mmpose/core/post_processing/group.py create mode 100644 mmpose/core/post_processing/nms.py create mode 100644 mmpose/core/post_processing/one_euro_filter.py create mode 100644 mmpose/core/post_processing/post_transforms.py create mode 100644 mmpose/core/utils/__init__.py create mode 100644 mmpose/core/utils/dist_utils.py create mode 100644 mmpose/core/utils/regularizations.py create mode 100644 mmpose/core/visualization/__init__.py create mode 100644 mmpose/core/visualization/effects.py create mode 100644 mmpose/core/visualization/image.py create mode 100644 mmpose/datasets/__init__.py create mode 100644 mmpose/datasets/builder.py create mode 100644 mmpose/datasets/dataset_info.py create mode 100644 mmpose/datasets/dataset_wrappers.py create mode 100644 mmpose/datasets/datasets/__init__.py create mode 100644 mmpose/datasets/datasets/animal/__init__.py create mode 100644 mmpose/datasets/datasets/animal/animal_ap10k_dataset.py create mode 100644 mmpose/datasets/datasets/animal/animal_atrw_dataset.py create mode 100644 mmpose/datasets/datasets/animal/animal_base_dataset.py create mode 100644 mmpose/datasets/datasets/animal/animal_fly_dataset.py create mode 100644 mmpose/datasets/datasets/animal/animal_horse10_dataset.py create mode 100644 mmpose/datasets/datasets/animal/animal_locust_dataset.py create mode 100644 mmpose/datasets/datasets/animal/animal_macaque_dataset.py create mode 100644 mmpose/datasets/datasets/animal/animal_pose_dataset.py create mode 100644 mmpose/datasets/datasets/animal/animal_zebra_dataset.py create mode 100644 mmpose/datasets/datasets/base/__init__.py create mode 100644 mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_bottom_up_dataset.py create mode 100644 mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py create mode 100644 mmpose/datasets/datasets/base/kpt_2d_sview_rgb_vid_top_down_dataset.py create mode 100644 mmpose/datasets/datasets/base/kpt_3d_mview_rgb_img_direct_dataset.py create mode 100644 mmpose/datasets/datasets/base/kpt_3d_sview_kpt_2d_dataset.py create mode 100644 mmpose/datasets/datasets/base/kpt_3d_sview_rgb_img_top_down_dataset.py create mode 100644 mmpose/datasets/datasets/body3d/__init__.py create mode 100644 mmpose/datasets/datasets/body3d/body3d_base_dataset.py create mode 100644 mmpose/datasets/datasets/body3d/body3d_h36m_dataset.py create mode 100644 mmpose/datasets/datasets/body3d/body3d_mpi_inf_3dhp_dataset.py create mode 100644 mmpose/datasets/datasets/body3d/body3d_mview_direct_panoptic_dataset.py create mode 100644 mmpose/datasets/datasets/body3d/body3d_semi_supervision_dataset.py create mode 100644 mmpose/datasets/datasets/bottom_up/__init__.py create mode 
100644 mmpose/datasets/datasets/bottom_up/bottom_up_aic.py create mode 100644 mmpose/datasets/datasets/bottom_up/bottom_up_base_dataset.py create mode 100644 mmpose/datasets/datasets/bottom_up/bottom_up_coco.py create mode 100644 mmpose/datasets/datasets/bottom_up/bottom_up_coco_wholebody.py create mode 100644 mmpose/datasets/datasets/bottom_up/bottom_up_crowdpose.py create mode 100644 mmpose/datasets/datasets/bottom_up/bottom_up_mhp.py create mode 100644 mmpose/datasets/datasets/face/__init__.py create mode 100644 mmpose/datasets/datasets/face/face_300w_dataset.py create mode 100644 mmpose/datasets/datasets/face/face_aflw_dataset.py create mode 100644 mmpose/datasets/datasets/face/face_base_dataset.py create mode 100644 mmpose/datasets/datasets/face/face_coco_wholebody_dataset.py create mode 100644 mmpose/datasets/datasets/face/face_cofw_dataset.py create mode 100644 mmpose/datasets/datasets/face/face_wflw_dataset.py create mode 100644 mmpose/datasets/datasets/fashion/__init__.py create mode 100644 mmpose/datasets/datasets/fashion/deepfashion_dataset.py create mode 100644 mmpose/datasets/datasets/fashion/fashion_base_dataset.py create mode 100644 mmpose/datasets/datasets/hand/__init__.py create mode 100644 mmpose/datasets/datasets/hand/freihand_dataset.py create mode 100644 mmpose/datasets/datasets/hand/hand_base_dataset.py create mode 100644 mmpose/datasets/datasets/hand/hand_coco_wholebody_dataset.py create mode 100644 mmpose/datasets/datasets/hand/interhand2d_dataset.py create mode 100644 mmpose/datasets/datasets/hand/interhand3d_dataset.py create mode 100644 mmpose/datasets/datasets/hand/onehand10k_dataset.py create mode 100644 mmpose/datasets/datasets/hand/panoptic_hand2d_dataset.py create mode 100644 mmpose/datasets/datasets/hand/rhd2d_dataset.py create mode 100644 mmpose/datasets/datasets/mesh/__init__.py create mode 100644 mmpose/datasets/datasets/mesh/mesh_adv_dataset.py create mode 100644 mmpose/datasets/datasets/mesh/mesh_base_dataset.py create mode 100644 mmpose/datasets/datasets/mesh/mesh_h36m_dataset.py create mode 100644 mmpose/datasets/datasets/mesh/mesh_mix_dataset.py create mode 100644 mmpose/datasets/datasets/mesh/mosh_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/__init__.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_aic_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_base_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_coco_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_coco_wholebody_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_crowdpose_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_h36m_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_halpe_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_jhmdb_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_mhp_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_mpii_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_mpii_trb_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_ochuman_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_posetrack18_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_posetrack18_video_dataset.py create mode 100644 mmpose/datasets/pipelines/__init__.py create mode 100644 mmpose/datasets/pipelines/bottom_up_transform.py create mode 100644 
mmpose/datasets/pipelines/hand_transform.py create mode 100644 mmpose/datasets/pipelines/loading.py create mode 100644 mmpose/datasets/pipelines/mesh_transform.py create mode 100644 mmpose/datasets/pipelines/pose3d_transform.py create mode 100644 mmpose/datasets/pipelines/shared_transform.py create mode 100644 mmpose/datasets/pipelines/top_down_transform.py create mode 100644 mmpose/datasets/registry.py create mode 100644 mmpose/datasets/samplers/__init__.py create mode 100644 mmpose/datasets/samplers/distributed_sampler.py create mode 100644 mmpose/deprecated.py create mode 100644 mmpose/models/__init__.py create mode 100644 mmpose/models/backbones/__init__.py create mode 100644 mmpose/models/backbones/alexnet.py create mode 100644 mmpose/models/backbones/base_backbone.py create mode 100644 mmpose/models/backbones/cpm.py create mode 100644 mmpose/models/backbones/hourglass.py create mode 100644 mmpose/models/backbones/hourglass_ae.py create mode 100644 mmpose/models/backbones/hrformer.py create mode 100644 mmpose/models/backbones/hrnet.py create mode 100644 mmpose/models/backbones/litehrnet.py create mode 100644 mmpose/models/backbones/mobilenet_v2.py create mode 100644 mmpose/models/backbones/mobilenet_v3.py create mode 100644 mmpose/models/backbones/mspn.py create mode 100644 mmpose/models/backbones/regnet.py create mode 100644 mmpose/models/backbones/resnest.py create mode 100644 mmpose/models/backbones/resnet.py create mode 100644 mmpose/models/backbones/resnext.py create mode 100644 mmpose/models/backbones/rsn.py create mode 100644 mmpose/models/backbones/scnet.py create mode 100644 mmpose/models/backbones/seresnet.py create mode 100644 mmpose/models/backbones/seresnext.py create mode 100644 mmpose/models/backbones/shufflenet_v1.py create mode 100644 mmpose/models/backbones/shufflenet_v2.py create mode 100644 mmpose/models/backbones/tcn.py create mode 100644 mmpose/models/backbones/utils/__init__.py create mode 100644 mmpose/models/backbones/utils/channel_shuffle.py create mode 100644 mmpose/models/backbones/utils/inverted_residual.py create mode 100644 mmpose/models/backbones/utils/make_divisible.py create mode 100644 mmpose/models/backbones/utils/se_layer.py create mode 100644 mmpose/models/backbones/utils/utils.py create mode 100644 mmpose/models/backbones/v2v_net.py create mode 100644 mmpose/models/backbones/vgg.py create mode 100644 mmpose/models/backbones/vipnas_mbv3.py create mode 100644 mmpose/models/backbones/vipnas_resnet.py create mode 100644 mmpose/models/backbones/vit.py create mode 100644 mmpose/models/backbones/vit_moe.py create mode 100644 mmpose/models/builder.py create mode 100644 mmpose/models/detectors/__init__.py create mode 100644 mmpose/models/detectors/associative_embedding.py create mode 100644 mmpose/models/detectors/base.py create mode 100644 mmpose/models/detectors/interhand_3d.py create mode 100644 mmpose/models/detectors/mesh.py create mode 100644 mmpose/models/detectors/multi_task.py create mode 100644 mmpose/models/detectors/multiview_pose.py create mode 100644 mmpose/models/detectors/pose_lifter.py create mode 100644 mmpose/models/detectors/posewarper.py create mode 100644 mmpose/models/detectors/top_down.py create mode 100644 mmpose/models/detectors/top_down_moe.py create mode 100644 mmpose/models/heads/__init__.py create mode 100644 mmpose/models/heads/ae_higher_resolution_head.py create mode 100644 mmpose/models/heads/ae_multi_stage_head.py create mode 100644 mmpose/models/heads/ae_simple_head.py create mode 100644 
mmpose/models/heads/deconv_head.py create mode 100644 mmpose/models/heads/deeppose_regression_head.py create mode 100644 mmpose/models/heads/hmr_head.py create mode 100644 mmpose/models/heads/interhand_3d_head.py create mode 100644 mmpose/models/heads/temporal_regression_head.py create mode 100644 mmpose/models/heads/topdown_heatmap_base_head.py create mode 100644 mmpose/models/heads/topdown_heatmap_multi_stage_head.py create mode 100644 mmpose/models/heads/topdown_heatmap_simple_head.py create mode 100644 mmpose/models/heads/vipnas_heatmap_simple_head.py create mode 100644 mmpose/models/heads/voxelpose_head.py create mode 100644 mmpose/models/losses/__init__.py create mode 100644 mmpose/models/losses/classfication_loss.py create mode 100644 mmpose/models/losses/heatmap_loss.py create mode 100644 mmpose/models/losses/mesh_loss.py create mode 100644 mmpose/models/losses/mse_loss.py create mode 100644 mmpose/models/losses/multi_loss_factory.py create mode 100644 mmpose/models/losses/regression_loss.py create mode 100644 mmpose/models/misc/__init__.py create mode 100644 mmpose/models/misc/discriminator.py create mode 100644 mmpose/models/necks/__init__.py create mode 100644 mmpose/models/necks/gap_neck.py create mode 100644 mmpose/models/necks/posewarper_neck.py create mode 100644 mmpose/models/registry.py create mode 100644 mmpose/models/utils/__init__.py create mode 100644 mmpose/models/utils/geometry.py create mode 100644 mmpose/models/utils/ops.py create mode 100644 mmpose/models/utils/smpl.py create mode 100644 mmpose/utils/__init__.py create mode 100644 mmpose/utils/collect_env.py create mode 100644 mmpose/utils/hooks.py create mode 100644 mmpose/utils/logger.py create mode 100644 mmpose/utils/setup_env.py create mode 100644 mmpose/utils/timer.py create mode 100644 mmpose/version.py create mode 100644 nets/nn.py create mode 100755 rgbd_3d.py create mode 100644 rgbd_detect.py create mode 100644 rgbd_detect_3d_dir.py create mode 100644 run.sh create mode 100644 sixdrep/util.py create mode 100644 sixdrep/utils.py create mode 100644 utils.py create mode 100755 visualizer.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0caa42d --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +*.pyc +*.pth +*.pt +*.zip +*.deb +*.bin +*.onnx \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..2c4778a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,58 @@ +ARG PYTORCH="1.8.0" +ARG CUDA="11.1" +ARG CUDNN="8" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel +COPY ./cuda-keyring_1.0-1_all.deb cuda-keyring_1.0-1_all.deb +RUN rm /etc/apt/sources.list.d/cuda.list && rm /etc/apt/sources.list.d/nvidia-ml.list && dpkg -i cuda-keyring_1.0-1_all.deb +RUN apt-get update + +RUN apt-get install -y software-properties-common +RUN apt-get update +RUN add-apt-repository ppa:ubuntu-toolchain-r/test +RUN apt install -y gcc-9 +RUN apt-get install libstdc++6 +RUN apt-get update + + +ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX" +ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" +ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" + +# Refer a advise in the issues page +#RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +#RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + + +RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx\ + && apt-get clean 
\ + && rm -rf /var/lib/apt/lists/* + +# Install xtcocotools +RUN pip install cython +RUN pip install xtcocotools + +# Install MMCV +RUN pip install mmcv-full==1.3.17 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html + + +#System full upgrade +RUN apt-get update && apt-get --with-new-pkgs upgrade -y + + +# Install Python dependencies for rgbd_detect +RUN pip install mmdet==2.28.2 +RUN git clone https://github.com/ViTAE-Transformer/ViTPose/ +WORKDIR /workspace/ViTPose +RUN pip install -v -e . +RUN pip install timm==0.4.9 einops +RUN pip install print-color +RUN pip install --extra-index-url https://rospypi.github.io/simple/ rospy +RUN pip install -U --extra-index-url https://rospypi.github.io/simple/_pre sensor_msgs tf2_ros tf2_sensor_msgs tf tf2-py +RUN pip install -U --extra-index-url https://rospypi.github.io/simple/_pre cv_bridge +RUN pip install albumentations==1.1.0 +RUN pip install onnxruntime +RUN pip uninstall opencv-python -y +RUN pip install opencv-python==4.10.0.84 + +WORKDIR /workspace/rgbd_pose_and_depth diff --git a/README.md b/README.md index 09c38c3..316dc86 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,19 @@ The pipeline consists of the following main components : - Gaze estimation ## Installation -How to install with docker +How to install with docker and launch interactively + +- Build docker + +```bash +docker build -t inria_docker:rgbd_detect . +``` + +- Launch docker + +```bash +sh launch.sh +``` ## ROS Interface Input / output topics and format @@ -21,3 +33,14 @@ Flags to run with options ## Acknowledgments MMDet, MMPose, 6DRepNet... + + +## TODO +- Clean the code of unnecessary options (keep only sixdrep) +- Remove unnecessary dependencies +- Keep only rgbd_detect_3d_dir.py +- Add models in external links, LFS or directly from their source in the corresponding repos +- Add a description of all options +- Add ROS interface description +- Add illustration +- Add instructions and `requirements.txt` for local installation \ No newline at end of file diff --git a/configs/_base_/datasets/300w.py b/configs/_base_/datasets/300w.py new file mode 100644 index 0000000..10c343a --- /dev/null +++ b/configs/_base_/datasets/300w.py @@ -0,0 +1,384 @@ +dataset_info = dict( + dataset_name='300w', + paper_info=dict( + author='Sagonas, Christos and Antonakos, Epameinondas ' + 'and Tzimiropoulos, Georgios and Zafeiriou, Stefanos ' + 'and Pantic, Maja', + title='300 faces in-the-wild challenge: ' + 'Database and results', + container='Image and vision computing', + year='2016', + homepage='https://ibug.doc.ic.ac.uk/resources/300-W/', + ), + keypoint_info={ + 0: + dict( + name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-16'), + 1: + dict( + name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-15'), + 2: + dict( + name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-14'), + 3: + dict( + name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-13'), + 4: + dict( + name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-12'), + 5: + dict( + name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-11'), + 6: + dict( + name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-10'), + 7: + dict(name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-9'), + 8: + dict(name='kpt-8', id=8, color=[255, 255, 255], type='', swap=''), + 9: + dict(name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-7'), + 10: + dict( + name='kpt-10', id=10, color=[255, 255, 255], type='', + swap='kpt-6'), + 11: + dict( + name='kpt-11', id=11,
color=[255, 255, 255], type='', + swap='kpt-5'), + 12: + dict( + name='kpt-12', id=12, color=[255, 255, 255], type='', + swap='kpt-4'), + 13: + dict( + name='kpt-13', id=13, color=[255, 255, 255], type='', + swap='kpt-3'), + 14: + dict( + name='kpt-14', id=14, color=[255, 255, 255], type='', + swap='kpt-2'), + 15: + dict( + name='kpt-15', id=15, color=[255, 255, 255], type='', + swap='kpt-1'), + 16: + dict( + name='kpt-16', id=16, color=[255, 255, 255], type='', + swap='kpt-0'), + 17: + dict( + name='kpt-17', + id=17, + color=[255, 255, 255], + type='', + swap='kpt-26'), + 18: + dict( + name='kpt-18', + id=18, + color=[255, 255, 255], + type='', + swap='kpt-25'), + 19: + dict( + name='kpt-19', + id=19, + color=[255, 255, 255], + type='', + swap='kpt-24'), + 20: + dict( + name='kpt-20', + id=20, + color=[255, 255, 255], + type='', + swap='kpt-23'), + 21: + dict( + name='kpt-21', + id=21, + color=[255, 255, 255], + type='', + swap='kpt-22'), + 22: + dict( + name='kpt-22', + id=22, + color=[255, 255, 255], + type='', + swap='kpt-21'), + 23: + dict( + name='kpt-23', + id=23, + color=[255, 255, 255], + type='', + swap='kpt-20'), + 24: + dict( + name='kpt-24', + id=24, + color=[255, 255, 255], + type='', + swap='kpt-19'), + 25: + dict( + name='kpt-25', + id=25, + color=[255, 255, 255], + type='', + swap='kpt-18'), + 26: + dict( + name='kpt-26', + id=26, + color=[255, 255, 255], + type='', + swap='kpt-17'), + 27: + dict(name='kpt-27', id=27, color=[255, 255, 255], type='', swap=''), + 28: + dict(name='kpt-28', id=28, color=[255, 255, 255], type='', swap=''), + 29: + dict(name='kpt-29', id=29, color=[255, 255, 255], type='', swap=''), + 30: + dict(name='kpt-30', id=30, color=[255, 255, 255], type='', swap=''), + 31: + dict( + name='kpt-31', + id=31, + color=[255, 255, 255], + type='', + swap='kpt-35'), + 32: + dict( + name='kpt-32', + id=32, + color=[255, 255, 255], + type='', + swap='kpt-34'), + 33: + dict(name='kpt-33', id=33, color=[255, 255, 255], type='', swap=''), + 34: + dict( + name='kpt-34', + id=34, + color=[255, 255, 255], + type='', + swap='kpt-32'), + 35: + dict( + name='kpt-35', + id=35, + color=[255, 255, 255], + type='', + swap='kpt-31'), + 36: + dict( + name='kpt-36', + id=36, + color=[255, 255, 255], + type='', + swap='kpt-45'), + 37: + dict( + name='kpt-37', + id=37, + color=[255, 255, 255], + type='', + swap='kpt-44'), + 38: + dict( + name='kpt-38', + id=38, + color=[255, 255, 255], + type='', + swap='kpt-43'), + 39: + dict( + name='kpt-39', + id=39, + color=[255, 255, 255], + type='', + swap='kpt-42'), + 40: + dict( + name='kpt-40', + id=40, + color=[255, 255, 255], + type='', + swap='kpt-47'), + 41: + dict( + name='kpt-41', + id=41, + color=[255, 255, 255], + type='', + swap='kpt-46'), + 42: + dict( + name='kpt-42', + id=42, + color=[255, 255, 255], + type='', + swap='kpt-39'), + 43: + dict( + name='kpt-43', + id=43, + color=[255, 255, 255], + type='', + swap='kpt-38'), + 44: + dict( + name='kpt-44', + id=44, + color=[255, 255, 255], + type='', + swap='kpt-37'), + 45: + dict( + name='kpt-45', + id=45, + color=[255, 255, 255], + type='', + swap='kpt-36'), + 46: + dict( + name='kpt-46', + id=46, + color=[255, 255, 255], + type='', + swap='kpt-41'), + 47: + dict( + name='kpt-47', + id=47, + color=[255, 255, 255], + type='', + swap='kpt-40'), + 48: + dict( + name='kpt-48', + id=48, + color=[255, 255, 255], + type='', + swap='kpt-54'), + 49: + dict( + name='kpt-49', + id=49, + color=[255, 255, 255], + type='', + swap='kpt-53'), + 50: + dict( + name='kpt-50', + id=50, + color=[255, 
255, 255], + type='', + swap='kpt-52'), + 51: + dict(name='kpt-51', id=51, color=[255, 255, 255], type='', swap=''), + 52: + dict( + name='kpt-52', + id=52, + color=[255, 255, 255], + type='', + swap='kpt-50'), + 53: + dict( + name='kpt-53', + id=53, + color=[255, 255, 255], + type='', + swap='kpt-49'), + 54: + dict( + name='kpt-54', + id=54, + color=[255, 255, 255], + type='', + swap='kpt-48'), + 55: + dict( + name='kpt-55', + id=55, + color=[255, 255, 255], + type='', + swap='kpt-59'), + 56: + dict( + name='kpt-56', + id=56, + color=[255, 255, 255], + type='', + swap='kpt-58'), + 57: + dict(name='kpt-57', id=57, color=[255, 255, 255], type='', swap=''), + 58: + dict( + name='kpt-58', + id=58, + color=[255, 255, 255], + type='', + swap='kpt-56'), + 59: + dict( + name='kpt-59', + id=59, + color=[255, 255, 255], + type='', + swap='kpt-55'), + 60: + dict( + name='kpt-60', + id=60, + color=[255, 255, 255], + type='', + swap='kpt-64'), + 61: + dict( + name='kpt-61', + id=61, + color=[255, 255, 255], + type='', + swap='kpt-63'), + 62: + dict(name='kpt-62', id=62, color=[255, 255, 255], type='', swap=''), + 63: + dict( + name='kpt-63', + id=63, + color=[255, 255, 255], + type='', + swap='kpt-61'), + 64: + dict( + name='kpt-64', + id=64, + color=[255, 255, 255], + type='', + swap='kpt-60'), + 65: + dict( + name='kpt-65', + id=65, + color=[255, 255, 255], + type='', + swap='kpt-67'), + 66: + dict(name='kpt-66', id=66, color=[255, 255, 255], type='', swap=''), + 67: + dict( + name='kpt-67', + id=67, + color=[255, 255, 255], + type='', + swap='kpt-65'), + }, + skeleton_info={}, + joint_weights=[1.] * 68, + sigmas=[]) diff --git a/configs/_base_/datasets/aflw.py b/configs/_base_/datasets/aflw.py new file mode 100644 index 0000000..bf534cb --- /dev/null +++ b/configs/_base_/datasets/aflw.py @@ -0,0 +1,83 @@ +dataset_info = dict( + dataset_name='aflw', + paper_info=dict( + author='Koestinger, Martin and Wohlhart, Paul and ' + 'Roth, Peter M and Bischof, Horst', + title='Annotated facial landmarks in the wild: ' + 'A large-scale, real-world database for facial ' + 'landmark localization', + container='2011 IEEE international conference on computer ' + 'vision workshops (ICCV workshops)', + year='2011', + homepage='https://www.tugraz.at/institute/icg/research/' + 'team-bischof/lrs/downloads/aflw/', + ), + keypoint_info={ + 0: + dict(name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-5'), + 1: + dict(name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-4'), + 2: + dict(name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-3'), + 3: + dict(name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-2'), + 4: + dict(name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-1'), + 5: + dict(name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-0'), + 6: + dict( + name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-11'), + 7: + dict( + name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-10'), + 8: + dict(name='kpt-8', id=8, color=[255, 255, 255], type='', swap='kpt-9'), + 9: + dict(name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-8'), + 10: + dict( + name='kpt-10', id=10, color=[255, 255, 255], type='', + swap='kpt-7'), + 11: + dict( + name='kpt-11', id=11, color=[255, 255, 255], type='', + swap='kpt-6'), + 12: + dict( + name='kpt-12', + id=12, + color=[255, 255, 255], + type='', + swap='kpt-14'), + 13: + dict(name='kpt-13', id=13, color=[255, 255, 255], type='', swap=''), + 14: + dict( + name='kpt-14', + id=14, + color=[255, 255, 
255], + type='', + swap='kpt-12'), + 15: + dict( + name='kpt-15', + id=15, + color=[255, 255, 255], + type='', + swap='kpt-17'), + 16: + dict(name='kpt-16', id=16, color=[255, 255, 255], type='', swap=''), + 17: + dict( + name='kpt-17', + id=17, + color=[255, 255, 255], + type='', + swap='kpt-15'), + 18: + dict(name='kpt-18', id=18, color=[255, 255, 255], type='', swap='') + }, + skeleton_info={}, + joint_weights=[1.] * 19, + sigmas=[]) diff --git a/configs/_base_/datasets/aic.py b/configs/_base_/datasets/aic.py new file mode 100644 index 0000000..9ecdbe3 --- /dev/null +++ b/configs/_base_/datasets/aic.py @@ -0,0 +1,140 @@ +dataset_info = dict( + dataset_name='aic', + paper_info=dict( + author='Wu, Jiahong and Zheng, He and Zhao, Bo and ' + 'Li, Yixin and Yan, Baoming and Liang, Rui and ' + 'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and ' + 'Fu, Yanwei and others', + title='Ai challenger: A large-scale dataset for going ' + 'deeper in image understanding', + container='arXiv', + year='2017', + homepage='https://github.com/AIChallenger/AI_Challenger_2017', + ), + keypoint_info={ + 0: + dict( + name='right_shoulder', + id=0, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 1: + dict( + name='right_elbow', + id=1, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 2: + dict( + name='right_wrist', + id=2, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 3: + dict( + name='left_shoulder', + id=3, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 4: + dict( + name='left_elbow', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 5: + dict( + name='left_wrist', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 6: + dict( + name='right_hip', + id=6, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 7: + dict( + name='right_knee', + id=7, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 8: + dict( + name='right_ankle', + id=8, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 9: + dict( + name='left_hip', + id=9, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 10: + dict( + name='left_knee', + id=10, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 11: + dict( + name='left_ankle', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 12: + dict( + name='head_top', + id=12, + color=[51, 153, 255], + type='upper', + swap=''), + 13: + dict(name='neck', id=13, color=[51, 153, 255], type='upper', swap='') + }, + skeleton_info={ + 0: + dict(link=('right_wrist', 'right_elbow'), id=0, color=[255, 128, 0]), + 1: dict( + link=('right_elbow', 'right_shoulder'), id=1, color=[255, 128, 0]), + 2: dict(link=('right_shoulder', 'neck'), id=2, color=[51, 153, 255]), + 3: dict(link=('neck', 'left_shoulder'), id=3, color=[51, 153, 255]), + 4: dict(link=('left_shoulder', 'left_elbow'), id=4, color=[0, 255, 0]), + 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]), + 6: dict(link=('right_ankle', 'right_knee'), id=6, color=[255, 128, 0]), + 7: dict(link=('right_knee', 'right_hip'), id=7, color=[255, 128, 0]), + 8: dict(link=('right_hip', 'left_hip'), id=8, color=[51, 153, 255]), + 9: dict(link=('left_hip', 'left_knee'), id=9, color=[0, 255, 0]), + 10: dict(link=('left_knee', 'left_ankle'), id=10, color=[0, 255, 0]), + 11: dict(link=('head_top', 'neck'), id=11, color=[51, 153, 255]), + 12: dict( + link=('right_shoulder', 'right_hip'), id=12, color=[51, 153, 255]), + 13: + dict(link=('left_shoulder', 'left_hip'), id=13, color=[51, 
153, 255]) + }, + joint_weights=[ + 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1. + ], + + # 'https://github.com/AIChallenger/AI_Challenger_2017/blob/master/' + # 'Evaluation/keypoint_eval/keypoint_eval.py#L50' + # delta = 2 x sigma + sigmas=[ + 0.01388152, 0.01515228, 0.01057665, 0.01417709, 0.01497891, 0.01402144, + 0.03909642, 0.03686941, 0.01981803, 0.03843971, 0.03412318, 0.02415081, + 0.01291456, 0.01236173 + ]) diff --git a/configs/_base_/datasets/aic_info.py b/configs/_base_/datasets/aic_info.py new file mode 100644 index 0000000..f143fd8 --- /dev/null +++ b/configs/_base_/datasets/aic_info.py @@ -0,0 +1,140 @@ +aic_info = dict( + dataset_name='aic', + paper_info=dict( + author='Wu, Jiahong and Zheng, He and Zhao, Bo and ' + 'Li, Yixin and Yan, Baoming and Liang, Rui and ' + 'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and ' + 'Fu, Yanwei and others', + title='Ai challenger: A large-scale dataset for going ' + 'deeper in image understanding', + container='arXiv', + year='2017', + homepage='https://github.com/AIChallenger/AI_Challenger_2017', + ), + keypoint_info={ + 0: + dict( + name='right_shoulder', + id=0, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 1: + dict( + name='right_elbow', + id=1, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 2: + dict( + name='right_wrist', + id=2, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 3: + dict( + name='left_shoulder', + id=3, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 4: + dict( + name='left_elbow', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 5: + dict( + name='left_wrist', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 6: + dict( + name='right_hip', + id=6, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 7: + dict( + name='right_knee', + id=7, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 8: + dict( + name='right_ankle', + id=8, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 9: + dict( + name='left_hip', + id=9, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 10: + dict( + name='left_knee', + id=10, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 11: + dict( + name='left_ankle', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 12: + dict( + name='head_top', + id=12, + color=[51, 153, 255], + type='upper', + swap=''), + 13: + dict(name='neck', id=13, color=[51, 153, 255], type='upper', swap='') + }, + skeleton_info={ + 0: + dict(link=('right_wrist', 'right_elbow'), id=0, color=[255, 128, 0]), + 1: dict( + link=('right_elbow', 'right_shoulder'), id=1, color=[255, 128, 0]), + 2: dict(link=('right_shoulder', 'neck'), id=2, color=[51, 153, 255]), + 3: dict(link=('neck', 'left_shoulder'), id=3, color=[51, 153, 255]), + 4: dict(link=('left_shoulder', 'left_elbow'), id=4, color=[0, 255, 0]), + 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]), + 6: dict(link=('right_ankle', 'right_knee'), id=6, color=[255, 128, 0]), + 7: dict(link=('right_knee', 'right_hip'), id=7, color=[255, 128, 0]), + 8: dict(link=('right_hip', 'left_hip'), id=8, color=[51, 153, 255]), + 9: dict(link=('left_hip', 'left_knee'), id=9, color=[0, 255, 0]), + 10: dict(link=('left_knee', 'left_ankle'), id=10, color=[0, 255, 0]), + 11: dict(link=('head_top', 'neck'), id=11, color=[51, 153, 255]), + 12: dict( + link=('right_shoulder', 'right_hip'), id=12, color=[51, 153, 255]), + 13: + dict(link=('left_shoulder', 'left_hip'), id=13, 
color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1. + ], + + # 'https://github.com/AIChallenger/AI_Challenger_2017/blob/master/' + # 'Evaluation/keypoint_eval/keypoint_eval.py#L50' + # delta = 2 x sigma + sigmas=[ + 0.01388152, 0.01515228, 0.01057665, 0.01417709, 0.01497891, 0.01402144, + 0.03909642, 0.03686941, 0.01981803, 0.03843971, 0.03412318, 0.02415081, + 0.01291456, 0.01236173 + ]) diff --git a/configs/_base_/datasets/animalpose.py b/configs/_base_/datasets/animalpose.py new file mode 100644 index 0000000..d5bb62d --- /dev/null +++ b/configs/_base_/datasets/animalpose.py @@ -0,0 +1,166 @@ +dataset_info = dict( + dataset_name='animalpose', + paper_info=dict( + author='Cao, Jinkun and Tang, Hongyang and Fang, Hao-Shu and ' + 'Shen, Xiaoyong and Lu, Cewu and Tai, Yu-Wing', + title='Cross-Domain Adaptation for Animal Pose Estimation', + container='The IEEE International Conference on ' + 'Computer Vision (ICCV)', + year='2019', + homepage='https://sites.google.com/view/animal-pose/', + ), + keypoint_info={ + 0: + dict( + name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'), + 1: + dict( + name='R_Eye', + id=1, + color=[255, 128, 0], + type='upper', + swap='L_Eye'), + 2: + dict( + name='L_EarBase', + id=2, + color=[0, 255, 0], + type='upper', + swap='R_EarBase'), + 3: + dict( + name='R_EarBase', + id=3, + color=[255, 128, 0], + type='upper', + swap='L_EarBase'), + 4: + dict(name='Nose', id=4, color=[51, 153, 255], type='upper', swap=''), + 5: + dict(name='Throat', id=5, color=[51, 153, 255], type='upper', swap=''), + 6: + dict( + name='TailBase', id=6, color=[51, 153, 255], type='lower', + swap=''), + 7: + dict( + name='Withers', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict( + name='L_F_Elbow', + id=8, + color=[0, 255, 0], + type='upper', + swap='R_F_Elbow'), + 9: + dict( + name='R_F_Elbow', + id=9, + color=[255, 128, 0], + type='upper', + swap='L_F_Elbow'), + 10: + dict( + name='L_B_Elbow', + id=10, + color=[0, 255, 0], + type='lower', + swap='R_B_Elbow'), + 11: + dict( + name='R_B_Elbow', + id=11, + color=[255, 128, 0], + type='lower', + swap='L_B_Elbow'), + 12: + dict( + name='L_F_Knee', + id=12, + color=[0, 255, 0], + type='upper', + swap='R_F_Knee'), + 13: + dict( + name='R_F_Knee', + id=13, + color=[255, 128, 0], + type='upper', + swap='L_F_Knee'), + 14: + dict( + name='L_B_Knee', + id=14, + color=[0, 255, 0], + type='lower', + swap='R_B_Knee'), + 15: + dict( + name='R_B_Knee', + id=15, + color=[255, 128, 0], + type='lower', + swap='L_B_Knee'), + 16: + dict( + name='L_F_Paw', + id=16, + color=[0, 255, 0], + type='upper', + swap='R_F_Paw'), + 17: + dict( + name='R_F_Paw', + id=17, + color=[255, 128, 0], + type='upper', + swap='L_F_Paw'), + 18: + dict( + name='L_B_Paw', + id=18, + color=[0, 255, 0], + type='lower', + swap='R_B_Paw'), + 19: + dict( + name='R_B_Paw', + id=19, + color=[255, 128, 0], + type='lower', + swap='L_B_Paw') + }, + skeleton_info={ + 0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[51, 153, 255]), + 1: dict(link=('L_Eye', 'L_EarBase'), id=1, color=[0, 255, 0]), + 2: dict(link=('R_Eye', 'R_EarBase'), id=2, color=[255, 128, 0]), + 3: dict(link=('L_Eye', 'Nose'), id=3, color=[0, 255, 0]), + 4: dict(link=('R_Eye', 'Nose'), id=4, color=[255, 128, 0]), + 5: dict(link=('Nose', 'Throat'), id=5, color=[51, 153, 255]), + 6: dict(link=('Throat', 'Withers'), id=6, color=[51, 153, 255]), + 7: dict(link=('TailBase', 'Withers'), id=7, color=[51, 153, 255]), + 8: dict(link=('Throat', 
'L_F_Elbow'), id=8, color=[0, 255, 0]), + 9: dict(link=('L_F_Elbow', 'L_F_Knee'), id=9, color=[0, 255, 0]), + 10: dict(link=('L_F_Knee', 'L_F_Paw'), id=10, color=[0, 255, 0]), + 11: dict(link=('Throat', 'R_F_Elbow'), id=11, color=[255, 128, 0]), + 12: dict(link=('R_F_Elbow', 'R_F_Knee'), id=12, color=[255, 128, 0]), + 13: dict(link=('R_F_Knee', 'R_F_Paw'), id=13, color=[255, 128, 0]), + 14: dict(link=('TailBase', 'L_B_Elbow'), id=14, color=[0, 255, 0]), + 15: dict(link=('L_B_Elbow', 'L_B_Knee'), id=15, color=[0, 255, 0]), + 16: dict(link=('L_B_Knee', 'L_B_Paw'), id=16, color=[0, 255, 0]), + 17: dict(link=('TailBase', 'R_B_Elbow'), id=17, color=[255, 128, 0]), + 18: dict(link=('R_B_Elbow', 'R_B_Knee'), id=18, color=[255, 128, 0]), + 19: dict(link=('R_B_Knee', 'R_B_Paw'), id=19, color=[255, 128, 0]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.2, 1.2, + 1.5, 1.5, 1.5, 1.5 + ], + + # Note: The original paper did not provide enough information about + # the sigmas. We modified from 'https://github.com/cocodataset/' + # 'cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py#L523' + sigmas=[ + 0.025, 0.025, 0.026, 0.035, 0.035, 0.10, 0.10, 0.10, 0.107, 0.107, + 0.107, 0.107, 0.087, 0.087, 0.087, 0.087, 0.089, 0.089, 0.089, 0.089 + ]) diff --git a/configs/_base_/datasets/ap10k.py b/configs/_base_/datasets/ap10k.py new file mode 100644 index 0000000..c0df579 --- /dev/null +++ b/configs/_base_/datasets/ap10k.py @@ -0,0 +1,142 @@ +dataset_info = dict( + dataset_name='ap10k', + paper_info=dict( + author='Yu, Hang and Xu, Yufei and Zhang, Jing and ' + 'Zhao, Wei and Guan, Ziyu and Tao, Dacheng', + title='AP-10K: A Benchmark for Animal Pose Estimation in the Wild', + container='35th Conference on Neural Information Processing Systems ' + '(NeurIPS 2021) Track on Datasets and Bench-marks.', + year='2021', + homepage='https://github.com/AlexTheBad/AP-10K', + ), + keypoint_info={ + 0: + dict( + name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'), + 1: + dict( + name='R_Eye', + id=1, + color=[255, 128, 0], + type='upper', + swap='L_Eye'), + 2: + dict(name='Nose', id=2, color=[51, 153, 255], type='upper', swap=''), + 3: + dict(name='Neck', id=3, color=[51, 153, 255], type='upper', swap=''), + 4: + dict( + name='Root of tail', + id=4, + color=[51, 153, 255], + type='lower', + swap=''), + 5: + dict( + name='L_Shoulder', + id=5, + color=[51, 153, 255], + type='upper', + swap='R_Shoulder'), + 6: + dict( + name='L_Elbow', + id=6, + color=[51, 153, 255], + type='upper', + swap='R_Elbow'), + 7: + dict( + name='L_F_Paw', + id=7, + color=[0, 255, 0], + type='upper', + swap='R_F_Paw'), + 8: + dict( + name='R_Shoulder', + id=8, + color=[0, 255, 0], + type='upper', + swap='L_Shoulder'), + 9: + dict( + name='R_Elbow', + id=9, + color=[255, 128, 0], + type='upper', + swap='L_Elbow'), + 10: + dict( + name='R_F_Paw', + id=10, + color=[0, 255, 0], + type='lower', + swap='L_F_Paw'), + 11: + dict( + name='L_Hip', + id=11, + color=[255, 128, 0], + type='lower', + swap='R_Hip'), + 12: + dict( + name='L_Knee', + id=12, + color=[255, 128, 0], + type='lower', + swap='R_Knee'), + 13: + dict( + name='L_B_Paw', + id=13, + color=[0, 255, 0], + type='lower', + swap='R_B_Paw'), + 14: + dict( + name='R_Hip', id=14, color=[0, 255, 0], type='lower', + swap='L_Hip'), + 15: + dict( + name='R_Knee', + id=15, + color=[0, 255, 0], + type='lower', + swap='L_Knee'), + 16: + dict( + name='R_B_Paw', + id=16, + color=[0, 255, 0], + type='lower', + swap='L_B_Paw'), + }, + skeleton_info={ + 0: 
dict(link=('L_Eye', 'R_Eye'), id=0, color=[0, 0, 255]), + 1: dict(link=('L_Eye', 'Nose'), id=1, color=[0, 0, 255]), + 2: dict(link=('R_Eye', 'Nose'), id=2, color=[0, 0, 255]), + 3: dict(link=('Nose', 'Neck'), id=3, color=[0, 255, 0]), + 4: dict(link=('Neck', 'Root of tail'), id=4, color=[0, 255, 0]), + 5: dict(link=('Neck', 'L_Shoulder'), id=5, color=[0, 255, 255]), + 6: dict(link=('L_Shoulder', 'L_Elbow'), id=6, color=[0, 255, 255]), + 7: dict(link=('L_Elbow', 'L_F_Paw'), id=6, color=[0, 255, 255]), + 8: dict(link=('Neck', 'R_Shoulder'), id=7, color=[6, 156, 250]), + 9: dict(link=('R_Shoulder', 'R_Elbow'), id=8, color=[6, 156, 250]), + 10: dict(link=('R_Elbow', 'R_F_Paw'), id=9, color=[6, 156, 250]), + 11: dict(link=('Root of tail', 'L_Hip'), id=10, color=[0, 255, 255]), + 12: dict(link=('L_Hip', 'L_Knee'), id=11, color=[0, 255, 255]), + 13: dict(link=('L_Knee', 'L_B_Paw'), id=12, color=[0, 255, 255]), + 14: dict(link=('Root of tail', 'R_Hip'), id=13, color=[6, 156, 250]), + 15: dict(link=('R_Hip', 'R_Knee'), id=14, color=[6, 156, 250]), + 16: dict(link=('R_Knee', 'R_B_Paw'), id=15, color=[6, 156, 250]), + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.025, 0.025, 0.026, 0.035, 0.035, 0.079, 0.072, 0.062, 0.079, 0.072, + 0.062, 0.107, 0.087, 0.089, 0.107, 0.087, 0.089 + ]) diff --git a/configs/_base_/datasets/ap10k_info.py b/configs/_base_/datasets/ap10k_info.py new file mode 100644 index 0000000..af2461c --- /dev/null +++ b/configs/_base_/datasets/ap10k_info.py @@ -0,0 +1,142 @@ +ap10k_info = dict( + dataset_name='ap10k', + paper_info=dict( + author='Yu, Hang and Xu, Yufei and Zhang, Jing and ' + 'Zhao, Wei and Guan, Ziyu and Tao, Dacheng', + title='AP-10K: A Benchmark for Animal Pose Estimation in the Wild', + container='35th Conference on Neural Information Processing Systems ' + '(NeurIPS 2021) Track on Datasets and Bench-marks.', + year='2021', + homepage='https://github.com/AlexTheBad/AP-10K', + ), + keypoint_info={ + 0: + dict( + name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'), + 1: + dict( + name='R_Eye', + id=1, + color=[255, 128, 0], + type='upper', + swap='L_Eye'), + 2: + dict(name='Nose', id=2, color=[51, 153, 255], type='upper', swap=''), + 3: + dict(name='Neck', id=3, color=[51, 153, 255], type='upper', swap=''), + 4: + dict( + name='Root of tail', + id=4, + color=[51, 153, 255], + type='lower', + swap=''), + 5: + dict( + name='L_Shoulder', + id=5, + color=[51, 153, 255], + type='upper', + swap='R_Shoulder'), + 6: + dict( + name='L_Elbow', + id=6, + color=[51, 153, 255], + type='upper', + swap='R_Elbow'), + 7: + dict( + name='L_F_Paw', + id=7, + color=[0, 255, 0], + type='upper', + swap='R_F_Paw'), + 8: + dict( + name='R_Shoulder', + id=8, + color=[0, 255, 0], + type='upper', + swap='L_Shoulder'), + 9: + dict( + name='R_Elbow', + id=9, + color=[255, 128, 0], + type='upper', + swap='L_Elbow'), + 10: + dict( + name='R_F_Paw', + id=10, + color=[0, 255, 0], + type='lower', + swap='L_F_Paw'), + 11: + dict( + name='L_Hip', + id=11, + color=[255, 128, 0], + type='lower', + swap='R_Hip'), + 12: + dict( + name='L_Knee', + id=12, + color=[255, 128, 0], + type='lower', + swap='R_Knee'), + 13: + dict( + name='L_B_Paw', + id=13, + color=[0, 255, 0], + type='lower', + swap='R_B_Paw'), + 14: + dict( + name='R_Hip', id=14, color=[0, 255, 0], type='lower', + swap='L_Hip'), + 15: + dict( + name='R_Knee', + id=15, + color=[0, 255, 0], + type='lower', + swap='L_Knee'), + 16: + dict( + name='R_B_Paw', + 
id=16, + color=[0, 255, 0], + type='lower', + swap='L_B_Paw'), + }, + skeleton_info={ + 0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[0, 0, 255]), + 1: dict(link=('L_Eye', 'Nose'), id=1, color=[0, 0, 255]), + 2: dict(link=('R_Eye', 'Nose'), id=2, color=[0, 0, 255]), + 3: dict(link=('Nose', 'Neck'), id=3, color=[0, 255, 0]), + 4: dict(link=('Neck', 'Root of tail'), id=4, color=[0, 255, 0]), + 5: dict(link=('Neck', 'L_Shoulder'), id=5, color=[0, 255, 255]), + 6: dict(link=('L_Shoulder', 'L_Elbow'), id=6, color=[0, 255, 255]), + 7: dict(link=('L_Elbow', 'L_F_Paw'), id=6, color=[0, 255, 255]), + 8: dict(link=('Neck', 'R_Shoulder'), id=7, color=[6, 156, 250]), + 9: dict(link=('R_Shoulder', 'R_Elbow'), id=8, color=[6, 156, 250]), + 10: dict(link=('R_Elbow', 'R_F_Paw'), id=9, color=[6, 156, 250]), + 11: dict(link=('Root of tail', 'L_Hip'), id=10, color=[0, 255, 255]), + 12: dict(link=('L_Hip', 'L_Knee'), id=11, color=[0, 255, 255]), + 13: dict(link=('L_Knee', 'L_B_Paw'), id=12, color=[0, 255, 255]), + 14: dict(link=('Root of tail', 'R_Hip'), id=13, color=[6, 156, 250]), + 15: dict(link=('R_Hip', 'R_Knee'), id=14, color=[6, 156, 250]), + 16: dict(link=('R_Knee', 'R_B_Paw'), id=15, color=[6, 156, 250]), + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.025, 0.025, 0.026, 0.035, 0.035, 0.079, 0.072, 0.062, 0.079, 0.072, + 0.062, 0.107, 0.087, 0.089, 0.107, 0.087, 0.089 + ]) diff --git a/configs/_base_/datasets/atrw.py b/configs/_base_/datasets/atrw.py new file mode 100644 index 0000000..7ec71c8 --- /dev/null +++ b/configs/_base_/datasets/atrw.py @@ -0,0 +1,144 @@ +dataset_info = dict( + dataset_name='atrw', + paper_info=dict( + author='Li, Shuyuan and Li, Jianguo and Tang, Hanlin ' + 'and Qian, Rui and Lin, Weiyao', + title='ATRW: A Benchmark for Amur Tiger ' + 'Re-identification in the Wild', + container='Proceedings of the 28th ACM ' + 'International Conference on Multimedia', + year='2020', + homepage='https://cvwc2019.github.io/challenge.html', + ), + keypoint_info={ + 0: + dict( + name='left_ear', + id=0, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 1: + dict( + name='right_ear', + id=1, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 2: + dict(name='nose', id=2, color=[51, 153, 255], type='upper', swap=''), + 3: + dict( + name='right_shoulder', + id=3, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 4: + dict( + name='right_front_paw', + id=4, + color=[255, 128, 0], + type='upper', + swap='left_front_paw'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='left_front_paw', + id=6, + color=[0, 255, 0], + type='upper', + swap='right_front_paw'), + 7: + dict( + name='right_hip', + id=7, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 8: + dict( + name='right_knee', + id=8, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 9: + dict( + name='right_back_paw', + id=9, + color=[255, 128, 0], + type='lower', + swap='left_back_paw'), + 10: + dict( + name='left_hip', + id=10, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 11: + dict( + name='left_knee', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 12: + dict( + name='left_back_paw', + id=12, + color=[0, 255, 0], + type='lower', + swap='right_back_paw'), + 13: + dict(name='tail', id=13, color=[51, 153, 255], type='lower', swap=''), + 14: + dict( + name='center', id=14, color=[51, 153, 255], 
type='lower', swap=''), + }, + skeleton_info={ + 0: + dict(link=('left_ear', 'nose'), id=0, color=[51, 153, 255]), + 1: + dict(link=('right_ear', 'nose'), id=1, color=[51, 153, 255]), + 2: + dict(link=('nose', 'center'), id=2, color=[51, 153, 255]), + 3: + dict( + link=('left_shoulder', 'left_front_paw'), id=3, color=[0, 255, 0]), + 4: + dict(link=('left_shoulder', 'center'), id=4, color=[0, 255, 0]), + 5: + dict( + link=('right_shoulder', 'right_front_paw'), + id=5, + color=[255, 128, 0]), + 6: + dict(link=('right_shoulder', 'center'), id=6, color=[255, 128, 0]), + 7: + dict(link=('tail', 'center'), id=7, color=[51, 153, 255]), + 8: + dict(link=('right_back_paw', 'right_knee'), id=8, color=[255, 128, 0]), + 9: + dict(link=('right_knee', 'right_hip'), id=9, color=[255, 128, 0]), + 10: + dict(link=('right_hip', 'tail'), id=10, color=[255, 128, 0]), + 11: + dict(link=('left_back_paw', 'left_knee'), id=11, color=[0, 255, 0]), + 12: + dict(link=('left_knee', 'left_hip'), id=12, color=[0, 255, 0]), + 13: + dict(link=('left_hip', 'tail'), id=13, color=[0, 255, 0]), + }, + joint_weights=[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], + sigmas=[ + 0.0277, 0.0823, 0.0831, 0.0202, 0.0716, 0.0263, 0.0646, 0.0302, 0.0440, + 0.0316, 0.0333, 0.0547, 0.0263, 0.0683, 0.0539 + ]) diff --git a/configs/_base_/datasets/coco.py b/configs/_base_/datasets/coco.py new file mode 100644 index 0000000..865a95b --- /dev/null +++ b/configs/_base_/datasets/coco.py @@ -0,0 +1,181 @@ +dataset_info = dict( + dataset_name='coco', + paper_info=dict( + author='Lin, Tsung-Yi and Maire, Michael and ' + 'Belongie, Serge and Hays, James and ' + 'Perona, Pietro and Ramanan, Deva and ' + r'Doll{\'a}r, Piotr and Zitnick, C Lawrence', + title='Microsoft coco: Common objects in context', + container='European conference on computer vision', + year='2014', + homepage='http://cocodataset.org/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + 
dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/configs/_base_/datasets/coco_wholebody.py b/configs/_base_/datasets/coco_wholebody.py new file mode 100644 index 0000000..ef9b707 --- /dev/null +++ b/configs/_base_/datasets/coco_wholebody.py @@ -0,0 +1,1154 @@ +dataset_info = dict( + dataset_name='coco_wholebody', + paper_info=dict( + author='Jin, Sheng and Xu, Lumin and Xu, Jin and ' + 'Wang, Can and Liu, Wentao and ' + 'Qian, Chen and Ouyang, Wanli and Luo, Ping', + title='Whole-Body Human Pose Estimation in the Wild', + container='Proceedings of the European ' + 'Conference on Computer Vision (ECCV)', + year='2020', + homepage='https://github.com/jin-s13/COCO-WholeBody/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 
128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict( + name='left_big_toe', + id=17, + color=[255, 128, 0], + type='lower', + swap='right_big_toe'), + 18: + dict( + name='left_small_toe', + id=18, + color=[255, 128, 0], + type='lower', + swap='right_small_toe'), + 19: + dict( + name='left_heel', + id=19, + color=[255, 128, 0], + type='lower', + swap='right_heel'), + 20: + dict( + name='right_big_toe', + id=20, + color=[255, 128, 0], + type='lower', + swap='left_big_toe'), + 21: + dict( + name='right_small_toe', + id=21, + color=[255, 128, 0], + type='lower', + swap='left_small_toe'), + 22: + dict( + name='right_heel', + id=22, + color=[255, 128, 0], + type='lower', + swap='left_heel'), + 23: + dict( + name='face-0', + id=23, + color=[255, 255, 255], + type='', + swap='face-16'), + 24: + dict( + name='face-1', + id=24, + color=[255, 255, 255], + type='', + swap='face-15'), + 25: + dict( + name='face-2', + id=25, + color=[255, 255, 255], + type='', + swap='face-14'), + 26: + dict( + name='face-3', + id=26, + color=[255, 255, 255], + type='', + swap='face-13'), + 27: + dict( + name='face-4', + id=27, + color=[255, 255, 255], + type='', + swap='face-12'), + 28: + dict( + name='face-5', + id=28, + color=[255, 255, 255], + type='', + swap='face-11'), + 29: + dict( + name='face-6', + id=29, + color=[255, 255, 255], + type='', + swap='face-10'), + 30: + dict( + name='face-7', + id=30, + color=[255, 255, 255], + type='', + swap='face-9'), + 31: + dict(name='face-8', id=31, color=[255, 255, 255], type='', swap=''), + 32: + dict( + name='face-9', + id=32, + color=[255, 255, 255], + type='', + swap='face-7'), + 33: + dict( + name='face-10', + id=33, + color=[255, 255, 255], + type='', + swap='face-6'), + 34: + dict( + name='face-11', + id=34, + color=[255, 255, 255], + type='', + swap='face-5'), + 35: + dict( + name='face-12', + id=35, + color=[255, 255, 255], + type='', + swap='face-4'), + 36: + dict( + name='face-13', + id=36, + color=[255, 255, 255], + type='', + swap='face-3'), + 37: + dict( + name='face-14', + id=37, + color=[255, 255, 255], + type='', + swap='face-2'), + 38: + dict( + name='face-15', + id=38, + color=[255, 255, 255], + type='', + swap='face-1'), + 39: + dict( + name='face-16', + id=39, + color=[255, 255, 255], + type='', + swap='face-0'), + 40: + dict( + name='face-17', + id=40, + color=[255, 255, 255], + type='', + swap='face-26'), + 41: + dict( + name='face-18', + id=41, + color=[255, 255, 255], + type='', + swap='face-25'), + 42: + dict( + name='face-19', + id=42, + color=[255, 255, 255], + type='', + swap='face-24'), + 43: + dict( + name='face-20', + id=43, + color=[255, 255, 255], + type='', + swap='face-23'), + 44: + dict( + name='face-21', + id=44, + color=[255, 255, 255], + type='', + swap='face-22'), + 45: + dict( + name='face-22', + id=45, + color=[255, 255, 255], + type='', + swap='face-21'), + 46: + dict( + name='face-23', + id=46, + color=[255, 255, 
255], + type='', + swap='face-20'), + 47: + dict( + name='face-24', + id=47, + color=[255, 255, 255], + type='', + swap='face-19'), + 48: + dict( + name='face-25', + id=48, + color=[255, 255, 255], + type='', + swap='face-18'), + 49: + dict( + name='face-26', + id=49, + color=[255, 255, 255], + type='', + swap='face-17'), + 50: + dict(name='face-27', id=50, color=[255, 255, 255], type='', swap=''), + 51: + dict(name='face-28', id=51, color=[255, 255, 255], type='', swap=''), + 52: + dict(name='face-29', id=52, color=[255, 255, 255], type='', swap=''), + 53: + dict(name='face-30', id=53, color=[255, 255, 255], type='', swap=''), + 54: + dict( + name='face-31', + id=54, + color=[255, 255, 255], + type='', + swap='face-35'), + 55: + dict( + name='face-32', + id=55, + color=[255, 255, 255], + type='', + swap='face-34'), + 56: + dict(name='face-33', id=56, color=[255, 255, 255], type='', swap=''), + 57: + dict( + name='face-34', + id=57, + color=[255, 255, 255], + type='', + swap='face-32'), + 58: + dict( + name='face-35', + id=58, + color=[255, 255, 255], + type='', + swap='face-31'), + 59: + dict( + name='face-36', + id=59, + color=[255, 255, 255], + type='', + swap='face-45'), + 60: + dict( + name='face-37', + id=60, + color=[255, 255, 255], + type='', + swap='face-44'), + 61: + dict( + name='face-38', + id=61, + color=[255, 255, 255], + type='', + swap='face-43'), + 62: + dict( + name='face-39', + id=62, + color=[255, 255, 255], + type='', + swap='face-42'), + 63: + dict( + name='face-40', + id=63, + color=[255, 255, 255], + type='', + swap='face-47'), + 64: + dict( + name='face-41', + id=64, + color=[255, 255, 255], + type='', + swap='face-46'), + 65: + dict( + name='face-42', + id=65, + color=[255, 255, 255], + type='', + swap='face-39'), + 66: + dict( + name='face-43', + id=66, + color=[255, 255, 255], + type='', + swap='face-38'), + 67: + dict( + name='face-44', + id=67, + color=[255, 255, 255], + type='', + swap='face-37'), + 68: + dict( + name='face-45', + id=68, + color=[255, 255, 255], + type='', + swap='face-36'), + 69: + dict( + name='face-46', + id=69, + color=[255, 255, 255], + type='', + swap='face-41'), + 70: + dict( + name='face-47', + id=70, + color=[255, 255, 255], + type='', + swap='face-40'), + 71: + dict( + name='face-48', + id=71, + color=[255, 255, 255], + type='', + swap='face-54'), + 72: + dict( + name='face-49', + id=72, + color=[255, 255, 255], + type='', + swap='face-53'), + 73: + dict( + name='face-50', + id=73, + color=[255, 255, 255], + type='', + swap='face-52'), + 74: + dict(name='face-51', id=74, color=[255, 255, 255], type='', swap=''), + 75: + dict( + name='face-52', + id=75, + color=[255, 255, 255], + type='', + swap='face-50'), + 76: + dict( + name='face-53', + id=76, + color=[255, 255, 255], + type='', + swap='face-49'), + 77: + dict( + name='face-54', + id=77, + color=[255, 255, 255], + type='', + swap='face-48'), + 78: + dict( + name='face-55', + id=78, + color=[255, 255, 255], + type='', + swap='face-59'), + 79: + dict( + name='face-56', + id=79, + color=[255, 255, 255], + type='', + swap='face-58'), + 80: + dict(name='face-57', id=80, color=[255, 255, 255], type='', swap=''), + 81: + dict( + name='face-58', + id=81, + color=[255, 255, 255], + type='', + swap='face-56'), + 82: + dict( + name='face-59', + id=82, + color=[255, 255, 255], + type='', + swap='face-55'), + 83: + dict( + name='face-60', + id=83, + color=[255, 255, 255], + type='', + swap='face-64'), + 84: + dict( + name='face-61', + id=84, + color=[255, 255, 255], + type='', + 
swap='face-63'), + 85: + dict(name='face-62', id=85, color=[255, 255, 255], type='', swap=''), + 86: + dict( + name='face-63', + id=86, + color=[255, 255, 255], + type='', + swap='face-61'), + 87: + dict( + name='face-64', + id=87, + color=[255, 255, 255], + type='', + swap='face-60'), + 88: + dict( + name='face-65', + id=88, + color=[255, 255, 255], + type='', + swap='face-67'), + 89: + dict(name='face-66', id=89, color=[255, 255, 255], type='', swap=''), + 90: + dict( + name='face-67', + id=90, + color=[255, 255, 255], + type='', + swap='face-65'), + 91: + dict( + name='left_hand_root', + id=91, + color=[255, 255, 255], + type='', + swap='right_hand_root'), + 92: + dict( + name='left_thumb1', + id=92, + color=[255, 128, 0], + type='', + swap='right_thumb1'), + 93: + dict( + name='left_thumb2', + id=93, + color=[255, 128, 0], + type='', + swap='right_thumb2'), + 94: + dict( + name='left_thumb3', + id=94, + color=[255, 128, 0], + type='', + swap='right_thumb3'), + 95: + dict( + name='left_thumb4', + id=95, + color=[255, 128, 0], + type='', + swap='right_thumb4'), + 96: + dict( + name='left_forefinger1', + id=96, + color=[255, 153, 255], + type='', + swap='right_forefinger1'), + 97: + dict( + name='left_forefinger2', + id=97, + color=[255, 153, 255], + type='', + swap='right_forefinger2'), + 98: + dict( + name='left_forefinger3', + id=98, + color=[255, 153, 255], + type='', + swap='right_forefinger3'), + 99: + dict( + name='left_forefinger4', + id=99, + color=[255, 153, 255], + type='', + swap='right_forefinger4'), + 100: + dict( + name='left_middle_finger1', + id=100, + color=[102, 178, 255], + type='', + swap='right_middle_finger1'), + 101: + dict( + name='left_middle_finger2', + id=101, + color=[102, 178, 255], + type='', + swap='right_middle_finger2'), + 102: + dict( + name='left_middle_finger3', + id=102, + color=[102, 178, 255], + type='', + swap='right_middle_finger3'), + 103: + dict( + name='left_middle_finger4', + id=103, + color=[102, 178, 255], + type='', + swap='right_middle_finger4'), + 104: + dict( + name='left_ring_finger1', + id=104, + color=[255, 51, 51], + type='', + swap='right_ring_finger1'), + 105: + dict( + name='left_ring_finger2', + id=105, + color=[255, 51, 51], + type='', + swap='right_ring_finger2'), + 106: + dict( + name='left_ring_finger3', + id=106, + color=[255, 51, 51], + type='', + swap='right_ring_finger3'), + 107: + dict( + name='left_ring_finger4', + id=107, + color=[255, 51, 51], + type='', + swap='right_ring_finger4'), + 108: + dict( + name='left_pinky_finger1', + id=108, + color=[0, 255, 0], + type='', + swap='right_pinky_finger1'), + 109: + dict( + name='left_pinky_finger2', + id=109, + color=[0, 255, 0], + type='', + swap='right_pinky_finger2'), + 110: + dict( + name='left_pinky_finger3', + id=110, + color=[0, 255, 0], + type='', + swap='right_pinky_finger3'), + 111: + dict( + name='left_pinky_finger4', + id=111, + color=[0, 255, 0], + type='', + swap='right_pinky_finger4'), + 112: + dict( + name='right_hand_root', + id=112, + color=[255, 255, 255], + type='', + swap='left_hand_root'), + 113: + dict( + name='right_thumb1', + id=113, + color=[255, 128, 0], + type='', + swap='left_thumb1'), + 114: + dict( + name='right_thumb2', + id=114, + color=[255, 128, 0], + type='', + swap='left_thumb2'), + 115: + dict( + name='right_thumb3', + id=115, + color=[255, 128, 0], + type='', + swap='left_thumb3'), + 116: + dict( + name='right_thumb4', + id=116, + color=[255, 128, 0], + type='', + swap='left_thumb4'), + 117: + dict( + name='right_forefinger1', + id=117, 
+ color=[255, 153, 255], + type='', + swap='left_forefinger1'), + 118: + dict( + name='right_forefinger2', + id=118, + color=[255, 153, 255], + type='', + swap='left_forefinger2'), + 119: + dict( + name='right_forefinger3', + id=119, + color=[255, 153, 255], + type='', + swap='left_forefinger3'), + 120: + dict( + name='right_forefinger4', + id=120, + color=[255, 153, 255], + type='', + swap='left_forefinger4'), + 121: + dict( + name='right_middle_finger1', + id=121, + color=[102, 178, 255], + type='', + swap='left_middle_finger1'), + 122: + dict( + name='right_middle_finger2', + id=122, + color=[102, 178, 255], + type='', + swap='left_middle_finger2'), + 123: + dict( + name='right_middle_finger3', + id=123, + color=[102, 178, 255], + type='', + swap='left_middle_finger3'), + 124: + dict( + name='right_middle_finger4', + id=124, + color=[102, 178, 255], + type='', + swap='left_middle_finger4'), + 125: + dict( + name='right_ring_finger1', + id=125, + color=[255, 51, 51], + type='', + swap='left_ring_finger1'), + 126: + dict( + name='right_ring_finger2', + id=126, + color=[255, 51, 51], + type='', + swap='left_ring_finger2'), + 127: + dict( + name='right_ring_finger3', + id=127, + color=[255, 51, 51], + type='', + swap='left_ring_finger3'), + 128: + dict( + name='right_ring_finger4', + id=128, + color=[255, 51, 51], + type='', + swap='left_ring_finger4'), + 129: + dict( + name='right_pinky_finger1', + id=129, + color=[0, 255, 0], + type='', + swap='left_pinky_finger1'), + 130: + dict( + name='right_pinky_finger2', + id=130, + color=[0, 255, 0], + type='', + swap='left_pinky_finger2'), + 131: + dict( + name='right_pinky_finger3', + id=131, + color=[0, 255, 0], + type='', + swap='left_pinky_finger3'), + 132: + dict( + name='right_pinky_finger4', + id=132, + color=[0, 255, 0], + type='', + swap='left_pinky_finger4') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]), + 19: + dict(link=('left_ankle', 'left_big_toe'), id=19, color=[0, 255, 0]), + 20: + dict(link=('left_ankle', 'left_small_toe'), id=20, color=[0, 255, 0]), + 21: + dict(link=('left_ankle', 'left_heel'), id=21, color=[0, 255, 0]), + 22: 
+ dict( + link=('right_ankle', 'right_big_toe'), id=22, color=[255, 128, 0]), + 23: + dict( + link=('right_ankle', 'right_small_toe'), + id=23, + color=[255, 128, 0]), + 24: + dict(link=('right_ankle', 'right_heel'), id=24, color=[255, 128, 0]), + 25: + dict( + link=('left_hand_root', 'left_thumb1'), id=25, color=[255, 128, + 0]), + 26: + dict(link=('left_thumb1', 'left_thumb2'), id=26, color=[255, 128, 0]), + 27: + dict(link=('left_thumb2', 'left_thumb3'), id=27, color=[255, 128, 0]), + 28: + dict(link=('left_thumb3', 'left_thumb4'), id=28, color=[255, 128, 0]), + 29: + dict( + link=('left_hand_root', 'left_forefinger1'), + id=29, + color=[255, 153, 255]), + 30: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=30, + color=[255, 153, 255]), + 31: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=31, + color=[255, 153, 255]), + 32: + dict( + link=('left_forefinger3', 'left_forefinger4'), + id=32, + color=[255, 153, 255]), + 33: + dict( + link=('left_hand_root', 'left_middle_finger1'), + id=33, + color=[102, 178, 255]), + 34: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=34, + color=[102, 178, 255]), + 35: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=35, + color=[102, 178, 255]), + 36: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=36, + color=[102, 178, 255]), + 37: + dict( + link=('left_hand_root', 'left_ring_finger1'), + id=37, + color=[255, 51, 51]), + 38: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=38, + color=[255, 51, 51]), + 39: + dict( + link=('left_ring_finger2', 'left_ring_finger3'), + id=39, + color=[255, 51, 51]), + 40: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=40, + color=[255, 51, 51]), + 41: + dict( + link=('left_hand_root', 'left_pinky_finger1'), + id=41, + color=[0, 255, 0]), + 42: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=42, + color=[0, 255, 0]), + 43: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=43, + color=[0, 255, 0]), + 44: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=44, + color=[0, 255, 0]), + 45: + dict( + link=('right_hand_root', 'right_thumb1'), + id=45, + color=[255, 128, 0]), + 46: + dict( + link=('right_thumb1', 'right_thumb2'), id=46, color=[255, 128, 0]), + 47: + dict( + link=('right_thumb2', 'right_thumb3'), id=47, color=[255, 128, 0]), + 48: + dict( + link=('right_thumb3', 'right_thumb4'), id=48, color=[255, 128, 0]), + 49: + dict( + link=('right_hand_root', 'right_forefinger1'), + id=49, + color=[255, 153, 255]), + 50: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=50, + color=[255, 153, 255]), + 51: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=51, + color=[255, 153, 255]), + 52: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=52, + color=[255, 153, 255]), + 53: + dict( + link=('right_hand_root', 'right_middle_finger1'), + id=53, + color=[102, 178, 255]), + 54: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=54, + color=[102, 178, 255]), + 55: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=55, + color=[102, 178, 255]), + 56: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=56, + color=[102, 178, 255]), + 57: + dict( + link=('right_hand_root', 'right_ring_finger1'), + id=57, + color=[255, 51, 51]), + 58: + dict( + link=('right_ring_finger1', 'right_ring_finger2'), + id=58, + color=[255, 51, 51]), + 59: + dict( + link=('right_ring_finger2', 
'right_ring_finger3'), + id=59, + color=[255, 51, 51]), + 60: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=60, + color=[255, 51, 51]), + 61: + dict( + link=('right_hand_root', 'right_pinky_finger1'), + id=61, + color=[0, 255, 0]), + 62: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=62, + color=[0, 255, 0]), + 63: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=63, + color=[0, 255, 0]), + 64: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=64, + color=[0, 255, 0]) + }, + joint_weights=[1.] * 133, + # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/' + # 'evaluation/myeval_wholebody.py#L175' + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.068, 0.066, 0.066, + 0.092, 0.094, 0.094, 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, 0.031, + 0.025, 0.020, 0.023, 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, 0.045, + 0.013, 0.012, 0.011, 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, 0.015, + 0.009, 0.007, 0.007, 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, 0.017, + 0.011, 0.009, 0.011, 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010, + 0.034, 0.008, 0.008, 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009, + 0.009, 0.009, 0.007, 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01, + 0.008, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035, + 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019, + 0.022, 0.031, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, + 0.035, 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, + 0.019, 0.022, 0.031 + ]) diff --git a/configs/_base_/datasets/coco_wholebody_face.py b/configs/_base_/datasets/coco_wholebody_face.py new file mode 100644 index 0000000..7c9ee33 --- /dev/null +++ b/configs/_base_/datasets/coco_wholebody_face.py @@ -0,0 +1,448 @@ +dataset_info = dict( + dataset_name='coco_wholebody_face', + paper_info=dict( + author='Jin, Sheng and Xu, Lumin and Xu, Jin and ' + 'Wang, Can and Liu, Wentao and ' + 'Qian, Chen and Ouyang, Wanli and Luo, Ping', + title='Whole-Body Human Pose Estimation in the Wild', + container='Proceedings of the European ' + 'Conference on Computer Vision (ECCV)', + year='2020', + homepage='https://github.com/jin-s13/COCO-WholeBody/', + ), + keypoint_info={ + 0: + dict( + name='face-0', + id=0, + color=[255, 255, 255], + type='', + swap='face-16'), + 1: + dict( + name='face-1', + id=1, + color=[255, 255, 255], + type='', + swap='face-15'), + 2: + dict( + name='face-2', + id=2, + color=[255, 255, 255], + type='', + swap='face-14'), + 3: + dict( + name='face-3', + id=3, + color=[255, 255, 255], + type='', + swap='face-13'), + 4: + dict( + name='face-4', + id=4, + color=[255, 255, 255], + type='', + swap='face-12'), + 5: + dict( + name='face-5', + id=5, + color=[255, 255, 255], + type='', + swap='face-11'), + 6: + dict( + name='face-6', + id=6, + color=[255, 255, 255], + type='', + swap='face-10'), + 7: + dict( + name='face-7', id=7, color=[255, 255, 255], type='', + swap='face-9'), + 8: + dict(name='face-8', id=8, color=[255, 255, 255], type='', swap=''), + 9: + dict( + name='face-9', id=9, color=[255, 255, 255], type='', + swap='face-7'), + 10: + dict( + name='face-10', + id=10, + color=[255, 255, 255], + type='', + swap='face-6'), + 11: + dict( + name='face-11', + id=11, + color=[255, 255, 255], + type='', + swap='face-5'), + 12: + dict( + name='face-12', + id=12, + color=[255, 255, 255], + type='', + swap='face-4'), + 13: + dict( + 
name='face-13', + id=13, + color=[255, 255, 255], + type='', + swap='face-3'), + 14: + dict( + name='face-14', + id=14, + color=[255, 255, 255], + type='', + swap='face-2'), + 15: + dict( + name='face-15', + id=15, + color=[255, 255, 255], + type='', + swap='face-1'), + 16: + dict( + name='face-16', + id=16, + color=[255, 255, 255], + type='', + swap='face-0'), + 17: + dict( + name='face-17', + id=17, + color=[255, 255, 255], + type='', + swap='face-26'), + 18: + dict( + name='face-18', + id=18, + color=[255, 255, 255], + type='', + swap='face-25'), + 19: + dict( + name='face-19', + id=19, + color=[255, 255, 255], + type='', + swap='face-24'), + 20: + dict( + name='face-20', + id=20, + color=[255, 255, 255], + type='', + swap='face-23'), + 21: + dict( + name='face-21', + id=21, + color=[255, 255, 255], + type='', + swap='face-22'), + 22: + dict( + name='face-22', + id=22, + color=[255, 255, 255], + type='', + swap='face-21'), + 23: + dict( + name='face-23', + id=23, + color=[255, 255, 255], + type='', + swap='face-20'), + 24: + dict( + name='face-24', + id=24, + color=[255, 255, 255], + type='', + swap='face-19'), + 25: + dict( + name='face-25', + id=25, + color=[255, 255, 255], + type='', + swap='face-18'), + 26: + dict( + name='face-26', + id=26, + color=[255, 255, 255], + type='', + swap='face-17'), + 27: + dict(name='face-27', id=27, color=[255, 255, 255], type='', swap=''), + 28: + dict(name='face-28', id=28, color=[255, 255, 255], type='', swap=''), + 29: + dict(name='face-29', id=29, color=[255, 255, 255], type='', swap=''), + 30: + dict(name='face-30', id=30, color=[255, 255, 255], type='', swap=''), + 31: + dict( + name='face-31', + id=31, + color=[255, 255, 255], + type='', + swap='face-35'), + 32: + dict( + name='face-32', + id=32, + color=[255, 255, 255], + type='', + swap='face-34'), + 33: + dict(name='face-33', id=33, color=[255, 255, 255], type='', swap=''), + 34: + dict( + name='face-34', + id=34, + color=[255, 255, 255], + type='', + swap='face-32'), + 35: + dict( + name='face-35', + id=35, + color=[255, 255, 255], + type='', + swap='face-31'), + 36: + dict( + name='face-36', + id=36, + color=[255, 255, 255], + type='', + swap='face-45'), + 37: + dict( + name='face-37', + id=37, + color=[255, 255, 255], + type='', + swap='face-44'), + 38: + dict( + name='face-38', + id=38, + color=[255, 255, 255], + type='', + swap='face-43'), + 39: + dict( + name='face-39', + id=39, + color=[255, 255, 255], + type='', + swap='face-42'), + 40: + dict( + name='face-40', + id=40, + color=[255, 255, 255], + type='', + swap='face-47'), + 41: + dict( + name='face-41', + id=41, + color=[255, 255, 255], + type='', + swap='face-46'), + 42: + dict( + name='face-42', + id=42, + color=[255, 255, 255], + type='', + swap='face-39'), + 43: + dict( + name='face-43', + id=43, + color=[255, 255, 255], + type='', + swap='face-38'), + 44: + dict( + name='face-44', + id=44, + color=[255, 255, 255], + type='', + swap='face-37'), + 45: + dict( + name='face-45', + id=45, + color=[255, 255, 255], + type='', + swap='face-36'), + 46: + dict( + name='face-46', + id=46, + color=[255, 255, 255], + type='', + swap='face-41'), + 47: + dict( + name='face-47', + id=47, + color=[255, 255, 255], + type='', + swap='face-40'), + 48: + dict( + name='face-48', + id=48, + color=[255, 255, 255], + type='', + swap='face-54'), + 49: + dict( + name='face-49', + id=49, + color=[255, 255, 255], + type='', + swap='face-53'), + 50: + dict( + name='face-50', + id=50, + color=[255, 255, 255], + type='', + swap='face-52'), + 51: + 
dict(name='face-51', id=52, color=[255, 255, 255], type='', swap=''), + 52: + dict( + name='face-52', + id=52, + color=[255, 255, 255], + type='', + swap='face-50'), + 53: + dict( + name='face-53', + id=53, + color=[255, 255, 255], + type='', + swap='face-49'), + 54: + dict( + name='face-54', + id=54, + color=[255, 255, 255], + type='', + swap='face-48'), + 55: + dict( + name='face-55', + id=55, + color=[255, 255, 255], + type='', + swap='face-59'), + 56: + dict( + name='face-56', + id=56, + color=[255, 255, 255], + type='', + swap='face-58'), + 57: + dict(name='face-57', id=57, color=[255, 255, 255], type='', swap=''), + 58: + dict( + name='face-58', + id=58, + color=[255, 255, 255], + type='', + swap='face-56'), + 59: + dict( + name='face-59', + id=59, + color=[255, 255, 255], + type='', + swap='face-55'), + 60: + dict( + name='face-60', + id=60, + color=[255, 255, 255], + type='', + swap='face-64'), + 61: + dict( + name='face-61', + id=61, + color=[255, 255, 255], + type='', + swap='face-63'), + 62: + dict(name='face-62', id=62, color=[255, 255, 255], type='', swap=''), + 63: + dict( + name='face-63', + id=63, + color=[255, 255, 255], + type='', + swap='face-61'), + 64: + dict( + name='face-64', + id=64, + color=[255, 255, 255], + type='', + swap='face-60'), + 65: + dict( + name='face-65', + id=65, + color=[255, 255, 255], + type='', + swap='face-67'), + 66: + dict(name='face-66', id=66, color=[255, 255, 255], type='', swap=''), + 67: + dict( + name='face-67', + id=67, + color=[255, 255, 255], + type='', + swap='face-65') + }, + skeleton_info={}, + joint_weights=[1.] * 68, + + # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/' + # 'evaluation/myeval_wholebody.py#L177' + sigmas=[ + 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, 0.031, 0.025, 0.020, 0.023, + 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, 0.045, 0.013, 0.012, 0.011, + 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, 0.015, 0.009, 0.007, 0.007, + 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, 0.017, 0.011, 0.009, 0.011, + 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010, 0.034, 0.008, 0.008, + 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009, 0.009, 0.009, 0.007, + 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01, 0.008 + ]) diff --git a/configs/_base_/datasets/coco_wholebody_hand.py b/configs/_base_/datasets/coco_wholebody_hand.py new file mode 100644 index 0000000..1910b2c --- /dev/null +++ b/configs/_base_/datasets/coco_wholebody_hand.py @@ -0,0 +1,147 @@ +dataset_info = dict( + dataset_name='coco_wholebody_hand', + paper_info=dict( + author='Jin, Sheng and Xu, Lumin and Xu, Jin and ' + 'Wang, Can and Liu, Wentao and ' + 'Qian, Chen and Ouyang, Wanli and Luo, Ping', + title='Whole-Body Human Pose Estimation in the Wild', + container='Proceedings of the European ' + 'Conference on Computer Vision (ECCV)', + year='2020', + homepage='https://github.com/jin-s13/COCO-WholeBody/', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger4', 
id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] 
* 21, + sigmas=[ + 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035, 0.018, + 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019, 0.022, + 0.031 + ]) diff --git a/configs/_base_/datasets/coco_wholebody_info.py b/configs/_base_/datasets/coco_wholebody_info.py new file mode 100644 index 0000000..50ac8fe --- /dev/null +++ b/configs/_base_/datasets/coco_wholebody_info.py @@ -0,0 +1,1154 @@ +cocowholebody_info = dict( + dataset_name='coco_wholebody', + paper_info=dict( + author='Jin, Sheng and Xu, Lumin and Xu, Jin and ' + 'Wang, Can and Liu, Wentao and ' + 'Qian, Chen and Ouyang, Wanli and Luo, Ping', + title='Whole-Body Human Pose Estimation in the Wild', + container='Proceedings of the European ' + 'Conference on Computer Vision (ECCV)', + year='2020', + homepage='https://github.com/jin-s13/COCO-WholeBody/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict( + name='left_big_toe', + id=17, + color=[255, 128, 0], + type='lower', + swap='right_big_toe'), + 18: + dict( + name='left_small_toe', + id=18, + color=[255, 128, 0], + type='lower', + swap='right_small_toe'), + 19: + dict( + name='left_heel', + id=19, + color=[255, 128, 0], + type='lower', + swap='right_heel'), + 20: + dict( + name='right_big_toe', + id=20, + color=[255, 128, 0], + type='lower', + swap='left_big_toe'), + 21: + dict( + name='right_small_toe', + id=21, + color=[255, 128, 0], + type='lower', + swap='left_small_toe'), + 22: + dict( + name='right_heel', + id=22, + color=[255, 128, 0], + type='lower', + swap='left_heel'), + 23: + dict( + name='face-0', + id=23, + color=[255, 255, 255], + type='', + swap='face-16'), + 24: + dict( + name='face-1', + id=24, + color=[255, 255, 255], + type='', + swap='face-15'), + 25: + dict( + name='face-2', + id=25, + color=[255, 255, 255], + type='', + swap='face-14'), + 26: + dict( + 
name='face-3', + id=26, + color=[255, 255, 255], + type='', + swap='face-13'), + 27: + dict( + name='face-4', + id=27, + color=[255, 255, 255], + type='', + swap='face-12'), + 28: + dict( + name='face-5', + id=28, + color=[255, 255, 255], + type='', + swap='face-11'), + 29: + dict( + name='face-6', + id=29, + color=[255, 255, 255], + type='', + swap='face-10'), + 30: + dict( + name='face-7', + id=30, + color=[255, 255, 255], + type='', + swap='face-9'), + 31: + dict(name='face-8', id=31, color=[255, 255, 255], type='', swap=''), + 32: + dict( + name='face-9', + id=32, + color=[255, 255, 255], + type='', + swap='face-7'), + 33: + dict( + name='face-10', + id=33, + color=[255, 255, 255], + type='', + swap='face-6'), + 34: + dict( + name='face-11', + id=34, + color=[255, 255, 255], + type='', + swap='face-5'), + 35: + dict( + name='face-12', + id=35, + color=[255, 255, 255], + type='', + swap='face-4'), + 36: + dict( + name='face-13', + id=36, + color=[255, 255, 255], + type='', + swap='face-3'), + 37: + dict( + name='face-14', + id=37, + color=[255, 255, 255], + type='', + swap='face-2'), + 38: + dict( + name='face-15', + id=38, + color=[255, 255, 255], + type='', + swap='face-1'), + 39: + dict( + name='face-16', + id=39, + color=[255, 255, 255], + type='', + swap='face-0'), + 40: + dict( + name='face-17', + id=40, + color=[255, 255, 255], + type='', + swap='face-26'), + 41: + dict( + name='face-18', + id=41, + color=[255, 255, 255], + type='', + swap='face-25'), + 42: + dict( + name='face-19', + id=42, + color=[255, 255, 255], + type='', + swap='face-24'), + 43: + dict( + name='face-20', + id=43, + color=[255, 255, 255], + type='', + swap='face-23'), + 44: + dict( + name='face-21', + id=44, + color=[255, 255, 255], + type='', + swap='face-22'), + 45: + dict( + name='face-22', + id=45, + color=[255, 255, 255], + type='', + swap='face-21'), + 46: + dict( + name='face-23', + id=46, + color=[255, 255, 255], + type='', + swap='face-20'), + 47: + dict( + name='face-24', + id=47, + color=[255, 255, 255], + type='', + swap='face-19'), + 48: + dict( + name='face-25', + id=48, + color=[255, 255, 255], + type='', + swap='face-18'), + 49: + dict( + name='face-26', + id=49, + color=[255, 255, 255], + type='', + swap='face-17'), + 50: + dict(name='face-27', id=50, color=[255, 255, 255], type='', swap=''), + 51: + dict(name='face-28', id=51, color=[255, 255, 255], type='', swap=''), + 52: + dict(name='face-29', id=52, color=[255, 255, 255], type='', swap=''), + 53: + dict(name='face-30', id=53, color=[255, 255, 255], type='', swap=''), + 54: + dict( + name='face-31', + id=54, + color=[255, 255, 255], + type='', + swap='face-35'), + 55: + dict( + name='face-32', + id=55, + color=[255, 255, 255], + type='', + swap='face-34'), + 56: + dict(name='face-33', id=56, color=[255, 255, 255], type='', swap=''), + 57: + dict( + name='face-34', + id=57, + color=[255, 255, 255], + type='', + swap='face-32'), + 58: + dict( + name='face-35', + id=58, + color=[255, 255, 255], + type='', + swap='face-31'), + 59: + dict( + name='face-36', + id=59, + color=[255, 255, 255], + type='', + swap='face-45'), + 60: + dict( + name='face-37', + id=60, + color=[255, 255, 255], + type='', + swap='face-44'), + 61: + dict( + name='face-38', + id=61, + color=[255, 255, 255], + type='', + swap='face-43'), + 62: + dict( + name='face-39', + id=62, + color=[255, 255, 255], + type='', + swap='face-42'), + 63: + dict( + name='face-40', + id=63, + color=[255, 255, 255], + type='', + swap='face-47'), + 64: + dict( + name='face-41', + id=64, + 
color=[255, 255, 255], + type='', + swap='face-46'), + 65: + dict( + name='face-42', + id=65, + color=[255, 255, 255], + type='', + swap='face-39'), + 66: + dict( + name='face-43', + id=66, + color=[255, 255, 255], + type='', + swap='face-38'), + 67: + dict( + name='face-44', + id=67, + color=[255, 255, 255], + type='', + swap='face-37'), + 68: + dict( + name='face-45', + id=68, + color=[255, 255, 255], + type='', + swap='face-36'), + 69: + dict( + name='face-46', + id=69, + color=[255, 255, 255], + type='', + swap='face-41'), + 70: + dict( + name='face-47', + id=70, + color=[255, 255, 255], + type='', + swap='face-40'), + 71: + dict( + name='face-48', + id=71, + color=[255, 255, 255], + type='', + swap='face-54'), + 72: + dict( + name='face-49', + id=72, + color=[255, 255, 255], + type='', + swap='face-53'), + 73: + dict( + name='face-50', + id=73, + color=[255, 255, 255], + type='', + swap='face-52'), + 74: + dict(name='face-51', id=74, color=[255, 255, 255], type='', swap=''), + 75: + dict( + name='face-52', + id=75, + color=[255, 255, 255], + type='', + swap='face-50'), + 76: + dict( + name='face-53', + id=76, + color=[255, 255, 255], + type='', + swap='face-49'), + 77: + dict( + name='face-54', + id=77, + color=[255, 255, 255], + type='', + swap='face-48'), + 78: + dict( + name='face-55', + id=78, + color=[255, 255, 255], + type='', + swap='face-59'), + 79: + dict( + name='face-56', + id=79, + color=[255, 255, 255], + type='', + swap='face-58'), + 80: + dict(name='face-57', id=80, color=[255, 255, 255], type='', swap=''), + 81: + dict( + name='face-58', + id=81, + color=[255, 255, 255], + type='', + swap='face-56'), + 82: + dict( + name='face-59', + id=82, + color=[255, 255, 255], + type='', + swap='face-55'), + 83: + dict( + name='face-60', + id=83, + color=[255, 255, 255], + type='', + swap='face-64'), + 84: + dict( + name='face-61', + id=84, + color=[255, 255, 255], + type='', + swap='face-63'), + 85: + dict(name='face-62', id=85, color=[255, 255, 255], type='', swap=''), + 86: + dict( + name='face-63', + id=86, + color=[255, 255, 255], + type='', + swap='face-61'), + 87: + dict( + name='face-64', + id=87, + color=[255, 255, 255], + type='', + swap='face-60'), + 88: + dict( + name='face-65', + id=88, + color=[255, 255, 255], + type='', + swap='face-67'), + 89: + dict(name='face-66', id=89, color=[255, 255, 255], type='', swap=''), + 90: + dict( + name='face-67', + id=90, + color=[255, 255, 255], + type='', + swap='face-65'), + 91: + dict( + name='left_hand_root', + id=91, + color=[255, 255, 255], + type='', + swap='right_hand_root'), + 92: + dict( + name='left_thumb1', + id=92, + color=[255, 128, 0], + type='', + swap='right_thumb1'), + 93: + dict( + name='left_thumb2', + id=93, + color=[255, 128, 0], + type='', + swap='right_thumb2'), + 94: + dict( + name='left_thumb3', + id=94, + color=[255, 128, 0], + type='', + swap='right_thumb3'), + 95: + dict( + name='left_thumb4', + id=95, + color=[255, 128, 0], + type='', + swap='right_thumb4'), + 96: + dict( + name='left_forefinger1', + id=96, + color=[255, 153, 255], + type='', + swap='right_forefinger1'), + 97: + dict( + name='left_forefinger2', + id=97, + color=[255, 153, 255], + type='', + swap='right_forefinger2'), + 98: + dict( + name='left_forefinger3', + id=98, + color=[255, 153, 255], + type='', + swap='right_forefinger3'), + 99: + dict( + name='left_forefinger4', + id=99, + color=[255, 153, 255], + type='', + swap='right_forefinger4'), + 100: + dict( + name='left_middle_finger1', + id=100, + color=[102, 178, 255], + type='', + 
swap='right_middle_finger1'), + 101: + dict( + name='left_middle_finger2', + id=101, + color=[102, 178, 255], + type='', + swap='right_middle_finger2'), + 102: + dict( + name='left_middle_finger3', + id=102, + color=[102, 178, 255], + type='', + swap='right_middle_finger3'), + 103: + dict( + name='left_middle_finger4', + id=103, + color=[102, 178, 255], + type='', + swap='right_middle_finger4'), + 104: + dict( + name='left_ring_finger1', + id=104, + color=[255, 51, 51], + type='', + swap='right_ring_finger1'), + 105: + dict( + name='left_ring_finger2', + id=105, + color=[255, 51, 51], + type='', + swap='right_ring_finger2'), + 106: + dict( + name='left_ring_finger3', + id=106, + color=[255, 51, 51], + type='', + swap='right_ring_finger3'), + 107: + dict( + name='left_ring_finger4', + id=107, + color=[255, 51, 51], + type='', + swap='right_ring_finger4'), + 108: + dict( + name='left_pinky_finger1', + id=108, + color=[0, 255, 0], + type='', + swap='right_pinky_finger1'), + 109: + dict( + name='left_pinky_finger2', + id=109, + color=[0, 255, 0], + type='', + swap='right_pinky_finger2'), + 110: + dict( + name='left_pinky_finger3', + id=110, + color=[0, 255, 0], + type='', + swap='right_pinky_finger3'), + 111: + dict( + name='left_pinky_finger4', + id=111, + color=[0, 255, 0], + type='', + swap='right_pinky_finger4'), + 112: + dict( + name='right_hand_root', + id=112, + color=[255, 255, 255], + type='', + swap='left_hand_root'), + 113: + dict( + name='right_thumb1', + id=113, + color=[255, 128, 0], + type='', + swap='left_thumb1'), + 114: + dict( + name='right_thumb2', + id=114, + color=[255, 128, 0], + type='', + swap='left_thumb2'), + 115: + dict( + name='right_thumb3', + id=115, + color=[255, 128, 0], + type='', + swap='left_thumb3'), + 116: + dict( + name='right_thumb4', + id=116, + color=[255, 128, 0], + type='', + swap='left_thumb4'), + 117: + dict( + name='right_forefinger1', + id=117, + color=[255, 153, 255], + type='', + swap='left_forefinger1'), + 118: + dict( + name='right_forefinger2', + id=118, + color=[255, 153, 255], + type='', + swap='left_forefinger2'), + 119: + dict( + name='right_forefinger3', + id=119, + color=[255, 153, 255], + type='', + swap='left_forefinger3'), + 120: + dict( + name='right_forefinger4', + id=120, + color=[255, 153, 255], + type='', + swap='left_forefinger4'), + 121: + dict( + name='right_middle_finger1', + id=121, + color=[102, 178, 255], + type='', + swap='left_middle_finger1'), + 122: + dict( + name='right_middle_finger2', + id=122, + color=[102, 178, 255], + type='', + swap='left_middle_finger2'), + 123: + dict( + name='right_middle_finger3', + id=123, + color=[102, 178, 255], + type='', + swap='left_middle_finger3'), + 124: + dict( + name='right_middle_finger4', + id=124, + color=[102, 178, 255], + type='', + swap='left_middle_finger4'), + 125: + dict( + name='right_ring_finger1', + id=125, + color=[255, 51, 51], + type='', + swap='left_ring_finger1'), + 126: + dict( + name='right_ring_finger2', + id=126, + color=[255, 51, 51], + type='', + swap='left_ring_finger2'), + 127: + dict( + name='right_ring_finger3', + id=127, + color=[255, 51, 51], + type='', + swap='left_ring_finger3'), + 128: + dict( + name='right_ring_finger4', + id=128, + color=[255, 51, 51], + type='', + swap='left_ring_finger4'), + 129: + dict( + name='right_pinky_finger1', + id=129, + color=[0, 255, 0], + type='', + swap='left_pinky_finger1'), + 130: + dict( + name='right_pinky_finger2', + id=130, + color=[0, 255, 0], + type='', + swap='left_pinky_finger2'), + 131: + dict( + 
name='right_pinky_finger3', + id=131, + color=[0, 255, 0], + type='', + swap='left_pinky_finger3'), + 132: + dict( + name='right_pinky_finger4', + id=132, + color=[0, 255, 0], + type='', + swap='left_pinky_finger4') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]), + 19: + dict(link=('left_ankle', 'left_big_toe'), id=19, color=[0, 255, 0]), + 20: + dict(link=('left_ankle', 'left_small_toe'), id=20, color=[0, 255, 0]), + 21: + dict(link=('left_ankle', 'left_heel'), id=21, color=[0, 255, 0]), + 22: + dict( + link=('right_ankle', 'right_big_toe'), id=22, color=[255, 128, 0]), + 23: + dict( + link=('right_ankle', 'right_small_toe'), + id=23, + color=[255, 128, 0]), + 24: + dict(link=('right_ankle', 'right_heel'), id=24, color=[255, 128, 0]), + 25: + dict( + link=('left_hand_root', 'left_thumb1'), id=25, color=[255, 128, + 0]), + 26: + dict(link=('left_thumb1', 'left_thumb2'), id=26, color=[255, 128, 0]), + 27: + dict(link=('left_thumb2', 'left_thumb3'), id=27, color=[255, 128, 0]), + 28: + dict(link=('left_thumb3', 'left_thumb4'), id=28, color=[255, 128, 0]), + 29: + dict( + link=('left_hand_root', 'left_forefinger1'), + id=29, + color=[255, 153, 255]), + 30: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=30, + color=[255, 153, 255]), + 31: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=31, + color=[255, 153, 255]), + 32: + dict( + link=('left_forefinger3', 'left_forefinger4'), + id=32, + color=[255, 153, 255]), + 33: + dict( + link=('left_hand_root', 'left_middle_finger1'), + id=33, + color=[102, 178, 255]), + 34: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=34, + color=[102, 178, 255]), + 35: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=35, + color=[102, 178, 255]), + 36: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=36, + color=[102, 178, 255]), + 37: + dict( + link=('left_hand_root', 'left_ring_finger1'), + id=37, + color=[255, 51, 51]), + 38: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=38, + color=[255, 51, 51]), + 39: + dict( + link=('left_ring_finger2', 
'left_ring_finger3'), + id=39, + color=[255, 51, 51]), + 40: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=40, + color=[255, 51, 51]), + 41: + dict( + link=('left_hand_root', 'left_pinky_finger1'), + id=41, + color=[0, 255, 0]), + 42: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=42, + color=[0, 255, 0]), + 43: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=43, + color=[0, 255, 0]), + 44: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=44, + color=[0, 255, 0]), + 45: + dict( + link=('right_hand_root', 'right_thumb1'), + id=45, + color=[255, 128, 0]), + 46: + dict( + link=('right_thumb1', 'right_thumb2'), id=46, color=[255, 128, 0]), + 47: + dict( + link=('right_thumb2', 'right_thumb3'), id=47, color=[255, 128, 0]), + 48: + dict( + link=('right_thumb3', 'right_thumb4'), id=48, color=[255, 128, 0]), + 49: + dict( + link=('right_hand_root', 'right_forefinger1'), + id=49, + color=[255, 153, 255]), + 50: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=50, + color=[255, 153, 255]), + 51: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=51, + color=[255, 153, 255]), + 52: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=52, + color=[255, 153, 255]), + 53: + dict( + link=('right_hand_root', 'right_middle_finger1'), + id=53, + color=[102, 178, 255]), + 54: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=54, + color=[102, 178, 255]), + 55: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=55, + color=[102, 178, 255]), + 56: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=56, + color=[102, 178, 255]), + 57: + dict( + link=('right_hand_root', 'right_ring_finger1'), + id=57, + color=[255, 51, 51]), + 58: + dict( + link=('right_ring_finger1', 'right_ring_finger2'), + id=58, + color=[255, 51, 51]), + 59: + dict( + link=('right_ring_finger2', 'right_ring_finger3'), + id=59, + color=[255, 51, 51]), + 60: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=60, + color=[255, 51, 51]), + 61: + dict( + link=('right_hand_root', 'right_pinky_finger1'), + id=61, + color=[0, 255, 0]), + 62: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=62, + color=[0, 255, 0]), + 63: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=63, + color=[0, 255, 0]), + 64: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=64, + color=[0, 255, 0]) + }, + joint_weights=[1.] 
* 133, + # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/' + # 'evaluation/myeval_wholebody.py#L175' + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.068, 0.066, 0.066, + 0.092, 0.094, 0.094, 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, 0.031, + 0.025, 0.020, 0.023, 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, 0.045, + 0.013, 0.012, 0.011, 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, 0.015, + 0.009, 0.007, 0.007, 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, 0.017, + 0.011, 0.009, 0.011, 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010, + 0.034, 0.008, 0.008, 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009, + 0.009, 0.009, 0.007, 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01, + 0.008, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035, + 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019, + 0.022, 0.031, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, + 0.035, 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, + 0.019, 0.022, 0.031 + ]) diff --git a/configs/_base_/datasets/cofw.py b/configs/_base_/datasets/cofw.py new file mode 100644 index 0000000..2fb7ad2 --- /dev/null +++ b/configs/_base_/datasets/cofw.py @@ -0,0 +1,134 @@ +dataset_info = dict( + dataset_name='cofw', + paper_info=dict( + author='Burgos-Artizzu, Xavier P and Perona, ' + r'Pietro and Doll{\'a}r, Piotr', + title='Robust face landmark estimation under occlusion', + container='Proceedings of the IEEE international ' + 'conference on computer vision', + year='2013', + homepage='http://www.vision.caltech.edu/xpburgos/ICCV13/', + ), + keypoint_info={ + 0: + dict(name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-1'), + 1: + dict(name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-0'), + 2: + dict(name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-3'), + 3: + dict(name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-2'), + 4: + dict(name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-6'), + 5: + dict(name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-7'), + 6: + dict(name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-4'), + 7: + dict(name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-5'), + 8: + dict(name='kpt-8', id=8, color=[255, 255, 255], type='', swap='kpt-9'), + 9: + dict(name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-8'), + 10: + dict( + name='kpt-10', + id=10, + color=[255, 255, 255], + type='', + swap='kpt-11'), + 11: + dict( + name='kpt-11', + id=11, + color=[255, 255, 255], + type='', + swap='kpt-10'), + 12: + dict( + name='kpt-12', + id=12, + color=[255, 255, 255], + type='', + swap='kpt-14'), + 13: + dict( + name='kpt-13', + id=13, + color=[255, 255, 255], + type='', + swap='kpt-15'), + 14: + dict( + name='kpt-14', + id=14, + color=[255, 255, 255], + type='', + swap='kpt-12'), + 15: + dict( + name='kpt-15', + id=15, + color=[255, 255, 255], + type='', + swap='kpt-13'), + 16: + dict( + name='kpt-16', + id=16, + color=[255, 255, 255], + type='', + swap='kpt-17'), + 17: + dict( + name='kpt-17', + id=17, + color=[255, 255, 255], + type='', + swap='kpt-16'), + 18: + dict( + name='kpt-18', + id=18, + color=[255, 255, 255], + type='', + swap='kpt-19'), + 19: + dict( + name='kpt-19', + id=19, + color=[255, 255, 255], + type='', + swap='kpt-18'), + 20: + dict(name='kpt-20', id=20, color=[255, 255, 255], type='', swap=''), + 21: + dict(name='kpt-21', id=21, color=[255, 255, 255], type='', swap=''), + 
22: + dict( + name='kpt-22', + id=22, + color=[255, 255, 255], + type='', + swap='kpt-23'), + 23: + dict( + name='kpt-23', + id=23, + color=[255, 255, 255], + type='', + swap='kpt-22'), + 24: + dict(name='kpt-24', id=24, color=[255, 255, 255], type='', swap=''), + 25: + dict(name='kpt-25', id=25, color=[255, 255, 255], type='', swap=''), + 26: + dict(name='kpt-26', id=26, color=[255, 255, 255], type='', swap=''), + 27: + dict(name='kpt-27', id=27, color=[255, 255, 255], type='', swap=''), + 28: + dict(name='kpt-28', id=28, color=[255, 255, 255], type='', swap='') + }, + skeleton_info={}, + joint_weights=[1.] * 29, + sigmas=[]) diff --git a/configs/_base_/datasets/crowdpose.py b/configs/_base_/datasets/crowdpose.py new file mode 100644 index 0000000..4508653 --- /dev/null +++ b/configs/_base_/datasets/crowdpose.py @@ -0,0 +1,147 @@ +dataset_info = dict( + dataset_name='crowdpose', + paper_info=dict( + author='Li, Jiefeng and Wang, Can and Zhu, Hao and ' + 'Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu', + title='CrowdPose: Efficient Crowded Scenes Pose Estimation ' + 'and A New Benchmark', + container='Proceedings of IEEE Conference on Computer ' + 'Vision and Pattern Recognition (CVPR)', + year='2019', + homepage='https://github.com/Jeff-sjtu/CrowdPose', + ), + keypoint_info={ + 0: + dict( + name='left_shoulder', + id=0, + color=[51, 153, 255], + type='upper', + swap='right_shoulder'), + 1: + dict( + name='right_shoulder', + id=1, + color=[51, 153, 255], + type='upper', + swap='left_shoulder'), + 2: + dict( + name='left_elbow', + id=2, + color=[51, 153, 255], + type='upper', + swap='right_elbow'), + 3: + dict( + name='right_elbow', + id=3, + color=[51, 153, 255], + type='upper', + swap='left_elbow'), + 4: + dict( + name='left_wrist', + id=4, + color=[51, 153, 255], + type='upper', + swap='right_wrist'), + 5: + dict( + name='right_wrist', + id=5, + color=[0, 255, 0], + type='upper', + swap='left_wrist'), + 6: + dict( + name='left_hip', + id=6, + color=[255, 128, 0], + type='lower', + swap='right_hip'), + 7: + dict( + name='right_hip', + id=7, + color=[0, 255, 0], + type='lower', + swap='left_hip'), + 8: + dict( + name='left_knee', + id=8, + color=[255, 128, 0], + type='lower', + swap='right_knee'), + 9: + dict( + name='right_knee', + id=9, + color=[0, 255, 0], + type='lower', + swap='left_knee'), + 10: + dict( + name='left_ankle', + id=10, + color=[255, 128, 0], + type='lower', + swap='right_ankle'), + 11: + dict( + name='right_ankle', + id=11, + color=[0, 255, 0], + type='lower', + swap='left_ankle'), + 12: + dict( + name='top_head', id=12, color=[255, 128, 0], type='upper', + swap=''), + 13: + dict(name='neck', id=13, color=[0, 255, 0], type='upper', swap='') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, 
color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('top_head', 'neck'), id=12, color=[51, 153, 255]), + 13: + dict(link=('right_shoulder', 'neck'), id=13, color=[51, 153, 255]), + 14: + dict(link=('left_shoulder', 'neck'), id=14, color=[51, 153, 255]) + }, + joint_weights=[ + 0.2, 0.2, 0.2, 1.3, 1.5, 0.2, 1.3, 1.5, 0.2, 0.2, 0.5, 0.2, 0.2, 0.5 + ], + sigmas=[ + 0.079, 0.079, 0.072, 0.072, 0.062, 0.062, 0.107, 0.107, 0.087, 0.087, + 0.089, 0.089, 0.079, 0.079 + ]) diff --git a/configs/_base_/datasets/deepfashion_full.py b/configs/_base_/datasets/deepfashion_full.py new file mode 100644 index 0000000..4d98906 --- /dev/null +++ b/configs/_base_/datasets/deepfashion_full.py @@ -0,0 +1,74 @@ +dataset_info = dict( + dataset_name='deepfashion_full', + paper_info=dict( + author='Liu, Ziwei and Luo, Ping and Qiu, Shi ' + 'and Wang, Xiaogang and Tang, Xiaoou', + title='DeepFashion: Powering Robust Clothes Recognition ' + 'and Retrieval with Rich Annotations', + container='Proceedings of IEEE Conference on Computer ' + 'Vision and Pattern Recognition (CVPR)', + year='2016', + homepage='http://mmlab.ie.cuhk.edu.hk/projects/' + 'DeepFashion/LandmarkDetection.html', + ), + keypoint_info={ + 0: + dict( + name='left collar', + id=0, + color=[255, 255, 255], + type='', + swap='right collar'), + 1: + dict( + name='right collar', + id=1, + color=[255, 255, 255], + type='', + swap='left collar'), + 2: + dict( + name='left sleeve', + id=2, + color=[255, 255, 255], + type='', + swap='right sleeve'), + 3: + dict( + name='right sleeve', + id=3, + color=[255, 255, 255], + type='', + swap='left sleeve'), + 4: + dict( + name='left waistline', + id=4, + color=[255, 255, 255], + type='', + swap='right waistline'), + 5: + dict( + name='right waistline', + id=5, + color=[255, 255, 255], + type='', + swap='left waistline'), + 6: + dict( + name='left hem', + id=6, + color=[255, 255, 255], + type='', + swap='right hem'), + 7: + dict( + name='right hem', + id=7, + color=[255, 255, 255], + type='', + swap='left hem'), + }, + skeleton_info={}, + joint_weights=[1.] * 8, + sigmas=[]) diff --git a/configs/_base_/datasets/deepfashion_lower.py b/configs/_base_/datasets/deepfashion_lower.py new file mode 100644 index 0000000..db014a1 --- /dev/null +++ b/configs/_base_/datasets/deepfashion_lower.py @@ -0,0 +1,46 @@ +dataset_info = dict( + dataset_name='deepfashion_lower', + paper_info=dict( + author='Liu, Ziwei and Luo, Ping and Qiu, Shi ' + 'and Wang, Xiaogang and Tang, Xiaoou', + title='DeepFashion: Powering Robust Clothes Recognition ' + 'and Retrieval with Rich Annotations', + container='Proceedings of IEEE Conference on Computer ' + 'Vision and Pattern Recognition (CVPR)', + year='2016', + homepage='http://mmlab.ie.cuhk.edu.hk/projects/' + 'DeepFashion/LandmarkDetection.html', + ), + keypoint_info={ + 0: + dict( + name='left waistline', + id=0, + color=[255, 255, 255], + type='', + swap='right waistline'), + 1: + dict( + name='right waistline', + id=1, + color=[255, 255, 255], + type='', + swap='left waistline'), + 2: + dict( + name='left hem', + id=2, + color=[255, 255, 255], + type='', + swap='right hem'), + 3: + dict( + name='right hem', + id=3, + color=[255, 255, 255], + type='', + swap='left hem'), + }, + skeleton_info={}, + joint_weights=[1.] 
* 4, + sigmas=[]) diff --git a/configs/_base_/datasets/deepfashion_upper.py b/configs/_base_/datasets/deepfashion_upper.py new file mode 100644 index 0000000..f0b012f --- /dev/null +++ b/configs/_base_/datasets/deepfashion_upper.py @@ -0,0 +1,60 @@ +dataset_info = dict( + dataset_name='deepfashion_upper', + paper_info=dict( + author='Liu, Ziwei and Luo, Ping and Qiu, Shi ' + 'and Wang, Xiaogang and Tang, Xiaoou', + title='DeepFashion: Powering Robust Clothes Recognition ' + 'and Retrieval with Rich Annotations', + container='Proceedings of IEEE Conference on Computer ' + 'Vision and Pattern Recognition (CVPR)', + year='2016', + homepage='http://mmlab.ie.cuhk.edu.hk/projects/' + 'DeepFashion/LandmarkDetection.html', + ), + keypoint_info={ + 0: + dict( + name='left collar', + id=0, + color=[255, 255, 255], + type='', + swap='right collar'), + 1: + dict( + name='right collar', + id=1, + color=[255, 255, 255], + type='', + swap='left collar'), + 2: + dict( + name='left sleeve', + id=2, + color=[255, 255, 255], + type='', + swap='right sleeve'), + 3: + dict( + name='right sleeve', + id=3, + color=[255, 255, 255], + type='', + swap='left sleeve'), + 4: + dict( + name='left hem', + id=4, + color=[255, 255, 255], + type='', + swap='right hem'), + 5: + dict( + name='right hem', + id=5, + color=[255, 255, 255], + type='', + swap='left hem'), + }, + skeleton_info={}, + joint_weights=[1.] * 6, + sigmas=[]) diff --git a/configs/_base_/datasets/fly.py b/configs/_base_/datasets/fly.py new file mode 100644 index 0000000..5f94ff5 --- /dev/null +++ b/configs/_base_/datasets/fly.py @@ -0,0 +1,237 @@ +dataset_info = dict( + dataset_name='fly', + paper_info=dict( + author='Pereira, Talmo D and Aldarondo, Diego E and ' + 'Willmore, Lindsay and Kislin, Mikhail and ' + 'Wang, Samuel S-H and Murthy, Mala and Shaevitz, Joshua W', + title='Fast animal pose estimation using deep neural networks', + container='Nature methods', + year='2019', + homepage='https://github.com/jgraving/DeepPoseKit-Data', + ), + keypoint_info={ + 0: + dict(name='head', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='eyeL', id=1, color=[255, 255, 255], type='', swap='eyeR'), + 2: + dict(name='eyeR', id=2, color=[255, 255, 255], type='', swap='eyeL'), + 3: + dict(name='neck', id=3, color=[255, 255, 255], type='', swap=''), + 4: + dict(name='thorax', id=4, color=[255, 255, 255], type='', swap=''), + 5: + dict(name='abdomen', id=5, color=[255, 255, 255], type='', swap=''), + 6: + dict( + name='forelegR1', + id=6, + color=[255, 255, 255], + type='', + swap='forelegL1'), + 7: + dict( + name='forelegR2', + id=7, + color=[255, 255, 255], + type='', + swap='forelegL2'), + 8: + dict( + name='forelegR3', + id=8, + color=[255, 255, 255], + type='', + swap='forelegL3'), + 9: + dict( + name='forelegR4', + id=9, + color=[255, 255, 255], + type='', + swap='forelegL4'), + 10: + dict( + name='midlegR1', + id=10, + color=[255, 255, 255], + type='', + swap='midlegL1'), + 11: + dict( + name='midlegR2', + id=11, + color=[255, 255, 255], + type='', + swap='midlegL2'), + 12: + dict( + name='midlegR3', + id=12, + color=[255, 255, 255], + type='', + swap='midlegL3'), + 13: + dict( + name='midlegR4', + id=13, + color=[255, 255, 255], + type='', + swap='midlegL4'), + 14: + dict( + name='hindlegR1', + id=14, + color=[255, 255, 255], + type='', + swap='hindlegL1'), + 15: + dict( + name='hindlegR2', + id=15, + color=[255, 255, 255], + type='', + swap='hindlegL2'), + 16: + dict( + name='hindlegR3', + id=16, + color=[255, 255, 255], + type='', + 
swap='hindlegL3'), + 17: + dict( + name='hindlegR4', + id=17, + color=[255, 255, 255], + type='', + swap='hindlegL4'), + 18: + dict( + name='forelegL1', + id=18, + color=[255, 255, 255], + type='', + swap='forelegR1'), + 19: + dict( + name='forelegL2', + id=19, + color=[255, 255, 255], + type='', + swap='forelegR2'), + 20: + dict( + name='forelegL3', + id=20, + color=[255, 255, 255], + type='', + swap='forelegR3'), + 21: + dict( + name='forelegL4', + id=21, + color=[255, 255, 255], + type='', + swap='forelegR4'), + 22: + dict( + name='midlegL1', + id=22, + color=[255, 255, 255], + type='', + swap='midlegR1'), + 23: + dict( + name='midlegL2', + id=23, + color=[255, 255, 255], + type='', + swap='midlegR2'), + 24: + dict( + name='midlegL3', + id=24, + color=[255, 255, 255], + type='', + swap='midlegR3'), + 25: + dict( + name='midlegL4', + id=25, + color=[255, 255, 255], + type='', + swap='midlegR4'), + 26: + dict( + name='hindlegL1', + id=26, + color=[255, 255, 255], + type='', + swap='hindlegR1'), + 27: + dict( + name='hindlegL2', + id=27, + color=[255, 255, 255], + type='', + swap='hindlegR2'), + 28: + dict( + name='hindlegL3', + id=28, + color=[255, 255, 255], + type='', + swap='hindlegR3'), + 29: + dict( + name='hindlegL4', + id=29, + color=[255, 255, 255], + type='', + swap='hindlegR4'), + 30: + dict( + name='wingL', id=30, color=[255, 255, 255], type='', swap='wingR'), + 31: + dict( + name='wingR', id=31, color=[255, 255, 255], type='', swap='wingL'), + }, + skeleton_info={ + 0: dict(link=('eyeL', 'head'), id=0, color=[255, 255, 255]), + 1: dict(link=('eyeR', 'head'), id=1, color=[255, 255, 255]), + 2: dict(link=('neck', 'head'), id=2, color=[255, 255, 255]), + 3: dict(link=('thorax', 'neck'), id=3, color=[255, 255, 255]), + 4: dict(link=('abdomen', 'thorax'), id=4, color=[255, 255, 255]), + 5: dict(link=('forelegR2', 'forelegR1'), id=5, color=[255, 255, 255]), + 6: dict(link=('forelegR3', 'forelegR2'), id=6, color=[255, 255, 255]), + 7: dict(link=('forelegR4', 'forelegR3'), id=7, color=[255, 255, 255]), + 8: dict(link=('midlegR2', 'midlegR1'), id=8, color=[255, 255, 255]), + 9: dict(link=('midlegR3', 'midlegR2'), id=9, color=[255, 255, 255]), + 10: dict(link=('midlegR4', 'midlegR3'), id=10, color=[255, 255, 255]), + 11: + dict(link=('hindlegR2', 'hindlegR1'), id=11, color=[255, 255, 255]), + 12: + dict(link=('hindlegR3', 'hindlegR2'), id=12, color=[255, 255, 255]), + 13: + dict(link=('hindlegR4', 'hindlegR3'), id=13, color=[255, 255, 255]), + 14: + dict(link=('forelegL2', 'forelegL1'), id=14, color=[255, 255, 255]), + 15: + dict(link=('forelegL3', 'forelegL2'), id=15, color=[255, 255, 255]), + 16: + dict(link=('forelegL4', 'forelegL3'), id=16, color=[255, 255, 255]), + 17: dict(link=('midlegL2', 'midlegL1'), id=17, color=[255, 255, 255]), + 18: dict(link=('midlegL3', 'midlegL2'), id=18, color=[255, 255, 255]), + 19: dict(link=('midlegL4', 'midlegL3'), id=19, color=[255, 255, 255]), + 20: + dict(link=('hindlegL2', 'hindlegL1'), id=20, color=[255, 255, 255]), + 21: + dict(link=('hindlegL3', 'hindlegL2'), id=21, color=[255, 255, 255]), + 22: + dict(link=('hindlegL4', 'hindlegL3'), id=22, color=[255, 255, 255]), + 23: dict(link=('wingL', 'neck'), id=23, color=[255, 255, 255]), + 24: dict(link=('wingR', 'neck'), id=24, color=[255, 255, 255]) + }, + joint_weights=[1.] 
* 32, + sigmas=[]) diff --git a/configs/_base_/datasets/freihand2d.py b/configs/_base_/datasets/freihand2d.py new file mode 100644 index 0000000..8b960d1 --- /dev/null +++ b/configs/_base_/datasets/freihand2d.py @@ -0,0 +1,144 @@ +dataset_info = dict( + dataset_name='freihand', + paper_info=dict( + author='Zimmermann, Christian and Ceylan, Duygu and ' + 'Yang, Jimei and Russell, Bryan and ' + 'Argus, Max and Brox, Thomas', + title='Freihand: A dataset for markerless capture of hand pose ' + 'and shape from single rgb images', + container='Proceedings of the IEEE International ' + 'Conference on Computer Vision', + year='2019', + homepage='https://lmb.informatik.uni-freiburg.de/projects/freihand/', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), 
id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] * 21, + sigmas=[]) diff --git a/configs/_base_/datasets/h36m.py b/configs/_base_/datasets/h36m.py new file mode 100644 index 0000000..00a719d --- /dev/null +++ b/configs/_base_/datasets/h36m.py @@ -0,0 +1,152 @@ +dataset_info = dict( + dataset_name='h36m', + paper_info=dict( + author='Ionescu, Catalin and Papava, Dragos and ' + 'Olaru, Vlad and Sminchisescu, Cristian', + title='Human3.6M: Large Scale Datasets and Predictive ' + 'Methods for 3D Human Sensing in Natural Environments', + container='IEEE Transactions on Pattern Analysis and ' + 'Machine Intelligence', + year='2014', + homepage='http://vision.imar.ro/human3.6m/description.php', + ), + keypoint_info={ + 0: + dict(name='root', id=0, color=[51, 153, 255], type='lower', swap=''), + 1: + dict( + name='right_hip', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 2: + dict( + name='right_knee', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 3: + dict( + name='right_foot', + id=3, + color=[255, 128, 0], + type='lower', + swap='left_foot'), + 4: + dict( + name='left_hip', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 5: + dict( + name='left_knee', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 6: + dict( + name='left_foot', + id=6, + color=[0, 255, 0], + type='lower', + swap='right_foot'), + 7: + dict(name='spine', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict(name='thorax', id=8, color=[51, 153, 255], type='upper', swap=''), + 9: + dict( + name='neck_base', + id=9, + color=[51, 153, 255], + type='upper', + swap=''), + 10: + dict(name='head', id=10, color=[51, 153, 255], type='upper', swap=''), + 11: + dict( + name='left_shoulder', + id=11, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 12: + dict( + name='left_elbow', + id=12, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 13: + dict( + name='left_wrist', + id=13, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 14: + dict( + name='right_shoulder', + id=14, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 15: + dict( + name='right_elbow', + id=15, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 16: + dict( + name='right_wrist', + id=16, + color=[255, 128, 0], + type='upper', + swap='left_wrist') + }, + skeleton_info={ + 0: + dict(link=('root', 'left_hip'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_hip', 'left_knee'), id=1, color=[0, 255, 0]), + 2: + dict(link=('left_knee', 'left_foot'), id=2, color=[0, 255, 0]), + 3: + dict(link=('root', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('right_hip', 'right_knee'), id=4, color=[255, 128, 0]), + 5: + dict(link=('right_knee', 'right_foot'), id=5, color=[255, 128, 0]), + 6: + dict(link=('root', 'spine'), id=6, color=[51, 153, 255]), + 7: + dict(link=('spine', 'thorax'), id=7, color=[51, 153, 255]), + 8: + dict(link=('thorax', 'neck_base'), id=8, color=[51, 153, 255]), + 9: + dict(link=('neck_base', 
'head'), id=9, color=[51, 153, 255]), + 10: + dict(link=('thorax', 'left_shoulder'), id=10, color=[0, 255, 0]), + 11: + dict(link=('left_shoulder', 'left_elbow'), id=11, color=[0, 255, 0]), + 12: + dict(link=('left_elbow', 'left_wrist'), id=12, color=[0, 255, 0]), + 13: + dict(link=('thorax', 'right_shoulder'), id=13, color=[255, 128, 0]), + 14: + dict( + link=('right_shoulder', 'right_elbow'), id=14, color=[255, 128, + 0]), + 15: + dict(link=('right_elbow', 'right_wrist'), id=15, color=[255, 128, 0]) + }, + joint_weights=[1.] * 17, + sigmas=[], + stats_info=dict(bbox_center=(528., 427.), bbox_scale=400.)) diff --git a/configs/_base_/datasets/halpe.py b/configs/_base_/datasets/halpe.py new file mode 100644 index 0000000..1385fe8 --- /dev/null +++ b/configs/_base_/datasets/halpe.py @@ -0,0 +1,1157 @@ +dataset_info = dict( + dataset_name='halpe', + paper_info=dict( + author='Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie' + ' and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu' + ' and Ma, Ze and Chen, Mingyang and Lu, Cewu', + title='PaStaNet: Toward Human Activity Knowledge Engine', + container='CVPR', + year='2020', + homepage='https://github.com/Fang-Haoshu/Halpe-FullBody/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict(name='head', id=17, color=[255, 128, 0], type='upper', swap=''), + 18: + dict(name='neck', id=18, color=[255, 128, 0], type='upper', swap=''), + 19: + dict(name='hip', id=19, color=[255, 128, 0], type='lower', swap=''), + 20: + dict( + name='left_big_toe', + id=20, + color=[255, 128, 0], + type='lower', + swap='right_big_toe'), + 21: + dict( + name='right_big_toe', + id=21, + color=[255, 128, 0], + type='lower', + swap='left_big_toe'), + 22: + dict( + name='left_small_toe', + id=22, + color=[255, 128, 0], + type='lower', + swap='right_small_toe'), + 23: + dict( + 
name='right_small_toe', + id=23, + color=[255, 128, 0], + type='lower', + swap='left_small_toe'), + 24: + dict( + name='left_heel', + id=24, + color=[255, 128, 0], + type='lower', + swap='right_heel'), + 25: + dict( + name='right_heel', + id=25, + color=[255, 128, 0], + type='lower', + swap='left_heel'), + 26: + dict( + name='face-0', + id=26, + color=[255, 255, 255], + type='', + swap='face-16'), + 27: + dict( + name='face-1', + id=27, + color=[255, 255, 255], + type='', + swap='face-15'), + 28: + dict( + name='face-2', + id=28, + color=[255, 255, 255], + type='', + swap='face-14'), + 29: + dict( + name='face-3', + id=29, + color=[255, 255, 255], + type='', + swap='face-13'), + 30: + dict( + name='face-4', + id=30, + color=[255, 255, 255], + type='', + swap='face-12'), + 31: + dict( + name='face-5', + id=31, + color=[255, 255, 255], + type='', + swap='face-11'), + 32: + dict( + name='face-6', + id=32, + color=[255, 255, 255], + type='', + swap='face-10'), + 33: + dict( + name='face-7', + id=33, + color=[255, 255, 255], + type='', + swap='face-9'), + 34: + dict(name='face-8', id=34, color=[255, 255, 255], type='', swap=''), + 35: + dict( + name='face-9', + id=35, + color=[255, 255, 255], + type='', + swap='face-7'), + 36: + dict( + name='face-10', + id=36, + color=[255, 255, 255], + type='', + swap='face-6'), + 37: + dict( + name='face-11', + id=37, + color=[255, 255, 255], + type='', + swap='face-5'), + 38: + dict( + name='face-12', + id=38, + color=[255, 255, 255], + type='', + swap='face-4'), + 39: + dict( + name='face-13', + id=39, + color=[255, 255, 255], + type='', + swap='face-3'), + 40: + dict( + name='face-14', + id=40, + color=[255, 255, 255], + type='', + swap='face-2'), + 41: + dict( + name='face-15', + id=41, + color=[255, 255, 255], + type='', + swap='face-1'), + 42: + dict( + name='face-16', + id=42, + color=[255, 255, 255], + type='', + swap='face-0'), + 43: + dict( + name='face-17', + id=43, + color=[255, 255, 255], + type='', + swap='face-26'), + 44: + dict( + name='face-18', + id=44, + color=[255, 255, 255], + type='', + swap='face-25'), + 45: + dict( + name='face-19', + id=45, + color=[255, 255, 255], + type='', + swap='face-24'), + 46: + dict( + name='face-20', + id=46, + color=[255, 255, 255], + type='', + swap='face-23'), + 47: + dict( + name='face-21', + id=47, + color=[255, 255, 255], + type='', + swap='face-22'), + 48: + dict( + name='face-22', + id=48, + color=[255, 255, 255], + type='', + swap='face-21'), + 49: + dict( + name='face-23', + id=49, + color=[255, 255, 255], + type='', + swap='face-20'), + 50: + dict( + name='face-24', + id=50, + color=[255, 255, 255], + type='', + swap='face-19'), + 51: + dict( + name='face-25', + id=51, + color=[255, 255, 255], + type='', + swap='face-18'), + 52: + dict( + name='face-26', + id=52, + color=[255, 255, 255], + type='', + swap='face-17'), + 53: + dict(name='face-27', id=53, color=[255, 255, 255], type='', swap=''), + 54: + dict(name='face-28', id=54, color=[255, 255, 255], type='', swap=''), + 55: + dict(name='face-29', id=55, color=[255, 255, 255], type='', swap=''), + 56: + dict(name='face-30', id=56, color=[255, 255, 255], type='', swap=''), + 57: + dict( + name='face-31', + id=57, + color=[255, 255, 255], + type='', + swap='face-35'), + 58: + dict( + name='face-32', + id=58, + color=[255, 255, 255], + type='', + swap='face-34'), + 59: + dict(name='face-33', id=59, color=[255, 255, 255], type='', swap=''), + 60: + dict( + name='face-34', + id=60, + color=[255, 255, 255], + type='', + swap='face-32'), + 61: + dict( + 
name='face-35', + id=61, + color=[255, 255, 255], + type='', + swap='face-31'), + 62: + dict( + name='face-36', + id=62, + color=[255, 255, 255], + type='', + swap='face-45'), + 63: + dict( + name='face-37', + id=63, + color=[255, 255, 255], + type='', + swap='face-44'), + 64: + dict( + name='face-38', + id=64, + color=[255, 255, 255], + type='', + swap='face-43'), + 65: + dict( + name='face-39', + id=65, + color=[255, 255, 255], + type='', + swap='face-42'), + 66: + dict( + name='face-40', + id=66, + color=[255, 255, 255], + type='', + swap='face-47'), + 67: + dict( + name='face-41', + id=67, + color=[255, 255, 255], + type='', + swap='face-46'), + 68: + dict( + name='face-42', + id=68, + color=[255, 255, 255], + type='', + swap='face-39'), + 69: + dict( + name='face-43', + id=69, + color=[255, 255, 255], + type='', + swap='face-38'), + 70: + dict( + name='face-44', + id=70, + color=[255, 255, 255], + type='', + swap='face-37'), + 71: + dict( + name='face-45', + id=71, + color=[255, 255, 255], + type='', + swap='face-36'), + 72: + dict( + name='face-46', + id=72, + color=[255, 255, 255], + type='', + swap='face-41'), + 73: + dict( + name='face-47', + id=73, + color=[255, 255, 255], + type='', + swap='face-40'), + 74: + dict( + name='face-48', + id=74, + color=[255, 255, 255], + type='', + swap='face-54'), + 75: + dict( + name='face-49', + id=75, + color=[255, 255, 255], + type='', + swap='face-53'), + 76: + dict( + name='face-50', + id=76, + color=[255, 255, 255], + type='', + swap='face-52'), + 77: + dict(name='face-51', id=77, color=[255, 255, 255], type='', swap=''), + 78: + dict( + name='face-52', + id=78, + color=[255, 255, 255], + type='', + swap='face-50'), + 79: + dict( + name='face-53', + id=79, + color=[255, 255, 255], + type='', + swap='face-49'), + 80: + dict( + name='face-54', + id=80, + color=[255, 255, 255], + type='', + swap='face-48'), + 81: + dict( + name='face-55', + id=81, + color=[255, 255, 255], + type='', + swap='face-59'), + 82: + dict( + name='face-56', + id=82, + color=[255, 255, 255], + type='', + swap='face-58'), + 83: + dict(name='face-57', id=83, color=[255, 255, 255], type='', swap=''), + 84: + dict( + name='face-58', + id=84, + color=[255, 255, 255], + type='', + swap='face-56'), + 85: + dict( + name='face-59', + id=85, + color=[255, 255, 255], + type='', + swap='face-55'), + 86: + dict( + name='face-60', + id=86, + color=[255, 255, 255], + type='', + swap='face-64'), + 87: + dict( + name='face-61', + id=87, + color=[255, 255, 255], + type='', + swap='face-63'), + 88: + dict(name='face-62', id=88, color=[255, 255, 255], type='', swap=''), + 89: + dict( + name='face-63', + id=89, + color=[255, 255, 255], + type='', + swap='face-61'), + 90: + dict( + name='face-64', + id=90, + color=[255, 255, 255], + type='', + swap='face-60'), + 91: + dict( + name='face-65', + id=91, + color=[255, 255, 255], + type='', + swap='face-67'), + 92: + dict(name='face-66', id=92, color=[255, 255, 255], type='', swap=''), + 93: + dict( + name='face-67', + id=93, + color=[255, 255, 255], + type='', + swap='face-65'), + 94: + dict( + name='left_hand_root', + id=94, + color=[255, 255, 255], + type='', + swap='right_hand_root'), + 95: + dict( + name='left_thumb1', + id=95, + color=[255, 128, 0], + type='', + swap='right_thumb1'), + 96: + dict( + name='left_thumb2', + id=96, + color=[255, 128, 0], + type='', + swap='right_thumb2'), + 97: + dict( + name='left_thumb3', + id=97, + color=[255, 128, 0], + type='', + swap='right_thumb3'), + 98: + dict( + name='left_thumb4', + id=98, + 
color=[255, 128, 0], + type='', + swap='right_thumb4'), + 99: + dict( + name='left_forefinger1', + id=99, + color=[255, 153, 255], + type='', + swap='right_forefinger1'), + 100: + dict( + name='left_forefinger2', + id=100, + color=[255, 153, 255], + type='', + swap='right_forefinger2'), + 101: + dict( + name='left_forefinger3', + id=101, + color=[255, 153, 255], + type='', + swap='right_forefinger3'), + 102: + dict( + name='left_forefinger4', + id=102, + color=[255, 153, 255], + type='', + swap='right_forefinger4'), + 103: + dict( + name='left_middle_finger1', + id=103, + color=[102, 178, 255], + type='', + swap='right_middle_finger1'), + 104: + dict( + name='left_middle_finger2', + id=104, + color=[102, 178, 255], + type='', + swap='right_middle_finger2'), + 105: + dict( + name='left_middle_finger3', + id=105, + color=[102, 178, 255], + type='', + swap='right_middle_finger3'), + 106: + dict( + name='left_middle_finger4', + id=106, + color=[102, 178, 255], + type='', + swap='right_middle_finger4'), + 107: + dict( + name='left_ring_finger1', + id=107, + color=[255, 51, 51], + type='', + swap='right_ring_finger1'), + 108: + dict( + name='left_ring_finger2', + id=108, + color=[255, 51, 51], + type='', + swap='right_ring_finger2'), + 109: + dict( + name='left_ring_finger3', + id=109, + color=[255, 51, 51], + type='', + swap='right_ring_finger3'), + 110: + dict( + name='left_ring_finger4', + id=110, + color=[255, 51, 51], + type='', + swap='right_ring_finger4'), + 111: + dict( + name='left_pinky_finger1', + id=111, + color=[0, 255, 0], + type='', + swap='right_pinky_finger1'), + 112: + dict( + name='left_pinky_finger2', + id=112, + color=[0, 255, 0], + type='', + swap='right_pinky_finger2'), + 113: + dict( + name='left_pinky_finger3', + id=113, + color=[0, 255, 0], + type='', + swap='right_pinky_finger3'), + 114: + dict( + name='left_pinky_finger4', + id=114, + color=[0, 255, 0], + type='', + swap='right_pinky_finger4'), + 115: + dict( + name='right_hand_root', + id=115, + color=[255, 255, 255], + type='', + swap='left_hand_root'), + 116: + dict( + name='right_thumb1', + id=116, + color=[255, 128, 0], + type='', + swap='left_thumb1'), + 117: + dict( + name='right_thumb2', + id=117, + color=[255, 128, 0], + type='', + swap='left_thumb2'), + 118: + dict( + name='right_thumb3', + id=118, + color=[255, 128, 0], + type='', + swap='left_thumb3'), + 119: + dict( + name='right_thumb4', + id=119, + color=[255, 128, 0], + type='', + swap='left_thumb4'), + 120: + dict( + name='right_forefinger1', + id=120, + color=[255, 153, 255], + type='', + swap='left_forefinger1'), + 121: + dict( + name='right_forefinger2', + id=121, + color=[255, 153, 255], + type='', + swap='left_forefinger2'), + 122: + dict( + name='right_forefinger3', + id=122, + color=[255, 153, 255], + type='', + swap='left_forefinger3'), + 123: + dict( + name='right_forefinger4', + id=123, + color=[255, 153, 255], + type='', + swap='left_forefinger4'), + 124: + dict( + name='right_middle_finger1', + id=124, + color=[102, 178, 255], + type='', + swap='left_middle_finger1'), + 125: + dict( + name='right_middle_finger2', + id=125, + color=[102, 178, 255], + type='', + swap='left_middle_finger2'), + 126: + dict( + name='right_middle_finger3', + id=126, + color=[102, 178, 255], + type='', + swap='left_middle_finger3'), + 127: + dict( + name='right_middle_finger4', + id=127, + color=[102, 178, 255], + type='', + swap='left_middle_finger4'), + 128: + dict( + name='right_ring_finger1', + id=128, + color=[255, 51, 51], + type='', + 
swap='left_ring_finger1'), + 129: + dict( + name='right_ring_finger2', + id=129, + color=[255, 51, 51], + type='', + swap='left_ring_finger2'), + 130: + dict( + name='right_ring_finger3', + id=130, + color=[255, 51, 51], + type='', + swap='left_ring_finger3'), + 131: + dict( + name='right_ring_finger4', + id=131, + color=[255, 51, 51], + type='', + swap='left_ring_finger4'), + 132: + dict( + name='right_pinky_finger1', + id=132, + color=[0, 255, 0], + type='', + swap='left_pinky_finger1'), + 133: + dict( + name='right_pinky_finger2', + id=133, + color=[0, 255, 0], + type='', + swap='left_pinky_finger2'), + 134: + dict( + name='right_pinky_finger3', + id=134, + color=[0, 255, 0], + type='', + swap='left_pinky_finger3'), + 135: + dict( + name='right_pinky_finger4', + id=135, + color=[0, 255, 0], + type='', + swap='left_pinky_finger4') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('left_hip', 'hip'), id=2, color=[0, 255, 0]), + 3: + dict(link=('right_ankle', 'right_knee'), id=3, color=[255, 128, 0]), + 4: + dict(link=('right_knee', 'right_hip'), id=4, color=[255, 128, 0]), + 5: + dict(link=('right_hip', 'hip'), id=5, color=[255, 128, 0]), + 6: + dict(link=('head', 'neck'), id=6, color=[51, 153, 255]), + 7: + dict(link=('neck', 'hip'), id=7, color=[51, 153, 255]), + 8: + dict(link=('neck', 'left_shoulder'), id=8, color=[0, 255, 0]), + 9: + dict(link=('left_shoulder', 'left_elbow'), id=9, color=[0, 255, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('neck', 'right_shoulder'), id=11, color=[255, 128, 0]), + 12: + dict( + link=('right_shoulder', 'right_elbow'), id=12, color=[255, 128, + 0]), + 13: + dict(link=('right_elbow', 'right_wrist'), id=13, color=[255, 128, 0]), + 14: + dict(link=('left_eye', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('nose', 'left_eye'), id=15, color=[51, 153, 255]), + 16: + dict(link=('nose', 'right_eye'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_eye', 'left_ear'), id=17, color=[51, 153, 255]), + 18: + dict(link=('right_eye', 'right_ear'), id=18, color=[51, 153, 255]), + 19: + dict(link=('left_ear', 'left_shoulder'), id=19, color=[51, 153, 255]), + 20: + dict( + link=('right_ear', 'right_shoulder'), id=20, color=[51, 153, 255]), + 21: + dict(link=('left_ankle', 'left_big_toe'), id=21, color=[0, 255, 0]), + 22: + dict(link=('left_ankle', 'left_small_toe'), id=22, color=[0, 255, 0]), + 23: + dict(link=('left_ankle', 'left_heel'), id=23, color=[0, 255, 0]), + 24: + dict( + link=('right_ankle', 'right_big_toe'), id=24, color=[255, 128, 0]), + 25: + dict( + link=('right_ankle', 'right_small_toe'), + id=25, + color=[255, 128, 0]), + 26: + dict(link=('right_ankle', 'right_heel'), id=26, color=[255, 128, 0]), + 27: + dict(link=('left_wrist', 'left_thumb1'), id=27, color=[255, 128, 0]), + 28: + dict(link=('left_thumb1', 'left_thumb2'), id=28, color=[255, 128, 0]), + 29: + dict(link=('left_thumb2', 'left_thumb3'), id=29, color=[255, 128, 0]), + 30: + dict(link=('left_thumb3', 'left_thumb4'), id=30, color=[255, 128, 0]), + 31: + dict( + link=('left_wrist', 'left_forefinger1'), + id=31, + color=[255, 153, 255]), + 32: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=32, + color=[255, 153, 255]), + 33: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=33, + color=[255, 153, 255]), + 34: + dict( + link=('left_forefinger3', 
'left_forefinger4'), + id=34, + color=[255, 153, 255]), + 35: + dict( + link=('left_wrist', 'left_middle_finger1'), + id=35, + color=[102, 178, 255]), + 36: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=36, + color=[102, 178, 255]), + 37: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=37, + color=[102, 178, 255]), + 38: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=38, + color=[102, 178, 255]), + 39: + dict( + link=('left_wrist', 'left_ring_finger1'), + id=39, + color=[255, 51, 51]), + 40: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=40, + color=[255, 51, 51]), + 41: + dict( + link=('left_ring_finger2', 'left_ring_finger3'), + id=41, + color=[255, 51, 51]), + 42: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=42, + color=[255, 51, 51]), + 43: + dict( + link=('left_wrist', 'left_pinky_finger1'), + id=43, + color=[0, 255, 0]), + 44: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=44, + color=[0, 255, 0]), + 45: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=45, + color=[0, 255, 0]), + 46: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=46, + color=[0, 255, 0]), + 47: + dict(link=('right_wrist', 'right_thumb1'), id=47, color=[255, 128, 0]), + 48: + dict( + link=('right_thumb1', 'right_thumb2'), id=48, color=[255, 128, 0]), + 49: + dict( + link=('right_thumb2', 'right_thumb3'), id=49, color=[255, 128, 0]), + 50: + dict( + link=('right_thumb3', 'right_thumb4'), id=50, color=[255, 128, 0]), + 51: + dict( + link=('right_wrist', 'right_forefinger1'), + id=51, + color=[255, 153, 255]), + 52: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=52, + color=[255, 153, 255]), + 53: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=53, + color=[255, 153, 255]), + 54: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=54, + color=[255, 153, 255]), + 55: + dict( + link=('right_wrist', 'right_middle_finger1'), + id=55, + color=[102, 178, 255]), + 56: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=56, + color=[102, 178, 255]), + 57: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=57, + color=[102, 178, 255]), + 58: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=58, + color=[102, 178, 255]), + 59: + dict( + link=('right_wrist', 'right_ring_finger1'), + id=59, + color=[255, 51, 51]), + 60: + dict( + link=('right_ring_finger1', 'right_ring_finger2'), + id=60, + color=[255, 51, 51]), + 61: + dict( + link=('right_ring_finger2', 'right_ring_finger3'), + id=61, + color=[255, 51, 51]), + 62: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=62, + color=[255, 51, 51]), + 63: + dict( + link=('right_wrist', 'right_pinky_finger1'), + id=63, + color=[0, 255, 0]), + 64: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=64, + color=[0, 255, 0]), + 65: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=65, + color=[0, 255, 0]), + 66: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=66, + color=[0, 255, 0]) + }, + joint_weights=[1.] 
* 136, + + # 'https://github.com/Fang-Haoshu/Halpe-FullBody/blob/master/' + # 'HalpeCOCOAPI/PythonAPI/halpecocotools/cocoeval.py#L245' + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.08, 0.08, 0.08, + 0.089, 0.089, 0.089, 0.089, 0.089, 0.089, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015 + ]) diff --git a/configs/_base_/datasets/horse10.py b/configs/_base_/datasets/horse10.py new file mode 100644 index 0000000..a485bf1 --- /dev/null +++ b/configs/_base_/datasets/horse10.py @@ -0,0 +1,201 @@ +dataset_info = dict( + dataset_name='horse10', + paper_info=dict( + author='Mathis, Alexander and Biasi, Thomas and ' + 'Schneider, Steffen and ' + 'Yuksekgonul, Mert and Rogers, Byron and ' + 'Bethge, Matthias and ' + 'Mathis, Mackenzie W', + title='Pretraining boosts out-of-domain robustness ' + 'for pose estimation', + container='Proceedings of the IEEE/CVF Winter Conference on ' + 'Applications of Computer Vision', + year='2021', + homepage='http://www.mackenziemathislab.org/horse10', + ), + keypoint_info={ + 0: + dict(name='Nose', id=0, color=[255, 153, 255], type='upper', swap=''), + 1: + dict(name='Eye', id=1, color=[255, 153, 255], type='upper', swap=''), + 2: + dict( + name='Nearknee', + id=2, + color=[255, 102, 255], + type='upper', + swap=''), + 3: + dict( + name='Nearfrontfetlock', + id=3, + color=[255, 102, 255], + type='upper', + swap=''), + 4: + dict( + name='Nearfrontfoot', + id=4, + color=[255, 102, 255], + type='upper', + swap=''), + 5: + dict( + name='Offknee', id=5, color=[255, 102, 255], type='upper', + swap=''), + 6: + dict( + name='Offfrontfetlock', + id=6, + color=[255, 102, 255], + type='upper', + swap=''), + 7: + dict( + name='Offfrontfoot', + id=7, + color=[255, 102, 255], + type='upper', + swap=''), + 8: + dict( + name='Shoulder', + id=8, + color=[255, 153, 255], + type='upper', + swap=''), + 9: + dict( + name='Midshoulder', + id=9, + color=[255, 153, 255], + type='upper', + swap=''), + 10: + dict( + name='Elbow', id=10, color=[255, 153, 255], type='upper', swap=''), + 11: + dict( + name='Girth', id=11, color=[255, 153, 255], type='upper', swap=''), + 12: + dict( + name='Wither', id=12, color=[255, 153, 255], type='upper', + swap=''), + 13: + dict( + name='Nearhindhock', + id=13, + color=[255, 51, 255], + type='lower', + swap=''), + 14: + dict( + name='Nearhindfetlock', + id=14, + color=[255, 51, 255], + type='lower', + swap=''), + 15: + dict( + name='Nearhindfoot', + id=15, + color=[255, 51, 255], + type='lower', + swap=''), + 16: + dict(name='Hip', id=16, color=[255, 153, 255], type='lower', swap=''), + 17: + dict( + name='Stifle', id=17, color=[255, 153, 255], type='lower', + swap=''), + 18: + dict( + name='Offhindhock', + id=18, + color=[255, 51, 255], + 
type='lower', + swap=''), + 19: + dict( + name='Offhindfetlock', + id=19, + color=[255, 51, 255], + type='lower', + swap=''), + 20: + dict( + name='Offhindfoot', + id=20, + color=[255, 51, 255], + type='lower', + swap=''), + 21: + dict( + name='Ischium', + id=21, + color=[255, 153, 255], + type='lower', + swap='') + }, + skeleton_info={ + 0: + dict(link=('Nose', 'Eye'), id=0, color=[255, 153, 255]), + 1: + dict(link=('Eye', 'Wither'), id=1, color=[255, 153, 255]), + 2: + dict(link=('Wither', 'Hip'), id=2, color=[255, 153, 255]), + 3: + dict(link=('Hip', 'Ischium'), id=3, color=[255, 153, 255]), + 4: + dict(link=('Ischium', 'Stifle'), id=4, color=[255, 153, 255]), + 5: + dict(link=('Stifle', 'Girth'), id=5, color=[255, 153, 255]), + 6: + dict(link=('Girth', 'Elbow'), id=6, color=[255, 153, 255]), + 7: + dict(link=('Elbow', 'Shoulder'), id=7, color=[255, 153, 255]), + 8: + dict(link=('Shoulder', 'Midshoulder'), id=8, color=[255, 153, 255]), + 9: + dict(link=('Midshoulder', 'Wither'), id=9, color=[255, 153, 255]), + 10: + dict( + link=('Nearknee', 'Nearfrontfetlock'), + id=10, + color=[255, 102, 255]), + 11: + dict( + link=('Nearfrontfetlock', 'Nearfrontfoot'), + id=11, + color=[255, 102, 255]), + 12: + dict( + link=('Offknee', 'Offfrontfetlock'), id=12, color=[255, 102, 255]), + 13: + dict( + link=('Offfrontfetlock', 'Offfrontfoot'), + id=13, + color=[255, 102, 255]), + 14: + dict( + link=('Nearhindhock', 'Nearhindfetlock'), + id=14, + color=[255, 51, 255]), + 15: + dict( + link=('Nearhindfetlock', 'Nearhindfoot'), + id=15, + color=[255, 51, 255]), + 16: + dict( + link=('Offhindhock', 'Offhindfetlock'), + id=16, + color=[255, 51, 255]), + 17: + dict( + link=('Offhindfetlock', 'Offhindfoot'), + id=17, + color=[255, 51, 255]) + }, + joint_weights=[1.] * 22, + sigmas=[]) diff --git a/configs/_base_/datasets/interhand2d.py b/configs/_base_/datasets/interhand2d.py new file mode 100644 index 0000000..0134f07 --- /dev/null +++ b/configs/_base_/datasets/interhand2d.py @@ -0,0 +1,142 @@ +dataset_info = dict( + dataset_name='interhand2d', + paper_info=dict( + author='Moon, Gyeongsik and Yu, Shoou-I and Wen, He and ' + 'Shiratori, Takaaki and Lee, Kyoung Mu', + title='InterHand2.6M: A dataset and baseline for 3D ' + 'interacting hand pose estimation from a single RGB image', + container='arXiv', + year='2020', + homepage='https://mks0601.github.io/InterHand2.6M/', + ), + keypoint_info={ + 0: + dict(name='thumb4', id=0, color=[255, 128, 0], type='', swap=''), + 1: + dict(name='thumb3', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb1', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict( + name='forefinger4', id=4, color=[255, 153, 255], type='', swap=''), + 5: + dict( + name='forefinger3', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger1', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='middle_finger4', + id=8, + color=[102, 178, 255], + type='', + swap=''), + 9: + dict( + name='middle_finger3', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger1', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='ring_finger4', id=12, color=[255, 51, 51], type='', swap=''), + 13: + dict( + name='ring_finger3', id=13, 
color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger1', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict(name='pinky_finger4', id=16, color=[0, 255, 0], type='', swap=''), + 17: + dict(name='pinky_finger3', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger1', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='wrist', id=20, color=[255, 255, 255], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] 
* 21, + sigmas=[]) diff --git a/configs/_base_/datasets/interhand3d.py b/configs/_base_/datasets/interhand3d.py new file mode 100644 index 0000000..e2bd812 --- /dev/null +++ b/configs/_base_/datasets/interhand3d.py @@ -0,0 +1,487 @@ +dataset_info = dict( + dataset_name='interhand3d', + paper_info=dict( + author='Moon, Gyeongsik and Yu, Shoou-I and Wen, He and ' + 'Shiratori, Takaaki and Lee, Kyoung Mu', + title='InterHand2.6M: A dataset and baseline for 3D ' + 'interacting hand pose estimation from a single RGB image', + container='arXiv', + year='2020', + homepage='https://mks0601.github.io/InterHand2.6M/', + ), + keypoint_info={ + 0: + dict( + name='right_thumb4', + id=0, + color=[255, 128, 0], + type='', + swap='left_thumb4'), + 1: + dict( + name='right_thumb3', + id=1, + color=[255, 128, 0], + type='', + swap='left_thumb3'), + 2: + dict( + name='right_thumb2', + id=2, + color=[255, 128, 0], + type='', + swap='left_thumb2'), + 3: + dict( + name='right_thumb1', + id=3, + color=[255, 128, 0], + type='', + swap='left_thumb1'), + 4: + dict( + name='right_forefinger4', + id=4, + color=[255, 153, 255], + type='', + swap='left_forefinger4'), + 5: + dict( + name='right_forefinger3', + id=5, + color=[255, 153, 255], + type='', + swap='left_forefinger3'), + 6: + dict( + name='right_forefinger2', + id=6, + color=[255, 153, 255], + type='', + swap='left_forefinger2'), + 7: + dict( + name='right_forefinger1', + id=7, + color=[255, 153, 255], + type='', + swap='left_forefinger1'), + 8: + dict( + name='right_middle_finger4', + id=8, + color=[102, 178, 255], + type='', + swap='left_middle_finger4'), + 9: + dict( + name='right_middle_finger3', + id=9, + color=[102, 178, 255], + type='', + swap='left_middle_finger3'), + 10: + dict( + name='right_middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap='left_middle_finger2'), + 11: + dict( + name='right_middle_finger1', + id=11, + color=[102, 178, 255], + type='', + swap='left_middle_finger1'), + 12: + dict( + name='right_ring_finger4', + id=12, + color=[255, 51, 51], + type='', + swap='left_ring_finger4'), + 13: + dict( + name='right_ring_finger3', + id=13, + color=[255, 51, 51], + type='', + swap='left_ring_finger3'), + 14: + dict( + name='right_ring_finger2', + id=14, + color=[255, 51, 51], + type='', + swap='left_ring_finger2'), + 15: + dict( + name='right_ring_finger1', + id=15, + color=[255, 51, 51], + type='', + swap='left_ring_finger1'), + 16: + dict( + name='right_pinky_finger4', + id=16, + color=[0, 255, 0], + type='', + swap='left_pinky_finger4'), + 17: + dict( + name='right_pinky_finger3', + id=17, + color=[0, 255, 0], + type='', + swap='left_pinky_finger3'), + 18: + dict( + name='right_pinky_finger2', + id=18, + color=[0, 255, 0], + type='', + swap='left_pinky_finger2'), + 19: + dict( + name='right_pinky_finger1', + id=19, + color=[0, 255, 0], + type='', + swap='left_pinky_finger1'), + 20: + dict( + name='right_wrist', + id=20, + color=[255, 255, 255], + type='', + swap='left_wrist'), + 21: + dict( + name='left_thumb4', + id=21, + color=[255, 128, 0], + type='', + swap='right_thumb4'), + 22: + dict( + name='left_thumb3', + id=22, + color=[255, 128, 0], + type='', + swap='right_thumb3'), + 23: + dict( + name='left_thumb2', + id=23, + color=[255, 128, 0], + type='', + swap='right_thumb2'), + 24: + dict( + name='left_thumb1', + id=24, + color=[255, 128, 0], + type='', + swap='right_thumb1'), + 25: + dict( + name='left_forefinger4', + id=25, + color=[255, 153, 255], + type='', + swap='right_forefinger4'), + 26: + dict( + 
name='left_forefinger3', + id=26, + color=[255, 153, 255], + type='', + swap='right_forefinger3'), + 27: + dict( + name='left_forefinger2', + id=27, + color=[255, 153, 255], + type='', + swap='right_forefinger2'), + 28: + dict( + name='left_forefinger1', + id=28, + color=[255, 153, 255], + type='', + swap='right_forefinger1'), + 29: + dict( + name='left_middle_finger4', + id=29, + color=[102, 178, 255], + type='', + swap='right_middle_finger4'), + 30: + dict( + name='left_middle_finger3', + id=30, + color=[102, 178, 255], + type='', + swap='right_middle_finger3'), + 31: + dict( + name='left_middle_finger2', + id=31, + color=[102, 178, 255], + type='', + swap='right_middle_finger2'), + 32: + dict( + name='left_middle_finger1', + id=32, + color=[102, 178, 255], + type='', + swap='right_middle_finger1'), + 33: + dict( + name='left_ring_finger4', + id=33, + color=[255, 51, 51], + type='', + swap='right_ring_finger4'), + 34: + dict( + name='left_ring_finger3', + id=34, + color=[255, 51, 51], + type='', + swap='right_ring_finger3'), + 35: + dict( + name='left_ring_finger2', + id=35, + color=[255, 51, 51], + type='', + swap='right_ring_finger2'), + 36: + dict( + name='left_ring_finger1', + id=36, + color=[255, 51, 51], + type='', + swap='right_ring_finger1'), + 37: + dict( + name='left_pinky_finger4', + id=37, + color=[0, 255, 0], + type='', + swap='right_pinky_finger4'), + 38: + dict( + name='left_pinky_finger3', + id=38, + color=[0, 255, 0], + type='', + swap='right_pinky_finger3'), + 39: + dict( + name='left_pinky_finger2', + id=39, + color=[0, 255, 0], + type='', + swap='right_pinky_finger2'), + 40: + dict( + name='left_pinky_finger1', + id=40, + color=[0, 255, 0], + type='', + swap='right_pinky_finger1'), + 41: + dict( + name='left_wrist', + id=41, + color=[255, 255, 255], + type='', + swap='right_wrist'), + }, + skeleton_info={ + 0: + dict(link=('right_wrist', 'right_thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_thumb1', 'right_thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_thumb2', 'right_thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_thumb3', 'right_thumb4'), id=3, color=[255, 128, 0]), + 4: + dict( + link=('right_wrist', 'right_forefinger1'), + id=4, + color=[255, 153, 255]), + 5: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=5, + color=[255, 153, 255]), + 6: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=6, + color=[255, 153, 255]), + 7: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=7, + color=[255, 153, 255]), + 8: + dict( + link=('right_wrist', 'right_middle_finger1'), + id=8, + color=[102, 178, 255]), + 9: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict( + link=('right_wrist', 'right_ring_finger1'), + id=12, + color=[255, 51, 51]), + 13: + dict( + link=('right_ring_finger1', 'right_ring_finger2'), + id=13, + color=[255, 51, 51]), + 14: + dict( + link=('right_ring_finger2', 'right_ring_finger3'), + id=14, + color=[255, 51, 51]), + 15: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=15, + color=[255, 51, 51]), + 16: + dict( + link=('right_wrist', 'right_pinky_finger1'), + id=16, + color=[0, 255, 0]), + 17: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=17, + color=[0, 255, 
0]), + 18: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=18, + color=[0, 255, 0]), + 19: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=19, + color=[0, 255, 0]), + 20: + dict(link=('left_wrist', 'left_thumb1'), id=20, color=[255, 128, 0]), + 21: + dict(link=('left_thumb1', 'left_thumb2'), id=21, color=[255, 128, 0]), + 22: + dict(link=('left_thumb2', 'left_thumb3'), id=22, color=[255, 128, 0]), + 23: + dict(link=('left_thumb3', 'left_thumb4'), id=23, color=[255, 128, 0]), + 24: + dict( + link=('left_wrist', 'left_forefinger1'), + id=24, + color=[255, 153, 255]), + 25: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=25, + color=[255, 153, 255]), + 26: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=26, + color=[255, 153, 255]), + 27: + dict( + link=('left_forefinger3', 'left_forefinger4'), + id=27, + color=[255, 153, 255]), + 28: + dict( + link=('left_wrist', 'left_middle_finger1'), + id=28, + color=[102, 178, 255]), + 29: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=29, + color=[102, 178, 255]), + 30: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=30, + color=[102, 178, 255]), + 31: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=31, + color=[102, 178, 255]), + 32: + dict( + link=('left_wrist', 'left_ring_finger1'), + id=32, + color=[255, 51, 51]), + 33: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=33, + color=[255, 51, 51]), + 34: + dict( + link=('left_ring_finger2', 'left_ring_finger3'), + id=34, + color=[255, 51, 51]), + 35: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=35, + color=[255, 51, 51]), + 36: + dict( + link=('left_wrist', 'left_pinky_finger1'), + id=36, + color=[0, 255, 0]), + 37: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=37, + color=[0, 255, 0]), + 38: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=38, + color=[0, 255, 0]), + 39: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=39, + color=[0, 255, 0]), + }, + joint_weights=[1.] * 42, + sigmas=[]) diff --git a/configs/_base_/datasets/jhmdb.py b/configs/_base_/datasets/jhmdb.py new file mode 100644 index 0000000..1b37488 --- /dev/null +++ b/configs/_base_/datasets/jhmdb.py @@ -0,0 +1,129 @@ +dataset_info = dict( + dataset_name='jhmdb', + paper_info=dict( + author='H. Jhuang and J. Gall and S. Zuffi and ' + 'C. Schmid and M. J. Black', + title='Towards understanding action recognition', + container='International Conf. 
on Computer Vision (ICCV)', + year='2013', + homepage='http://jhmdb.is.tue.mpg.de/dataset', + ), + keypoint_info={ + 0: + dict(name='neck', id=0, color=[255, 128, 0], type='upper', swap=''), + 1: + dict(name='belly', id=1, color=[255, 128, 0], type='upper', swap=''), + 2: + dict(name='head', id=2, color=[255, 128, 0], type='upper', swap=''), + 3: + dict( + name='right_shoulder', + id=3, + color=[0, 255, 0], + type='upper', + swap='left_shoulder'), + 4: + dict( + name='left_shoulder', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 5: + dict( + name='right_hip', + id=5, + color=[0, 255, 0], + type='lower', + swap='left_hip'), + 6: + dict( + name='left_hip', + id=6, + color=[51, 153, 255], + type='lower', + swap='right_hip'), + 7: + dict( + name='right_elbow', + id=7, + color=[51, 153, 255], + type='upper', + swap='left_elbow'), + 8: + dict( + name='left_elbow', + id=8, + color=[51, 153, 255], + type='upper', + swap='right_elbow'), + 9: + dict( + name='right_knee', + id=9, + color=[51, 153, 255], + type='lower', + swap='left_knee'), + 10: + dict( + name='left_knee', + id=10, + color=[255, 128, 0], + type='lower', + swap='right_knee'), + 11: + dict( + name='right_wrist', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 12: + dict( + name='left_wrist', + id=12, + color=[255, 128, 0], + type='upper', + swap='right_wrist'), + 13: + dict( + name='right_ankle', + id=13, + color=[0, 255, 0], + type='lower', + swap='left_ankle'), + 14: + dict( + name='left_ankle', + id=14, + color=[0, 255, 0], + type='lower', + swap='right_ankle') + }, + skeleton_info={ + 0: dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: dict(link=('right_hip', 'belly'), id=2, color=[255, 128, 0]), + 3: dict(link=('belly', 'left_hip'), id=3, color=[0, 255, 0]), + 4: dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]), + 5: dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]), + 6: dict(link=('belly', 'neck'), id=6, color=[51, 153, 255]), + 7: dict(link=('neck', 'head'), id=7, color=[51, 153, 255]), + 8: dict(link=('neck', 'right_shoulder'), id=8, color=[255, 128, 0]), + 9: dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('right_elbow', 'right_wrist'), id=10, color=[255, 128, 0]), + 11: dict(link=('neck', 'left_shoulder'), id=11, color=[0, 255, 0]), + 12: + dict(link=('left_shoulder', 'left_elbow'), id=12, color=[0, 255, 0]), + 13: dict(link=('left_elbow', 'left_wrist'), id=13, color=[0, 255, 0]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.2, 1.2, 1.5, 1.5, 1.5, 1.5 + ], + # Adapted from COCO dataset. 
+ sigmas=[ + 0.025, 0.107, 0.025, 0.079, 0.079, 0.107, 0.107, 0.072, 0.072, 0.087, + 0.087, 0.062, 0.062, 0.089, 0.089 + ]) diff --git a/configs/_base_/datasets/locust.py b/configs/_base_/datasets/locust.py new file mode 100644 index 0000000..db3fa15 --- /dev/null +++ b/configs/_base_/datasets/locust.py @@ -0,0 +1,263 @@ +dataset_info = dict( + dataset_name='locust', + paper_info=dict( + author='Graving, Jacob M and Chae, Daniel and Naik, Hemal and ' + 'Li, Liang and Koger, Benjamin and Costelloe, Blair R and ' + 'Couzin, Iain D', + title='DeepPoseKit, a software toolkit for fast and robust ' + 'animal pose estimation using deep learning', + container='Elife', + year='2019', + homepage='https://github.com/jgraving/DeepPoseKit-Data', + ), + keypoint_info={ + 0: + dict(name='head', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='neck', id=1, color=[255, 255, 255], type='', swap=''), + 2: + dict(name='thorax', id=2, color=[255, 255, 255], type='', swap=''), + 3: + dict(name='abdomen1', id=3, color=[255, 255, 255], type='', swap=''), + 4: + dict(name='abdomen2', id=4, color=[255, 255, 255], type='', swap=''), + 5: + dict( + name='anttipL', + id=5, + color=[255, 255, 255], + type='', + swap='anttipR'), + 6: + dict( + name='antbaseL', + id=6, + color=[255, 255, 255], + type='', + swap='antbaseR'), + 7: + dict(name='eyeL', id=7, color=[255, 255, 255], type='', swap='eyeR'), + 8: + dict( + name='forelegL1', + id=8, + color=[255, 255, 255], + type='', + swap='forelegR1'), + 9: + dict( + name='forelegL2', + id=9, + color=[255, 255, 255], + type='', + swap='forelegR2'), + 10: + dict( + name='forelegL3', + id=10, + color=[255, 255, 255], + type='', + swap='forelegR3'), + 11: + dict( + name='forelegL4', + id=11, + color=[255, 255, 255], + type='', + swap='forelegR4'), + 12: + dict( + name='midlegL1', + id=12, + color=[255, 255, 255], + type='', + swap='midlegR1'), + 13: + dict( + name='midlegL2', + id=13, + color=[255, 255, 255], + type='', + swap='midlegR2'), + 14: + dict( + name='midlegL3', + id=14, + color=[255, 255, 255], + type='', + swap='midlegR3'), + 15: + dict( + name='midlegL4', + id=15, + color=[255, 255, 255], + type='', + swap='midlegR4'), + 16: + dict( + name='hindlegL1', + id=16, + color=[255, 255, 255], + type='', + swap='hindlegR1'), + 17: + dict( + name='hindlegL2', + id=17, + color=[255, 255, 255], + type='', + swap='hindlegR2'), + 18: + dict( + name='hindlegL3', + id=18, + color=[255, 255, 255], + type='', + swap='hindlegR3'), + 19: + dict( + name='hindlegL4', + id=19, + color=[255, 255, 255], + type='', + swap='hindlegR4'), + 20: + dict( + name='anttipR', + id=20, + color=[255, 255, 255], + type='', + swap='anttipL'), + 21: + dict( + name='antbaseR', + id=21, + color=[255, 255, 255], + type='', + swap='antbaseL'), + 22: + dict(name='eyeR', id=22, color=[255, 255, 255], type='', swap='eyeL'), + 23: + dict( + name='forelegR1', + id=23, + color=[255, 255, 255], + type='', + swap='forelegL1'), + 24: + dict( + name='forelegR2', + id=24, + color=[255, 255, 255], + type='', + swap='forelegL2'), + 25: + dict( + name='forelegR3', + id=25, + color=[255, 255, 255], + type='', + swap='forelegL3'), + 26: + dict( + name='forelegR4', + id=26, + color=[255, 255, 255], + type='', + swap='forelegL4'), + 27: + dict( + name='midlegR1', + id=27, + color=[255, 255, 255], + type='', + swap='midlegL1'), + 28: + dict( + name='midlegR2', + id=28, + color=[255, 255, 255], + type='', + swap='midlegL2'), + 29: + dict( + name='midlegR3', + id=29, + color=[255, 255, 255], + type='', + 
swap='midlegL3'), + 30: + dict( + name='midlegR4', + id=30, + color=[255, 255, 255], + type='', + swap='midlegL4'), + 31: + dict( + name='hindlegR1', + id=31, + color=[255, 255, 255], + type='', + swap='hindlegL1'), + 32: + dict( + name='hindlegR2', + id=32, + color=[255, 255, 255], + type='', + swap='hindlegL2'), + 33: + dict( + name='hindlegR3', + id=33, + color=[255, 255, 255], + type='', + swap='hindlegL3'), + 34: + dict( + name='hindlegR4', + id=34, + color=[255, 255, 255], + type='', + swap='hindlegL4') + }, + skeleton_info={ + 0: dict(link=('neck', 'head'), id=0, color=[255, 255, 255]), + 1: dict(link=('thorax', 'neck'), id=1, color=[255, 255, 255]), + 2: dict(link=('abdomen1', 'thorax'), id=2, color=[255, 255, 255]), + 3: dict(link=('abdomen2', 'abdomen1'), id=3, color=[255, 255, 255]), + 4: dict(link=('antbaseL', 'anttipL'), id=4, color=[255, 255, 255]), + 5: dict(link=('eyeL', 'antbaseL'), id=5, color=[255, 255, 255]), + 6: dict(link=('forelegL2', 'forelegL1'), id=6, color=[255, 255, 255]), + 7: dict(link=('forelegL3', 'forelegL2'), id=7, color=[255, 255, 255]), + 8: dict(link=('forelegL4', 'forelegL3'), id=8, color=[255, 255, 255]), + 9: dict(link=('midlegL2', 'midlegL1'), id=9, color=[255, 255, 255]), + 10: dict(link=('midlegL3', 'midlegL2'), id=10, color=[255, 255, 255]), + 11: dict(link=('midlegL4', 'midlegL3'), id=11, color=[255, 255, 255]), + 12: + dict(link=('hindlegL2', 'hindlegL1'), id=12, color=[255, 255, 255]), + 13: + dict(link=('hindlegL3', 'hindlegL2'), id=13, color=[255, 255, 255]), + 14: + dict(link=('hindlegL4', 'hindlegL3'), id=14, color=[255, 255, 255]), + 15: dict(link=('antbaseR', 'anttipR'), id=15, color=[255, 255, 255]), + 16: dict(link=('eyeR', 'antbaseR'), id=16, color=[255, 255, 255]), + 17: + dict(link=('forelegR2', 'forelegR1'), id=17, color=[255, 255, 255]), + 18: + dict(link=('forelegR3', 'forelegR2'), id=18, color=[255, 255, 255]), + 19: + dict(link=('forelegR4', 'forelegR3'), id=19, color=[255, 255, 255]), + 20: dict(link=('midlegR2', 'midlegR1'), id=20, color=[255, 255, 255]), + 21: dict(link=('midlegR3', 'midlegR2'), id=21, color=[255, 255, 255]), + 22: dict(link=('midlegR4', 'midlegR3'), id=22, color=[255, 255, 255]), + 23: + dict(link=('hindlegR2', 'hindlegR1'), id=23, color=[255, 255, 255]), + 24: + dict(link=('hindlegR3', 'hindlegR2'), id=24, color=[255, 255, 255]), + 25: + dict(link=('hindlegR4', 'hindlegR3'), id=25, color=[255, 255, 255]) + }, + joint_weights=[1.] 
* 35, + sigmas=[]) diff --git a/configs/_base_/datasets/macaque.py b/configs/_base_/datasets/macaque.py new file mode 100644 index 0000000..ea8dac2 --- /dev/null +++ b/configs/_base_/datasets/macaque.py @@ -0,0 +1,183 @@ +dataset_info = dict( + dataset_name='macaque', + paper_info=dict( + author='Labuguen, Rollyn and Matsumoto, Jumpei and ' + 'Negrete, Salvador and Nishimaru, Hiroshi and ' + 'Nishijo, Hisao and Takada, Masahiko and ' + 'Go, Yasuhiro and Inoue, Ken-ichi and Shibata, Tomohiro', + title='MacaquePose: A novel "in the wild" macaque monkey pose dataset ' + 'for markerless motion capture', + container='bioRxiv', + year='2020', + homepage='http://www.pri.kyoto-u.ac.jp/datasets/' + 'macaquepose/index.html', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, 
color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/configs/_base_/datasets/mhp.py b/configs/_base_/datasets/mhp.py new file mode 100644 index 0000000..e16e37c --- /dev/null +++ b/configs/_base_/datasets/mhp.py @@ -0,0 +1,156 @@ +dataset_info = dict( + dataset_name='mhp', + paper_info=dict( + author='Zhao, Jian and Li, Jianshu and Cheng, Yu and ' + 'Sim, Terence and Yan, Shuicheng and Feng, Jiashi', + title='Understanding humans in crowded scenes: ' + 'Deep nested adversarial learning and a ' + 'new benchmark for multi-human parsing', + container='Proceedings of the 26th ACM ' + 'international conference on Multimedia', + year='2018', + homepage='https://lv-mhp.github.io/dataset', + ), + keypoint_info={ + 0: + dict( + name='right_ankle', + id=0, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 1: + dict( + name='right_knee', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 2: + dict( + name='right_hip', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 3: + dict( + name='left_hip', + id=3, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 4: + dict( + name='left_knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 5: + dict( + name='left_ankle', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 6: + dict(name='pelvis', id=6, color=[51, 153, 255], type='lower', swap=''), + 7: + dict(name='thorax', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict( + name='upper_neck', + id=8, + color=[51, 153, 255], + type='upper', + swap=''), + 9: + dict( + name='head_top', id=9, color=[51, 153, 255], type='upper', + swap=''), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='right_elbow', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 12: + dict( + name='right_shoulder', + id=12, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 13: + dict( + name='left_shoulder', + id=13, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 14: + dict( + name='left_elbow', + id=14, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 15: + dict( + name='left_wrist', + id=15, + color=[0, 255, 0], + type='upper', + swap='right_wrist') + }, + skeleton_info={ + 0: + dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_hip', 'pelvis'), id=2, color=[255, 128, 0]), + 3: + dict(link=('pelvis', 'left_hip'), id=3, color=[0, 255, 0]), + 4: + dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]), + 5: + dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]), + 6: + dict(link=('pelvis', 'thorax'), id=6, color=[51, 153, 255]), + 7: + dict(link=('thorax', 'upper_neck'), id=7, color=[51, 153, 255]), + 8: + dict(link=('upper_neck', 'head_top'), id=8, 
color=[51, 153, 255]), + 9: + dict(link=('upper_neck', 'right_shoulder'), id=9, color=[255, 128, 0]), + 10: + dict( + link=('right_shoulder', 'right_elbow'), id=10, color=[255, 128, + 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('upper_neck', 'left_shoulder'), id=12, color=[0, 255, 0]), + 13: + dict(link=('left_shoulder', 'left_elbow'), id=13, color=[0, 255, 0]), + 14: + dict(link=('left_elbow', 'left_wrist'), id=14, color=[0, 255, 0]) + }, + joint_weights=[ + 1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5 + ], + # Adapted from COCO dataset. + sigmas=[ + 0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026, + 0.062, 0.072, 0.179, 0.179, 0.072, 0.062 + ]) diff --git a/configs/_base_/datasets/mpi_inf_3dhp.py b/configs/_base_/datasets/mpi_inf_3dhp.py new file mode 100644 index 0000000..ffd0a70 --- /dev/null +++ b/configs/_base_/datasets/mpi_inf_3dhp.py @@ -0,0 +1,132 @@ +dataset_info = dict( + dataset_name='mpi_inf_3dhp', + paper_info=dict( + author='ehta, Dushyant and Rhodin, Helge and Casas, Dan and ' + 'Fua, Pascal and Sotnychenko, Oleksandr and Xu, Weipeng and ' + 'Theobalt, Christian', + title='Monocular 3D Human Pose Estimation In The Wild Using Improved ' + 'CNN Supervision', + container='2017 international conference on 3D vision (3DV)', + year='2017', + homepage='http://gvv.mpi-inf.mpg.de/3dhp-dataset', + ), + keypoint_info={ + 0: + dict( + name='head_top', id=0, color=[51, 153, 255], type='upper', + swap=''), + 1: + dict(name='neck', id=1, color=[51, 153, 255], type='upper', swap=''), + 2: + dict( + name='right_shoulder', + id=2, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 3: + dict( + name='right_elbow', + id=3, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 4: + dict( + name='right_wrist', + id=4, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='left_elbow', + id=6, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 7: + dict( + name='left_wrist', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 8: + dict( + name='right_hip', + id=8, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 9: + dict( + name='right_knee', + id=9, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 10: + dict( + name='right_ankle', + id=10, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='left_knee', + id=12, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 13: + dict( + name='left_ankle', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 14: + dict(name='root', id=14, color=[51, 153, 255], type='lower', swap=''), + 15: + dict(name='spine', id=15, color=[51, 153, 255], type='upper', swap=''), + 16: + dict(name='head', id=16, color=[51, 153, 255], type='upper', swap='') + }, + skeleton_info={ + 0: dict(link=('neck', 'right_shoulder'), id=0, color=[255, 128, 0]), + 1: dict( + link=('right_shoulder', 'right_elbow'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_elbow', 'right_wrist'), id=2, color=[255, 128, 0]), + 3: dict(link=('neck', 'left_shoulder'), id=3, color=[0, 255, 0]), + 4: dict(link=('left_shoulder', 'left_elbow'), id=4, color=[0, 255, 0]), + 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 
255, 0]), + 6: dict(link=('root', 'right_hip'), id=6, color=[255, 128, 0]), + 7: dict(link=('right_hip', 'right_knee'), id=7, color=[255, 128, 0]), + 8: dict(link=('right_knee', 'right_ankle'), id=8, color=[255, 128, 0]), + 9: dict(link=('root', 'left_hip'), id=9, color=[0, 255, 0]), + 10: dict(link=('left_hip', 'left_knee'), id=10, color=[0, 255, 0]), + 11: dict(link=('left_knee', 'left_ankle'), id=11, color=[0, 255, 0]), + 12: dict(link=('head_top', 'head'), id=12, color=[51, 153, 255]), + 13: dict(link=('head', 'neck'), id=13, color=[51, 153, 255]), + 14: dict(link=('neck', 'spine'), id=14, color=[51, 153, 255]), + 15: dict(link=('spine', 'root'), id=15, color=[51, 153, 255]) + }, + joint_weights=[1.] * 17, + sigmas=[]) diff --git a/configs/_base_/datasets/mpii.py b/configs/_base_/datasets/mpii.py new file mode 100644 index 0000000..6c2a491 --- /dev/null +++ b/configs/_base_/datasets/mpii.py @@ -0,0 +1,155 @@ +dataset_info = dict( + dataset_name='mpii', + paper_info=dict( + author='Mykhaylo Andriluka and Leonid Pishchulin and ' + 'Peter Gehler and Schiele, Bernt', + title='2D Human Pose Estimation: New Benchmark and ' + 'State of the Art Analysis', + container='IEEE Conference on Computer Vision and ' + 'Pattern Recognition (CVPR)', + year='2014', + homepage='http://human-pose.mpi-inf.mpg.de/', + ), + keypoint_info={ + 0: + dict( + name='right_ankle', + id=0, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 1: + dict( + name='right_knee', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 2: + dict( + name='right_hip', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 3: + dict( + name='left_hip', + id=3, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 4: + dict( + name='left_knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 5: + dict( + name='left_ankle', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 6: + dict(name='pelvis', id=6, color=[51, 153, 255], type='lower', swap=''), + 7: + dict(name='thorax', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict( + name='upper_neck', + id=8, + color=[51, 153, 255], + type='upper', + swap=''), + 9: + dict( + name='head_top', id=9, color=[51, 153, 255], type='upper', + swap=''), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='right_elbow', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 12: + dict( + name='right_shoulder', + id=12, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 13: + dict( + name='left_shoulder', + id=13, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 14: + dict( + name='left_elbow', + id=14, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 15: + dict( + name='left_wrist', + id=15, + color=[0, 255, 0], + type='upper', + swap='right_wrist') + }, + skeleton_info={ + 0: + dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_hip', 'pelvis'), id=2, color=[255, 128, 0]), + 3: + dict(link=('pelvis', 'left_hip'), id=3, color=[0, 255, 0]), + 4: + dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]), + 5: + dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]), + 6: + dict(link=('pelvis', 'thorax'), id=6, color=[51, 153, 255]), + 7: + dict(link=('thorax', 'upper_neck'), id=7, color=[51, 153, 255]), + 8: + dict(link=('upper_neck', 
'head_top'), id=8, color=[51, 153, 255]), + 9: + dict(link=('upper_neck', 'right_shoulder'), id=9, color=[255, 128, 0]), + 10: + dict( + link=('right_shoulder', 'right_elbow'), id=10, color=[255, 128, + 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('upper_neck', 'left_shoulder'), id=12, color=[0, 255, 0]), + 13: + dict(link=('left_shoulder', 'left_elbow'), id=13, color=[0, 255, 0]), + 14: + dict(link=('left_elbow', 'left_wrist'), id=14, color=[0, 255, 0]) + }, + joint_weights=[ + 1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5 + ], + # Adapted from COCO dataset. + sigmas=[ + 0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026, + 0.062, 0.072, 0.179, 0.179, 0.072, 0.062 + ]) diff --git a/configs/_base_/datasets/mpii_info.py b/configs/_base_/datasets/mpii_info.py new file mode 100644 index 0000000..8090992 --- /dev/null +++ b/configs/_base_/datasets/mpii_info.py @@ -0,0 +1,155 @@ +mpii_info = dict( + dataset_name='mpii', + paper_info=dict( + author='Mykhaylo Andriluka and Leonid Pishchulin and ' + 'Peter Gehler and Schiele, Bernt', + title='2D Human Pose Estimation: New Benchmark and ' + 'State of the Art Analysis', + container='IEEE Conference on Computer Vision and ' + 'Pattern Recognition (CVPR)', + year='2014', + homepage='http://human-pose.mpi-inf.mpg.de/', + ), + keypoint_info={ + 0: + dict( + name='right_ankle', + id=0, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 1: + dict( + name='right_knee', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 2: + dict( + name='right_hip', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 3: + dict( + name='left_hip', + id=3, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 4: + dict( + name='left_knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 5: + dict( + name='left_ankle', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 6: + dict(name='pelvis', id=6, color=[51, 153, 255], type='lower', swap=''), + 7: + dict(name='thorax', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict( + name='upper_neck', + id=8, + color=[51, 153, 255], + type='upper', + swap=''), + 9: + dict( + name='head_top', id=9, color=[51, 153, 255], type='upper', + swap=''), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='right_elbow', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 12: + dict( + name='right_shoulder', + id=12, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 13: + dict( + name='left_shoulder', + id=13, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 14: + dict( + name='left_elbow', + id=14, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 15: + dict( + name='left_wrist', + id=15, + color=[0, 255, 0], + type='upper', + swap='right_wrist') + }, + skeleton_info={ + 0: + dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_hip', 'pelvis'), id=2, color=[255, 128, 0]), + 3: + dict(link=('pelvis', 'left_hip'), id=3, color=[0, 255, 0]), + 4: + dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]), + 5: + dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]), + 6: + dict(link=('pelvis', 'thorax'), id=6, color=[51, 153, 255]), + 7: + dict(link=('thorax', 'upper_neck'), id=7, 
color=[51, 153, 255]), + 8: + dict(link=('upper_neck', 'head_top'), id=8, color=[51, 153, 255]), + 9: + dict(link=('upper_neck', 'right_shoulder'), id=9, color=[255, 128, 0]), + 10: + dict( + link=('right_shoulder', 'right_elbow'), id=10, color=[255, 128, + 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('upper_neck', 'left_shoulder'), id=12, color=[0, 255, 0]), + 13: + dict(link=('left_shoulder', 'left_elbow'), id=13, color=[0, 255, 0]), + 14: + dict(link=('left_elbow', 'left_wrist'), id=14, color=[0, 255, 0]) + }, + joint_weights=[ + 1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5 + ], + # Adapted from COCO dataset. + sigmas=[ + 0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026, + 0.062, 0.072, 0.179, 0.179, 0.072, 0.062 + ]) diff --git a/configs/_base_/datasets/mpii_trb.py b/configs/_base_/datasets/mpii_trb.py new file mode 100644 index 0000000..73940d4 --- /dev/null +++ b/configs/_base_/datasets/mpii_trb.py @@ -0,0 +1,380 @@ +dataset_info = dict( + dataset_name='mpii_trb', + paper_info=dict( + author='Duan, Haodong and Lin, Kwan-Yee and Jin, Sheng and ' + 'Liu, Wentao and Qian, Chen and Ouyang, Wanli', + title='TRB: A Novel Triplet Representation for ' + 'Understanding 2D Human Body', + container='Proceedings of the IEEE International ' + 'Conference on Computer Vision', + year='2019', + homepage='https://github.com/kennymckormick/' + 'Triplet-Representation-of-human-Body', + ), + keypoint_info={ + 0: + dict( + name='left_shoulder', + id=0, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 1: + dict( + name='right_shoulder', + id=1, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 2: + dict( + name='left_elbow', + id=2, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 3: + dict( + name='right_elbow', + id=3, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 4: + dict( + name='left_wrist', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 5: + dict( + name='right_wrist', + id=5, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 6: + dict( + name='left_hip', + id=6, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 7: + dict( + name='right_hip', + id=7, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 8: + dict( + name='left_knee', + id=8, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 9: + dict( + name='right_knee', + id=9, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 10: + dict( + name='left_ankle', + id=10, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 11: + dict( + name='right_ankle', + id=11, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 12: + dict(name='head', id=12, color=[51, 153, 255], type='upper', swap=''), + 13: + dict(name='neck', id=13, color=[51, 153, 255], type='upper', swap=''), + 14: + dict( + name='right_neck', + id=14, + color=[255, 255, 255], + type='upper', + swap='left_neck'), + 15: + dict( + name='left_neck', + id=15, + color=[255, 255, 255], + type='upper', + swap='right_neck'), + 16: + dict( + name='medial_right_shoulder', + id=16, + color=[255, 255, 255], + type='upper', + swap='medial_left_shoulder'), + 17: + dict( + name='lateral_right_shoulder', + id=17, + color=[255, 255, 255], + type='upper', + swap='lateral_left_shoulder'), + 18: + dict( + name='medial_right_bow', + id=18, + color=[255, 255, 255], + type='upper', + swap='medial_left_bow'), + 19: + dict( + name='lateral_right_bow', + 
id=19, + color=[255, 255, 255], + type='upper', + swap='lateral_left_bow'), + 20: + dict( + name='medial_right_wrist', + id=20, + color=[255, 255, 255], + type='upper', + swap='medial_left_wrist'), + 21: + dict( + name='lateral_right_wrist', + id=21, + color=[255, 255, 255], + type='upper', + swap='lateral_left_wrist'), + 22: + dict( + name='medial_left_shoulder', + id=22, + color=[255, 255, 255], + type='upper', + swap='medial_right_shoulder'), + 23: + dict( + name='lateral_left_shoulder', + id=23, + color=[255, 255, 255], + type='upper', + swap='lateral_right_shoulder'), + 24: + dict( + name='medial_left_bow', + id=24, + color=[255, 255, 255], + type='upper', + swap='medial_right_bow'), + 25: + dict( + name='lateral_left_bow', + id=25, + color=[255, 255, 255], + type='upper', + swap='lateral_right_bow'), + 26: + dict( + name='medial_left_wrist', + id=26, + color=[255, 255, 255], + type='upper', + swap='medial_right_wrist'), + 27: + dict( + name='lateral_left_wrist', + id=27, + color=[255, 255, 255], + type='upper', + swap='lateral_right_wrist'), + 28: + dict( + name='medial_right_hip', + id=28, + color=[255, 255, 255], + type='lower', + swap='medial_left_hip'), + 29: + dict( + name='lateral_right_hip', + id=29, + color=[255, 255, 255], + type='lower', + swap='lateral_left_hip'), + 30: + dict( + name='medial_right_knee', + id=30, + color=[255, 255, 255], + type='lower', + swap='medial_left_knee'), + 31: + dict( + name='lateral_right_knee', + id=31, + color=[255, 255, 255], + type='lower', + swap='lateral_left_knee'), + 32: + dict( + name='medial_right_ankle', + id=32, + color=[255, 255, 255], + type='lower', + swap='medial_left_ankle'), + 33: + dict( + name='lateral_right_ankle', + id=33, + color=[255, 255, 255], + type='lower', + swap='lateral_left_ankle'), + 34: + dict( + name='medial_left_hip', + id=34, + color=[255, 255, 255], + type='lower', + swap='medial_right_hip'), + 35: + dict( + name='lateral_left_hip', + id=35, + color=[255, 255, 255], + type='lower', + swap='lateral_right_hip'), + 36: + dict( + name='medial_left_knee', + id=36, + color=[255, 255, 255], + type='lower', + swap='medial_right_knee'), + 37: + dict( + name='lateral_left_knee', + id=37, + color=[255, 255, 255], + type='lower', + swap='lateral_right_knee'), + 38: + dict( + name='medial_left_ankle', + id=38, + color=[255, 255, 255], + type='lower', + swap='medial_right_ankle'), + 39: + dict( + name='lateral_left_ankle', + id=39, + color=[255, 255, 255], + type='lower', + swap='lateral_right_ankle'), + }, + skeleton_info={ + 0: + dict(link=('head', 'neck'), id=0, color=[51, 153, 255]), + 1: + dict(link=('neck', 'left_shoulder'), id=1, color=[51, 153, 255]), + 2: + dict(link=('neck', 'right_shoulder'), id=2, color=[51, 153, 255]), + 3: + dict(link=('left_shoulder', 'left_elbow'), id=3, color=[0, 255, 0]), + 4: + dict( + link=('right_shoulder', 'right_elbow'), id=4, color=[255, 128, 0]), + 5: + dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]), + 6: + dict(link=('right_elbow', 'right_wrist'), id=6, color=[255, 128, 0]), + 7: + dict(link=('left_shoulder', 'left_hip'), id=7, color=[51, 153, 255]), + 8: + dict(link=('right_shoulder', 'right_hip'), id=8, color=[51, 153, 255]), + 9: + dict(link=('left_hip', 'right_hip'), id=9, color=[51, 153, 255]), + 10: + dict(link=('left_hip', 'left_knee'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_hip', 'right_knee'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_knee', 'left_ankle'), id=12, color=[0, 255, 0]), + 13: + dict(link=('right_knee', 
'right_ankle'), id=13, color=[255, 128, 0]), + 14: + dict(link=('right_neck', 'left_neck'), id=14, color=[255, 255, 255]), + 15: + dict( + link=('medial_right_shoulder', 'lateral_right_shoulder'), + id=15, + color=[255, 255, 255]), + 16: + dict( + link=('medial_right_bow', 'lateral_right_bow'), + id=16, + color=[255, 255, 255]), + 17: + dict( + link=('medial_right_wrist', 'lateral_right_wrist'), + id=17, + color=[255, 255, 255]), + 18: + dict( + link=('medial_left_shoulder', 'lateral_left_shoulder'), + id=18, + color=[255, 255, 255]), + 19: + dict( + link=('medial_left_bow', 'lateral_left_bow'), + id=19, + color=[255, 255, 255]), + 20: + dict( + link=('medial_left_wrist', 'lateral_left_wrist'), + id=20, + color=[255, 255, 255]), + 21: + dict( + link=('medial_right_hip', 'lateral_right_hip'), + id=21, + color=[255, 255, 255]), + 22: + dict( + link=('medial_right_knee', 'lateral_right_knee'), + id=22, + color=[255, 255, 255]), + 23: + dict( + link=('medial_right_ankle', 'lateral_right_ankle'), + id=23, + color=[255, 255, 255]), + 24: + dict( + link=('medial_left_hip', 'lateral_left_hip'), + id=24, + color=[255, 255, 255]), + 25: + dict( + link=('medial_left_knee', 'lateral_left_knee'), + id=25, + color=[255, 255, 255]), + 26: + dict( + link=('medial_left_ankle', 'lateral_left_ankle'), + id=26, + color=[255, 255, 255]) + }, + joint_weights=[1.] * 40, + sigmas=[]) diff --git a/configs/_base_/datasets/ochuman.py b/configs/_base_/datasets/ochuman.py new file mode 100644 index 0000000..2ef2083 --- /dev/null +++ b/configs/_base_/datasets/ochuman.py @@ -0,0 +1,181 @@ +dataset_info = dict( + dataset_name='ochuman', + paper_info=dict( + author='Zhang, Song-Hai and Li, Ruilong and Dong, Xin and ' + 'Rosin, Paul and Cai, Zixi and Han, Xi and ' + 'Yang, Dingcheng and Huang, Haozhi and Hu, Shi-Min', + title='Pose2seg: Detection free human instance segmentation', + container='Proceedings of the IEEE conference on computer ' + 'vision and pattern recognition', + year='2019', + homepage='https://github.com/liruilong940607/OCHumanApi', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 
128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/configs/_base_/datasets/onehand10k.py b/configs/_base_/datasets/onehand10k.py new file mode 100644 index 0000000..016770f --- /dev/null +++ b/configs/_base_/datasets/onehand10k.py @@ -0,0 +1,142 @@ +dataset_info = dict( + dataset_name='onehand10k', + paper_info=dict( + author='Wang, Yangang and Peng, Cong and Liu, Yebin', + title='Mask-pose cascaded cnn for 2d hand pose estimation ' + 'from single color image', + container='IEEE Transactions on Circuits and Systems ' + 'for Video Technology', + year='2018', + homepage='https://www.yangangwang.com/papers/WANG-MCC-2018-10.html', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + color=[102, 178, 
255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] 
* 21, + sigmas=[]) diff --git a/configs/_base_/datasets/panoptic_body3d.py b/configs/_base_/datasets/panoptic_body3d.py new file mode 100644 index 0000000..e3b19ac --- /dev/null +++ b/configs/_base_/datasets/panoptic_body3d.py @@ -0,0 +1,160 @@ +dataset_info = dict( + dataset_name='panoptic_pose_3d', + paper_info=dict( + author='Joo, Hanbyul and Simon, Tomas and Li, Xulong' + 'and Liu, Hao and Tan, Lei and Gui, Lin and Banerjee, Sean' + 'and Godisart, Timothy and Nabbe, Bart and Matthews, Iain' + 'and Kanade, Takeo and Nobuhara, Shohei and Sheikh, Yaser', + title='Panoptic Studio: A Massively Multiview System ' + 'for Interaction Motion Capture', + container='IEEE Transactions on Pattern Analysis' + ' and Machine Intelligence', + year='2017', + homepage='http://domedb.perception.cs.cmu.edu', + ), + keypoint_info={ + 0: + dict(name='neck', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict(name='nose', id=1, color=[51, 153, 255], type='upper', swap=''), + 2: + dict(name='mid_hip', id=2, color=[0, 255, 0], type='lower', swap=''), + 3: + dict( + name='left_shoulder', + id=3, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 4: + dict( + name='left_elbow', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 5: + dict( + name='left_wrist', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 6: + dict( + name='left_hip', + id=6, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 7: + dict( + name='left_knee', + id=7, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 8: + dict( + name='left_ankle', + id=8, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 9: + dict( + name='right_shoulder', + id=9, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 10: + dict( + name='right_elbow', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 11: + dict( + name='right_wrist', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='right_knee', + id=13, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 14: + dict( + name='right_ankle', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 15: + dict( + name='left_eye', + id=15, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 16: + dict( + name='left_ear', + id=16, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 17: + dict( + name='right_eye', + id=17, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 18: + dict( + name='right_ear', + id=18, + color=[51, 153, 255], + type='upper', + swap='left_ear') + }, + skeleton_info={ + 0: dict(link=('nose', 'neck'), id=0, color=[51, 153, 255]), + 1: dict(link=('neck', 'left_shoulder'), id=1, color=[0, 255, 0]), + 2: dict(link=('neck', 'right_shoulder'), id=2, color=[255, 128, 0]), + 3: dict(link=('left_shoulder', 'left_elbow'), id=3, color=[0, 255, 0]), + 4: dict( + link=('right_shoulder', 'right_elbow'), id=4, color=[255, 128, 0]), + 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]), + 6: + dict(link=('right_elbow', 'right_wrist'), id=6, color=[255, 128, 0]), + 7: dict(link=('left_ankle', 'left_knee'), id=7, color=[0, 255, 0]), + 8: dict(link=('left_knee', 'left_hip'), id=8, color=[0, 255, 0]), + 9: dict(link=('right_ankle', 'right_knee'), id=9, color=[255, 128, 0]), + 10: dict(link=('right_knee', 'right_hip'), id=10, color=[255, 128, 0]), + 11: dict(link=('mid_hip', 
'left_hip'), id=11, color=[0, 255, 0]), + 12: dict(link=('mid_hip', 'right_hip'), id=12, color=[255, 128, 0]), + 13: dict(link=('mid_hip', 'neck'), id=13, color=[51, 153, 255]), + }, + joint_weights=[ + 1.0, 1.0, 1.0, 1.0, 1.2, 1.5, 1.0, 1.2, 1.5, 1.0, 1.2, 1.5, 1.0, 1.2, + 1.5, 1.0, 1.0, 1.0, 1.0 + ], + sigmas=[ + 0.026, 0.026, 0.107, 0.079, 0.072, 0.062, 0.107, 0.087, 0.089, 0.079, + 0.072, 0.062, 0.107, 0.087, 0.089, 0.025, 0.035, 0.025, 0.035 + ]) diff --git a/configs/_base_/datasets/panoptic_hand2d.py b/configs/_base_/datasets/panoptic_hand2d.py new file mode 100644 index 0000000..7a65731 --- /dev/null +++ b/configs/_base_/datasets/panoptic_hand2d.py @@ -0,0 +1,143 @@ +dataset_info = dict( + dataset_name='panoptic_hand2d', + paper_info=dict( + author='Simon, Tomas and Joo, Hanbyul and ' + 'Matthews, Iain and Sheikh, Yaser', + title='Hand keypoint detection in single images using ' + 'multiview bootstrapping', + container='Proceedings of the IEEE conference on ' + 'Computer Vision and Pattern Recognition', + year='2017', + homepage='http://domedb.perception.cs.cmu.edu/handdb.html', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + 
link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] * 21, + sigmas=[]) diff --git a/configs/_base_/datasets/posetrack18.py b/configs/_base_/datasets/posetrack18.py new file mode 100644 index 0000000..5aefd1c --- /dev/null +++ b/configs/_base_/datasets/posetrack18.py @@ -0,0 +1,176 @@ +dataset_info = dict( + dataset_name='posetrack18', + paper_info=dict( + author='Andriluka, Mykhaylo and Iqbal, Umar and ' + 'Insafutdinov, Eldar and Pishchulin, Leonid and ' + 'Milan, Anton and Gall, Juergen and Schiele, Bernt', + title='Posetrack: A benchmark for human pose estimation and tracking', + container='Proceedings of the IEEE Conference on ' + 'Computer Vision and Pattern Recognition', + year='2018', + homepage='https://posetrack.net/users/download.php', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='head_bottom', + id=1, + color=[51, 153, 255], + type='upper', + swap=''), + 2: + dict( + name='head_top', id=2, color=[51, 153, 255], type='upper', + swap=''), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + 
dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('nose', 'head_bottom'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'head_top'), id=13, color=[51, 153, 255]), + 14: + dict( + link=('head_bottom', 'left_shoulder'), id=14, color=[51, 153, + 255]), + 15: + dict( + link=('head_bottom', 'right_shoulder'), + id=15, + color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/configs/_base_/datasets/rhd2d.py b/configs/_base_/datasets/rhd2d.py new file mode 100644 index 0000000..f48e637 --- /dev/null +++ b/configs/_base_/datasets/rhd2d.py @@ -0,0 +1,141 @@ +dataset_info = dict( + dataset_name='rhd2d', + paper_info=dict( + author='Christian Zimmermann and Thomas Brox', + title='Learning to Estimate 3D Hand Pose from Single RGB Images', + container='arXiv', + year='2017', + homepage='https://lmb.informatik.uni-freiburg.de/resources/' + 'datasets/RenderedHandposeDataset.en.html', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + 
dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] * 21, + sigmas=[]) diff --git a/configs/_base_/datasets/wflw.py b/configs/_base_/datasets/wflw.py new file mode 100644 index 0000000..bed6f56 --- /dev/null +++ b/configs/_base_/datasets/wflw.py @@ -0,0 +1,582 @@ +dataset_info = dict( + dataset_name='wflw', + paper_info=dict( + author='Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, ' + 'Quan and Cai, Yici and Zhou, Qiang', + title='Look at boundary: A boundary-aware face alignment algorithm', + container='Proceedings of the IEEE conference on computer ' + 'vision and pattern recognition', + year='2018', + homepage='https://wywu.github.io/projects/LAB/WFLW.html', + ), + keypoint_info={ + 0: + dict( + name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-32'), + 1: + dict( + name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-31'), + 2: + dict( + name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-30'), + 3: + dict( + name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-29'), + 4: + dict( + name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-28'), + 5: + dict( + name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-27'), + 6: + dict( + name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-26'), + 7: + dict( + name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-25'), + 8: + dict( + name='kpt-8', id=8, color=[255, 255, 255], type='', swap='kpt-24'), + 9: + dict( + name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-23'), + 10: + dict( + name='kpt-10', + id=10, + color=[255, 255, 255], + type='', + swap='kpt-22'), + 11: + dict( + name='kpt-11', + id=11, + color=[255, 255, 255], + type='', + swap='kpt-21'), + 12: + dict( + name='kpt-12', + id=12, + color=[255, 255, 255], + type='', + swap='kpt-20'), + 13: + dict( + name='kpt-13', + id=13, + color=[255, 255, 255], + type='', + swap='kpt-19'), 
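+        # contour points 0-32: kpt-i and kpt-(32-i) are horizontal-flip pairs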
+ 14: + dict( + name='kpt-14', + id=14, + color=[255, 255, 255], + type='', + swap='kpt-18'), + 15: + dict( + name='kpt-15', + id=15, + color=[255, 255, 255], + type='', + swap='kpt-17'), + 16: + dict(name='kpt-16', id=16, color=[255, 255, 255], type='', swap=''), + 17: + dict( + name='kpt-17', + id=17, + color=[255, 255, 255], + type='', + swap='kpt-15'), + 18: + dict( + name='kpt-18', + id=18, + color=[255, 255, 255], + type='', + swap='kpt-14'), + 19: + dict( + name='kpt-19', + id=19, + color=[255, 255, 255], + type='', + swap='kpt-13'), + 20: + dict( + name='kpt-20', + id=20, + color=[255, 255, 255], + type='', + swap='kpt-12'), + 21: + dict( + name='kpt-21', + id=21, + color=[255, 255, 255], + type='', + swap='kpt-11'), + 22: + dict( + name='kpt-22', + id=22, + color=[255, 255, 255], + type='', + swap='kpt-10'), + 23: + dict( + name='kpt-23', id=23, color=[255, 255, 255], type='', + swap='kpt-9'), + 24: + dict( + name='kpt-24', id=24, color=[255, 255, 255], type='', + swap='kpt-8'), + 25: + dict( + name='kpt-25', id=25, color=[255, 255, 255], type='', + swap='kpt-7'), + 26: + dict( + name='kpt-26', id=26, color=[255, 255, 255], type='', + swap='kpt-6'), + 27: + dict( + name='kpt-27', id=27, color=[255, 255, 255], type='', + swap='kpt-5'), + 28: + dict( + name='kpt-28', id=28, color=[255, 255, 255], type='', + swap='kpt-4'), + 29: + dict( + name='kpt-29', id=29, color=[255, 255, 255], type='', + swap='kpt-3'), + 30: + dict( + name='kpt-30', id=30, color=[255, 255, 255], type='', + swap='kpt-2'), + 31: + dict( + name='kpt-31', id=31, color=[255, 255, 255], type='', + swap='kpt-1'), + 32: + dict( + name='kpt-32', id=32, color=[255, 255, 255], type='', + swap='kpt-0'), + 33: + dict( + name='kpt-33', + id=33, + color=[255, 255, 255], + type='', + swap='kpt-46'), + 34: + dict( + name='kpt-34', + id=34, + color=[255, 255, 255], + type='', + swap='kpt-45'), + 35: + dict( + name='kpt-35', + id=35, + color=[255, 255, 255], + type='', + swap='kpt-44'), + 36: + dict( + name='kpt-36', + id=36, + color=[255, 255, 255], + type='', + swap='kpt-43'), + 37: + dict( + name='kpt-37', + id=37, + color=[255, 255, 255], + type='', + swap='kpt-42'), + 38: + dict( + name='kpt-38', + id=38, + color=[255, 255, 255], + type='', + swap='kpt-50'), + 39: + dict( + name='kpt-39', + id=39, + color=[255, 255, 255], + type='', + swap='kpt-49'), + 40: + dict( + name='kpt-40', + id=40, + color=[255, 255, 255], + type='', + swap='kpt-48'), + 41: + dict( + name='kpt-41', + id=41, + color=[255, 255, 255], + type='', + swap='kpt-47'), + 42: + dict( + name='kpt-42', + id=42, + color=[255, 255, 255], + type='', + swap='kpt-37'), + 43: + dict( + name='kpt-43', + id=43, + color=[255, 255, 255], + type='', + swap='kpt-36'), + 44: + dict( + name='kpt-44', + id=44, + color=[255, 255, 255], + type='', + swap='kpt-35'), + 45: + dict( + name='kpt-45', + id=45, + color=[255, 255, 255], + type='', + swap='kpt-34'), + 46: + dict( + name='kpt-46', + id=46, + color=[255, 255, 255], + type='', + swap='kpt-33'), + 47: + dict( + name='kpt-47', + id=47, + color=[255, 255, 255], + type='', + swap='kpt-41'), + 48: + dict( + name='kpt-48', + id=48, + color=[255, 255, 255], + type='', + swap='kpt-40'), + 49: + dict( + name='kpt-49', + id=49, + color=[255, 255, 255], + type='', + swap='kpt-39'), + 50: + dict( + name='kpt-50', + id=50, + color=[255, 255, 255], + type='', + swap='kpt-38'), + 51: + dict(name='kpt-51', id=51, color=[255, 255, 255], type='', swap=''), + 52: + dict(name='kpt-52', id=52, color=[255, 255, 255], type='', swap=''), + 53: + 
dict(name='kpt-53', id=53, color=[255, 255, 255], type='', swap=''), + 54: + dict(name='kpt-54', id=54, color=[255, 255, 255], type='', swap=''), + 55: + dict( + name='kpt-55', + id=55, + color=[255, 255, 255], + type='', + swap='kpt-59'), + 56: + dict( + name='kpt-56', + id=56, + color=[255, 255, 255], + type='', + swap='kpt-58'), + 57: + dict(name='kpt-57', id=57, color=[255, 255, 255], type='', swap=''), + 58: + dict( + name='kpt-58', + id=58, + color=[255, 255, 255], + type='', + swap='kpt-56'), + 59: + dict( + name='kpt-59', + id=59, + color=[255, 255, 255], + type='', + swap='kpt-55'), + 60: + dict( + name='kpt-60', + id=60, + color=[255, 255, 255], + type='', + swap='kpt-72'), + 61: + dict( + name='kpt-61', + id=61, + color=[255, 255, 255], + type='', + swap='kpt-71'), + 62: + dict( + name='kpt-62', + id=62, + color=[255, 255, 255], + type='', + swap='kpt-70'), + 63: + dict( + name='kpt-63', + id=63, + color=[255, 255, 255], + type='', + swap='kpt-69'), + 64: + dict( + name='kpt-64', + id=64, + color=[255, 255, 255], + type='', + swap='kpt-68'), + 65: + dict( + name='kpt-65', + id=65, + color=[255, 255, 255], + type='', + swap='kpt-75'), + 66: + dict( + name='kpt-66', + id=66, + color=[255, 255, 255], + type='', + swap='kpt-74'), + 67: + dict( + name='kpt-67', + id=67, + color=[255, 255, 255], + type='', + swap='kpt-73'), + 68: + dict( + name='kpt-68', + id=68, + color=[255, 255, 255], + type='', + swap='kpt-64'), + 69: + dict( + name='kpt-69', + id=69, + color=[255, 255, 255], + type='', + swap='kpt-63'), + 70: + dict( + name='kpt-70', + id=70, + color=[255, 255, 255], + type='', + swap='kpt-62'), + 71: + dict( + name='kpt-71', + id=71, + color=[255, 255, 255], + type='', + swap='kpt-61'), + 72: + dict( + name='kpt-72', + id=72, + color=[255, 255, 255], + type='', + swap='kpt-60'), + 73: + dict( + name='kpt-73', + id=73, + color=[255, 255, 255], + type='', + swap='kpt-67'), + 74: + dict( + name='kpt-74', + id=74, + color=[255, 255, 255], + type='', + swap='kpt-66'), + 75: + dict( + name='kpt-75', + id=75, + color=[255, 255, 255], + type='', + swap='kpt-65'), + 76: + dict( + name='kpt-76', + id=76, + color=[255, 255, 255], + type='', + swap='kpt-82'), + 77: + dict( + name='kpt-77', + id=77, + color=[255, 255, 255], + type='', + swap='kpt-81'), + 78: + dict( + name='kpt-78', + id=78, + color=[255, 255, 255], + type='', + swap='kpt-80'), + 79: + dict(name='kpt-79', id=79, color=[255, 255, 255], type='', swap=''), + 80: + dict( + name='kpt-80', + id=80, + color=[255, 255, 255], + type='', + swap='kpt-78'), + 81: + dict( + name='kpt-81', + id=81, + color=[255, 255, 255], + type='', + swap='kpt-77'), + 82: + dict( + name='kpt-82', + id=82, + color=[255, 255, 255], + type='', + swap='kpt-76'), + 83: + dict( + name='kpt-83', + id=83, + color=[255, 255, 255], + type='', + swap='kpt-87'), + 84: + dict( + name='kpt-84', + id=84, + color=[255, 255, 255], + type='', + swap='kpt-86'), + 85: + dict(name='kpt-85', id=85, color=[255, 255, 255], type='', swap=''), + 86: + dict( + name='kpt-86', + id=86, + color=[255, 255, 255], + type='', + swap='kpt-84'), + 87: + dict( + name='kpt-87', + id=87, + color=[255, 255, 255], + type='', + swap='kpt-83'), + 88: + dict( + name='kpt-88', + id=88, + color=[255, 255, 255], + type='', + swap='kpt-92'), + 89: + dict( + name='kpt-89', + id=89, + color=[255, 255, 255], + type='', + swap='kpt-91'), + 90: + dict(name='kpt-90', id=90, color=[255, 255, 255], type='', swap=''), + 91: + dict( + name='kpt-91', + id=91, + color=[255, 255, 255], + type='', + 
swap='kpt-89'), + 92: + dict( + name='kpt-92', + id=92, + color=[255, 255, 255], + type='', + swap='kpt-88'), + 93: + dict( + name='kpt-93', + id=93, + color=[255, 255, 255], + type='', + swap='kpt-95'), + 94: + dict(name='kpt-94', id=94, color=[255, 255, 255], type='', swap=''), + 95: + dict( + name='kpt-95', + id=95, + color=[255, 255, 255], + type='', + swap='kpt-93'), + 96: + dict( + name='kpt-96', + id=96, + color=[255, 255, 255], + type='', + swap='kpt-97'), + 97: + dict( + name='kpt-97', + id=97, + color=[255, 255, 255], + type='', + swap='kpt-96') + }, + skeleton_info={}, + joint_weights=[1.] * 98, + sigmas=[]) diff --git a/configs/_base_/datasets/zebra.py b/configs/_base_/datasets/zebra.py new file mode 100644 index 0000000..eac71f7 --- /dev/null +++ b/configs/_base_/datasets/zebra.py @@ -0,0 +1,64 @@ +dataset_info = dict( + dataset_name='zebra', + paper_info=dict( + author='Graving, Jacob M and Chae, Daniel and Naik, Hemal and ' + 'Li, Liang and Koger, Benjamin and Costelloe, Blair R and ' + 'Couzin, Iain D', + title='DeepPoseKit, a software toolkit for fast and robust ' + 'animal pose estimation using deep learning', + container='Elife', + year='2019', + homepage='https://github.com/jgraving/DeepPoseKit-Data', + ), + keypoint_info={ + 0: + dict(name='snout', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='head', id=1, color=[255, 255, 255], type='', swap=''), + 2: + dict(name='neck', id=2, color=[255, 255, 255], type='', swap=''), + 3: + dict( + name='forelegL1', + id=3, + color=[255, 255, 255], + type='', + swap='forelegR1'), + 4: + dict( + name='forelegR1', + id=4, + color=[255, 255, 255], + type='', + swap='forelegL1'), + 5: + dict( + name='hindlegL1', + id=5, + color=[255, 255, 255], + type='', + swap='hindlegR1'), + 6: + dict( + name='hindlegR1', + id=6, + color=[255, 255, 255], + type='', + swap='hindlegL1'), + 7: + dict(name='tailbase', id=7, color=[255, 255, 255], type='', swap=''), + 8: + dict(name='tailtip', id=8, color=[255, 255, 255], type='', swap='') + }, + skeleton_info={ + 0: dict(link=('head', 'snout'), id=0, color=[255, 255, 255]), + 1: dict(link=('neck', 'head'), id=1, color=[255, 255, 255]), + 2: dict(link=('forelegL1', 'neck'), id=2, color=[255, 255, 255]), + 3: dict(link=('forelegR1', 'neck'), id=3, color=[255, 255, 255]), + 4: dict(link=('hindlegL1', 'tailbase'), id=4, color=[255, 255, 255]), + 5: dict(link=('hindlegR1', 'tailbase'), id=5, color=[255, 255, 255]), + 6: dict(link=('tailbase', 'neck'), id=6, color=[255, 255, 255]), + 7: dict(link=('tailtip', 'tailbase'), id=7, color=[255, 255, 255]) + }, + joint_weights=[1.] 
* 9, + sigmas=[]) diff --git a/configs/_base_/default_runtime.py b/configs/_base_/default_runtime.py new file mode 100644 index 0000000..d78da5a --- /dev/null +++ b/configs/_base_/default_runtime.py @@ -0,0 +1,19 @@ +checkpoint_config = dict(interval=10) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] + +# disable opencv multithreading to avoid system being overloaded +opencv_num_threads = 0 +# set multi-process start method as `fork` to speed up the training +mp_start_method = 'fork' diff --git a/configs/_base_/filters/gausian_filter.py b/configs/_base_/filters/gausian_filter.py new file mode 100644 index 0000000..e69de29 diff --git a/configs/detection/yolo_classes.py b/configs/detection/yolo_classes.py new file mode 100644 index 0000000..2339a11 --- /dev/null +++ b/configs/detection/yolo_classes.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +YOLO_COCO_80_CLASSES = [ +"person", +"bicycle", +"car", +"motorbike", +"aeroplane", +"bus", +"train", +"truck", +"boat", +"traffic light", +"fire hydrant", +"stop sign", +"parking meter", +"bench", +"bird", +"cat", +"dog", +"horse", +"sheep", +"cow", +"elephant", +"bear", +"zebra", +"giraffe", +"backpack", +"umbrella", +"handbag", +"tie", +"suitcase", +"frisbee", +"skis", +"snowboard", +"sports ball", +"kite", +"baseball bat", +"baseball glove", +"skateboard", +"surfboard", +"tennis racket", +"bottle", +"wine glass", +"cup", +"fork", +"knife", +"spoon", +"bowl", +"banana", +"apple", +"sandwich", +"orange", +"broccoli", +"carrot", +"hot dog", +"pizza", +"donut", +"cake", +"chair", +"sofa", +"pottedplant", +"bed", +"diningtable", +"toilet", +"tvmonitor", +"laptop", +"mouse", +"remote", +"keyboard", +"cell phone", +"microwave", +"oven", +"toaster", +"sink", +"refrigerator", +"book", +"clock", +"vase", +"scissors", +"teddy bear", +"hair drier", +"toothbrush"] \ No newline at end of file diff --git a/configs/detection/yolov3_d53_320_273e_coco.py b/configs/detection/yolov3_d53_320_273e_coco.py new file mode 100644 index 0000000..d7e9cca --- /dev/null +++ b/configs/detection/yolov3_d53_320_273e_coco.py @@ -0,0 +1,140 @@ +# model settings +model = dict( + type='YOLOV3', + pretrained='open-mmlab://darknet53', + backbone=dict(type='Darknet', depth=53, out_indices=(3, 4, 5)), + neck=dict( + type='YOLOV3Neck', + num_scales=3, + in_channels=[1024, 512, 256], + out_channels=[512, 256, 128]), + bbox_head=dict( + type='YOLOV3Head', + num_classes=80, + in_channels=[512, 256, 128], + out_channels=[1024, 512, 256], + anchor_generator=dict( + type='YOLOAnchorGenerator', + base_sizes=[[(116, 90), (156, 198), (373, 326)], + [(30, 61), (62, 45), (59, 119)], + [(10, 13), (16, 30), (33, 23)]], + strides=[32, 16, 8]), + bbox_coder=dict(type='YOLOBBoxCoder'), + featmap_strides=[32, 16, 8], + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0, + reduction='sum'), + loss_conf=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0, + reduction='sum'), + loss_xy=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=2.0, + reduction='sum'), + loss_wh=dict(type='MSELoss', loss_weight=2.0, reduction='sum')), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='GridAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0)), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + 
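+        # score_thr/conf_thr below filter low-confidence boxes before NMS (iou_threshold=0.45); at most 100 detections are kept per image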
score_thr=0.05, + conf_thr=0.005, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=100)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco' +img_norm_cfg = dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='PhotoMetricDistortion'), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 2)), + dict( + type='MinIoURandomCrop', + min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=(320, 320), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(320, 320), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=f'{data_root}/annotations/instances_train2017.json', + img_prefix=f'{data_root}/train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=f'{data_root}/annotations/instances_val2017.json', + img_prefix=f'{data_root}/val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=f'{data_root}/annotations/instances_val2017.json', + img_prefix=f'{data_root}/val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=2000, # same as burn-in in darknet + warmup_ratio=0.1, + step=[218, 246]) +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=273) +evaluation = dict(interval=1, metric=['bbox']) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +custom_hooks = [dict(type='NumClassCheckHook')] + +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/pose/ViTPose_base_coco_256x192.py b/configs/pose/ViTPose_base_coco_256x192.py new file mode 100644 index 0000000..f61b314 --- /dev/null +++ b/configs/pose/ViTPose_base_coco_256x192.py @@ -0,0 +1,170 @@ +_base_ = [ + '../_base_/default_runtime.py', + '../_base_/datasets/coco.py' +] +evaluation = dict(interval=10, metric='mAP', save_best='AP') + +optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1, + constructor='LayerDecayOptimizerConstructor', + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.75, + custom_keys={ + 'bias': dict(decay_multi=0.), + 'pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) 
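+                        # decay_mult=0. exempts these parameter groups from weight decay ('decay_multi' above is presumably meant to be 'decay_mult')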
+ } + ) + ) + +optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +target_type = 'GaussianHeatmap' +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained=None, + backbone=dict( + type='ViT', + img_size=(256, 192), + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + ratio=1, + use_checkpoint=False, + mlp_ratio=4, + qkv_bias=True, + drop_path_rate=0.3, + ), + keypoint_head=dict( + type='TopdownHeatmapSimpleHead', + in_channels=768, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process='default', + shift_heatmap=False, + target_type=target_type, + modulate_kernel=11, + use_udp=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=False, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='TopDownGenerateTarget', + sigma=2, + encoding='UDP', + target_type=target_type), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=4, + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + 
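+    # the test entry below reuses the val2017 annotations and the shared data_cfg / bbox_file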
test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) + diff --git a/configs/pose/ViTPose_base_simple_coco_256x192.py b/configs/pose/ViTPose_base_simple_coco_256x192.py new file mode 100644 index 0000000..59985e4 --- /dev/null +++ b/configs/pose/ViTPose_base_simple_coco_256x192.py @@ -0,0 +1,171 @@ +_base_ = [ + '../_base_/default_runtime.py', + '../_base_/datasets/coco.py' +] +evaluation = dict(interval=10, metric='mAP', save_best='AP') + +optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1, + constructor='LayerDecayOptimizerConstructor', + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.75, + custom_keys={ + 'bias': dict(decay_multi=0.), + 'pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + } + ) + ) + +optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +target_type = 'GaussianHeatmap' +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained=None, + backbone=dict( + type='ViT', + img_size=(256, 192), + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + ratio=1, + use_checkpoint=False, + mlp_ratio=4, + qkv_bias=True, + drop_path_rate=0.3, + ), + keypoint_head=dict( + type='TopdownHeatmapSimpleHead', + in_channels=768, + num_deconv_layers=0, + num_deconv_filters=[], + num_deconv_kernels=[], + upsample=4, + extra=dict(final_conv_kernel=3, ), + out_channels=channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process='default', + shift_heatmap=False, + target_type=target_type, + modulate_kernel=11, + use_udp=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=False, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='TopDownGenerateTarget', + sigma=2, + encoding='UDP', + target_type=target_type), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + 
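+    # the affine step below uses UDP, matching the 'UDP' target encoding in train_pipeline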
dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=4, + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) + diff --git a/configs/pose/ViTPose_small_coco_256x192.py b/configs/pose/ViTPose_small_coco_256x192.py new file mode 100644 index 0000000..80683c6 --- /dev/null +++ b/configs/pose/ViTPose_small_coco_256x192.py @@ -0,0 +1,170 @@ +_base_ = [ + '../_base_/default_runtime.py', + '../_base_/datasets/coco.py' +] +evaluation = dict(interval=10, metric='mAP', save_best='AP') + +optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1, + constructor='LayerDecayOptimizerConstructor', + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.8, + custom_keys={ + 'bias': dict(decay_multi=0.), + 'pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) 
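+                        # LayerDecayOptimizerConstructor also applies per-layer lr decay (layer_decay_rate=0.8, num_layers=12)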
+ } + ) + ) + +optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +target_type = 'GaussianHeatmap' +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained=None, + backbone=dict( + type='ViT', + img_size=(256, 192), + patch_size=16, + embed_dim=384, + depth=12, + num_heads=12, + ratio=1, + use_checkpoint=False, + mlp_ratio=4, + qkv_bias=True, + drop_path_rate=0.1, + ), + keypoint_head=dict( + type='TopdownHeatmapSimpleHead', + in_channels=384, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process='default', + shift_heatmap=False, + target_type=target_type, + modulate_kernel=11, + use_udp=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=False, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='TopDownGenerateTarget', + sigma=2, + encoding='UDP', + target_type=target_type), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=4, + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + 
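+    # as above, testing runs on val2017; test_pipeline is identical to val_pipeline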
test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) + diff --git a/configs/pose3d/MB_ft_h36m.yaml b/configs/pose3d/MB_ft_h36m.yaml new file mode 100644 index 0000000..b52f5a2 --- /dev/null +++ b/configs/pose3d/MB_ft_h36m.yaml @@ -0,0 +1,50 @@ +# General +train_2d: False +no_eval: False +finetune: True +partial_train: null + +# Traning +epochs: 60 +checkpoint_frequency: 30 +batch_size: 32 +dropout: 0.0 +learning_rate: 0.0002 +weight_decay: 0.01 +lr_decay: 0.99 + +# Model +maxlen: 243 +dim_feat: 512 +mlp_ratio: 2 +depth: 5 +dim_rep: 512 +num_heads: 8 +att_fuse: True + +# Data +data_root: data/motion3d/MB3D_f243s81/ +subset_list: [H36M-SH] +dt_file: h36m_sh_conf_cam_source_final.pkl +clip_len: 243 +data_stride: 81 +rootrel: True +sample_stride: 1 +num_joints: 17 +no_conf: False +gt_2d: False + +# Loss +lambda_3d_velocity: 20.0 +lambda_scale: 0.5 +lambda_lv: 0.0 +lambda_lg: 0.0 +lambda_a: 0.0 +lambda_av: 0.0 + +# Augmentation +synthetic: False +flip: True +mask_ratio: 0. +mask_T_ratio: 0. +noise: False diff --git a/gafa_utils.py b/gafa_utils.py new file mode 100644 index 0000000..53345f7 --- /dev/null +++ b/gafa_utils.py @@ -0,0 +1,448 @@ + + +import numpy as np +import cv2 +import torch +import torch.nn.functional as F +from torchvision.transforms.functional import resize +import numpy as np +import torchvision.transforms.functional as TF +from torchvision import transforms +from albumentations.core.transforms_interface import DualTransform, to_tuple +import albumentations as A + +import torchvision.transforms as T +from torchvision.transforms import Compose as ComposeTransform + +import matplotlib.pyplot as plt + +from PIL import Image + +from utils import * + +MIN_CONF_THRESH = 0.3 +MIN_IDXS_COUNT = 50 + + +class SingleAttrTransform: + """ + Superclass for data transformation + """ + + def __init__(self, input_key, output_key): + self.input_keys = self._validate_key_arg(input_key) + self.output_keys = self._validate_key_arg(output_key) + if len(self.input_keys) != len(self.output_keys): + raise Exception( + f"len(input_keys) != len(output_keys): {len(self.input_keys)} != {len(self.output_keys)}" + ) + + def __call__(self, item): + """ + item: dictionary containing each variable in a dataset + """ + self.before_transform(item) + for in_key, out_key in zip(self.input_keys, self.output_keys): + input_seq = item[in_key] + item[out_key] = self.transform(input_seq) + return item + + def transform(self, input_seq): + raise NotImplementedError + + def before_transform(self, item): + return + + def _validate_key_arg(self, key_or_keys): + if isinstance(key_or_keys, str): + return [key_or_keys] + else: + return key_or_keys + + +class ImageTransform: + def __init__(self, img_key, transform): + self.img_key = img_key + self.transform = transform + + def __call__(self, item): + item[self.img_key] = self.transform(item[self.img_key]) + return item + +###################################### +############ Bounding Box ############## +##################################### +class ExpandBB(SingleAttrTransform): + """ + Expand or shurink the bounding box by multiplying specified arguments + """ + + def __init__(self, t, b, l, r, input_key="bb", output_key=None): + output_key = output_key or input_key + super().__init__(input_key, output_key) + self.t = t + self.b = b + self.l = l + self.r = r + + def transform(self, bb): + old_w, 
old_h = bb["w"], bb["h"] + old_u, old_v = bb["u"], bb["v"] + + lpad = int(old_w * self.l) + rpad = int(old_w * self.r) + tpad = int(old_h * self.t) + bpad = int(old_h * self.b) + + return { + "w": old_w + lpad + rpad, + "h": old_h + tpad + bpad, + "u": old_u - lpad, + "v": old_v - tpad, + } + +class SquareFromWidth(SingleAttrTransform): + """ + Expand or shurink the bounding box by multiplying specified arguments + """ + + def __init__(self, t, b, l, r, input_key="bb", output_key=None): + output_key = output_key or input_key + super().__init__(input_key, output_key) + self.t = t + self.b = b + self.l = l + self.r = r + + def transform(self, bb): + old_w, old_h = bb["w"], bb["h"] + old_u, old_v = bb["u"], bb["v"] + + lpad = 0 #int(old_w * self.l) + rpad = 0 #int(old_w * self.r) + tpad = 0 #int(old_h * self.t) + bpad = 0 #int(old_h * self.b) + + return { + "w": old_w + lpad + rpad, + "h": old_h + tpad + bpad, + "u": old_u - lpad, + "v": old_v - tpad, + } + + +class ExpandBBRect(SingleAttrTransform): + """ + Make bonding box rectangle. + """ + + def __init__(self, input_key="bb", output_key=None): + output_key = output_key or input_key + super().__init__(input_key, output_key) + + def transform(self, bb): + old_w, old_h = bb["w"], bb["h"] + old_u, old_v = bb["u"], bb["v"] + + if old_w <= old_h: + diff = old_h - old_w + lpad = diff // 2 + + return {"w": old_h, "h": old_h, "u": old_u - lpad, "v": old_v} + + if old_h < old_w: + diff = old_w - old_h + tpad = diff // 2 + + return {"w": old_w, "h": old_w, "u": old_u, "v": old_v - tpad} + + +class ReshapeBBRect(SingleAttrTransform): + """ + Crop or Expand the BB tp specified ratio + """ + + def __init__(self, img_ratio, input_key="bb", output_key=None): + output_key = output_key or input_key + super().__init__(input_key, output_key) + + assert len(img_ratio) == 2 + self.height = img_ratio[0] + self.width = img_ratio[1] + + def transform(self, bb): + old_w, old_h = bb["w"], bb["h"] + old_u, old_v = bb["u"], bb["v"] + + old_ratio = old_h / old_w + new_ratio = self.height / self.width + + # 縦が長すぎる場合 + if old_ratio > new_ratio: + diff = old_h - old_w * (self.height / self.width) + lpad = diff // 2 + + return {"w": old_w, "h": old_h - diff, "u": old_u, "v": old_v + lpad} + + # 横が長すぎる場合 + else: + diff = old_w - old_h * (self.width / self.height) + lpad = diff // 2 + + return {"w": old_w - diff, "h": old_h, "u": old_u + lpad, "v": old_v} + + +class CropBB: + def __init__(self, img_key="image", bb_key="bb", out_key="image"): + self.img_key = img_key + self.bb_key = bb_key + self.out_key = out_key + + def __call__(self, item): + # self._check_keys(item) + bb = item[self.bb_key] + item[self.out_key] = TF.crop( + item[self.img_key], top=int(bb["v"]), left=int(bb["u"]), height=int(bb["h"]), width=int(bb["w"]) + ) + return item + + +class KeypointsToBB: + def __init__(self, kp_indices): + if hasattr(kp_indices, "__iter__"): + kp_indices = list(kp_indices) + self.kp_indices = kp_indices + + def __call__(self, item): + out = {k: v for k, v in item.items()} + kp = item["keypoints"] + + kp = kp[self.kp_indices] + kp = kp[np.all(kp != 0, axis=1), :] + u, v = np.min(kp.astype(np.int64), axis=0) + umax, vmax = np.max(kp.astype(np.int64), axis=0) + out["bb"] = {"u": u, "v": v, "w": umax - u, "h": vmax - v} + return out + + + + +# define transforms +head_transform = ComposeTransform( + [ + # KeypointsToBB((0, 1, 15, 16, 17, 18)), + KeypointsToBB((0,1,2,3,4,5,6)), #coco17 corresponding + ExpandBB(0.85, -0.2, 0.1, 0.1, "bb"), + ExpandBBRect("bb"), + ] +) + +# define 
transforms +head_transform_rest = ComposeTransform( + [ + # KeypointsToBB((0, 1, 15, 16, 17, 18)), + KeypointsToBB((0,1,2,3,4,5,6)), #coco17 corresponding + ExpandBB(0.1, -0.2, 0.1, 0.1, "bb"), + ExpandBBRect("bb"), + ] +) + +# define transforms +head_transform_face = ComposeTransform( + [ + # KeypointsToBB((0, 1, 15, 16, 17, 18)), + KeypointsToBB((0,1,2,3,4)), #coco17 corresponding + ExpandBB(3.0, 2.5, 0.5, 0.5, "bb"), + # ExpandBBRect("bb"), + ] +) + + + +body_transform = ComposeTransform( + [ + KeypointsToBB(slice(None)), + ExpandBB(0.15, 0.05, 0.2, 0.2, "bb"), + ExpandBBRect("bb"), + ReshapeBBRect((256, 192)), + CropBB(bb_key="bb"), + ImageTransform( + "image", + T.Compose( + [ + T.Resize((256, 192)), + ] + ), + ), + ] +) + +body_transform_from_bb = ComposeTransform( + [ + ExpandBB(0.15, 0.05, 0.2, 0.2, "bb"), + ExpandBBRect("bb"), + ReshapeBBRect((256, 192)), + CropBB(bb_key="bb"), + ImageTransform( + "image", + T.Compose( + [ + T.Resize((256, 192)), + ] + ), + ), + ] +) + +normalize_img = A.Compose([ + A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + ]) + +normalize_img_torch = T.Compose([ + T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + ]) + +@timeit +def get_valid_ids(body_json): + #  count valid detections per idx to find the valid ones + idxs_count = {} + for det in body_json: + idx = det["idx"] + kpts = np.array(det["keypoints"]).reshape((-1, 3)) + if (kpts[:, 2] > MIN_CONF_THRESH).all(): + if idx in idxs_count.keys(): + idxs_count[idx] += 1 + else: + idxs_count[idx] = 1 + + valid_idxs = [] + for idx, count in idxs_count.items(): + if count > MIN_IDXS_COUNT: + valid_idxs.append(idx) + + return (valid_idxs) + +@timeit +def get_valid_frames_by_keys(valid_idxs, body_results): + out = {} + for idx in valid_idxs: + out[idx] = [] + + for det in body_results: + if det["idx"] in valid_idxs: + kpts = np.array(det["keypoints"]).reshape((-1, 3)) + if (kpts[:, 2] > MIN_CONF_THRESH).all(): + + # add the timestamp to the frame detection + date_str = det["image_id"].split(".")[0].split("_ts_")[-1] + date_format = '%Y_%m_%d_%H_%M_%S_%f' + timestamp = datetime.strptime(date_str, date_format) + det["timestamp"] = timestamp + + # check previous timestamp + if len(out[det["idx"]]) > 0: + last_ts = out[det["idx"]][-1]["timestamp"] + diff_ts = (timestamp - last_ts).total_seconds() + else: + diff_ts = 0 + + assert(diff_ts >= 0) + + # if diff_ts < 0.3: + # # add the frame detection to the output dic by idx + # out[det["idx"]].append(det) + # else: + # print(det["idx"], "Discard det because ts diff too high ({} > 0.2 s)".format(diff_ts), tag = "warning", tag_color = "yellow", color = "white") + + out[det["idx"]].append(det) + + return out + + + +@timeit +def get_inputs(f_i, valid_frames, n_frames): + + if f_i < n_frames: + # not enough past frames + return None, None, None, None, None, None + else: + imgs = torch.zeros((1, n_frames, 3, 256, 192)) + head_masks = torch.zeros((1, n_frames, 1, 256, 192)) + body_dvs = torch.zeros((1, n_frames, 2)) + + norm_body_center = np.zeros((n_frames, 2)) + + + sequences_ids = [f_i + off for off in range(-n_frames + 1, 1)] + image_ids = [] + print(sequences_ids) + for k, i in enumerate(sequences_ids): + seq_frame_i = valid_frames[i] + # load images + image_ids.append(seq_frame_i["image_id"]) + image_path = os.path.join(images_root, seq_frame_i["image_id"]) + img_org = Image.open(image_path) + kpts = np.array(seq_frame_i["keypoints"]).reshape((-1,3)) + assert((kpts[:,2] > MIN_CONF_THRESH).all()) + + item = { + "image": img_org, + 
"keypoints": kpts[:, :2], + } + + # get head bb in pixels + head_trans = head_transform(item) + head_bb = head_trans['bb'] + head_bb = np.array([head_bb['u'], head_bb['v'], head_bb['w'], head_bb['h']]).astype(np.float32) + + # get body bb in pixels + body_trans = body_transform(item) + body_bb = body_trans['bb'] + body_bb = np.array([body_bb['u'], body_bb['v'], body_bb['w'], body_bb['h']]) + body_image = np.array(body_trans['image']) + + # change head bb to relative to body bb + head_bb_abs = head_bb.copy() + + head_bb[0] -= body_bb[0] + head_bb[1] -= body_bb[1] + + head_bb[0] = head_bb[0] / body_bb[2] + head_bb[1] = head_bb[1] / body_bb[3] + head_bb[2] = head_bb[2] / body_bb[2] + head_bb[3] = head_bb[3] / body_bb[3] + + # store body center + norm_body_center[k,:] = (body_bb[[0, 1]] + body_bb[[2, 3]] / 2) / body_bb[[2,3]] + + # normalize image + img = normalize_img(image = body_image)['image'] + img = torch.from_numpy(img.transpose(2, 0, 1)) + + assert(img.shape[0] == 3) + assert(img.shape[1] == 256) + assert(img.shape[2] == 192) + + # create mask of head bounding box + head_mask = torch.zeros(1, img.shape[1], img.shape[2]) + head_bb_int = head_bb.copy() + head_bb_int[[0, 2]] *= img.shape[2] + head_bb_int[[1, 3]] *= img.shape[1] + head_bb_int[2] += head_bb_int[0] + head_bb_int[3] += head_bb_int[1] + head_bb_int = head_bb_int.astype(np.int64) + head_bb_int[head_bb_int < 0] = 0 + + print(head_bb, color = "red") + print(head_bb_int, color = "red") + head_mask[:, head_bb_int[1]:head_bb_int[3], head_bb_int[0]:head_bb_int[2]] = 1 + + # assign + head_masks[0, k, :, :, :] = head_mask + imgs[0, k, :, :, :] = img + + # compute dv + body_dvs[0, :, :] = torch.from_numpy(norm_body_center - np.roll(norm_body_center, shift=1, axis=0)) + + return imgs, head_masks, body_dvs, head_bb_abs, image_ids, body_bb \ No newline at end of file diff --git a/launch.sh b/launch.sh new file mode 100644 index 0000000..89d1e65 --- /dev/null +++ b/launch.sh @@ -0,0 +1,29 @@ +#!/bin/bash +IsRunning=`docker ps -f name=rgbd_detect | grep -c "rgbd_detect"`; +if [ $IsRunning -eq "0" ]; then + xhost +local:docker + docker run --rm \ + --gpus all \ + -e DISPLAY=$DISPLAY \ + -e XAUTHORITY=$XAUTHORITY \ + -e XDG_RUNTIME_DIR=$XDG_RUNTIME_DIR \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + -e 'QT_X11_NO_MITSHM=1' \ + -v /tmp/.X11-unix:/tmp/.X11-unix:rw \ + -v /tmp/docker_share:/tmp/docker_share \ + -v `pwd`:/workspace/rgbd_pose_and_depth \ + --ipc host \ + --device /dev/dri \ + --device /dev/snd \ + --device /dev/input \ + --device /dev/bus/usb \ + --privileged \ + --ulimit rtprio=99 \ + --net host \ + --name rgbd_detect \ + --entrypoint /bin/bash \ + -ti inria_docker:rgbd_detect +else + echo "Docker image is already running. Opening new terminal..."; + docker exec -ti rgbd_detect /bin/bash +fi \ No newline at end of file diff --git a/lib/data/augmentation.py b/lib/data/augmentation.py new file mode 100644 index 0000000..0818d64 --- /dev/null +++ b/lib/data/augmentation.py @@ -0,0 +1,99 @@ +import numpy as np +import os +import random +import torch +import copy +import torch.nn as nn +from lib.utils.tools import read_pkl +from lib.utils.utils_data import flip_data, crop_scale_3d + +class Augmenter2D(object): + """ + Make 2D augmentations on the fly. PyTorch batch-processing GPU version. 
+ """ + def __init__(self, args): + self.d2c_params = read_pkl(args.d2c_params_path) + self.noise = torch.load(args.noise_path) + self.mask_ratio = args.mask_ratio + self.mask_T_ratio = args.mask_T_ratio + self.num_Kframes = 27 + self.noise_std = 0.002 + + def dis2conf(self, dis, a, b, m, s): + f = a/(dis+a)+b*dis + shift = torch.randn(*dis.shape)*s + m + # if torch.cuda.is_available(): + shift = shift.to(dis.device) + return f + shift + + def add_noise(self, motion_2d): + a, b, m, s = self.d2c_params["a"], self.d2c_params["b"], self.d2c_params["m"], self.d2c_params["s"] + if "uniform_range" in self.noise.keys(): + uniform_range = self.noise["uniform_range"] + else: + uniform_range = 0.06 + motion_2d = motion_2d[:,:,:,:2] + batch_size = motion_2d.shape[0] + num_frames = motion_2d.shape[1] + num_joints = motion_2d.shape[2] + mean = self.noise['mean'].float() + std = self.noise['std'].float() + weight = self.noise['weight'][:,None].float() + sel = torch.rand((batch_size, self.num_Kframes, num_joints, 1)) + gaussian_sample = (torch.randn(batch_size, self.num_Kframes, num_joints, 2) * std + mean) + uniform_sample = (torch.rand((batch_size, self.num_Kframes, num_joints, 2))-0.5) * uniform_range + noise_mean = 0 + delta_noise = torch.randn(num_frames, num_joints, 2) * self.noise_std + noise_mean + # if torch.cuda.is_available(): + mean = mean.to(motion_2d.device) + std = std.to(motion_2d.device) + weight = weight.to(motion_2d.device) + gaussian_sample = gaussian_sample.to(motion_2d.device) + uniform_sample = uniform_sample.to(motion_2d.device) + sel = sel.to(motion_2d.device) + delta_noise = delta_noise.to(motion_2d.device) + + delta = gaussian_sample*(sel=weight) + delta_expand = torch.nn.functional.interpolate(delta.unsqueeze(1), [num_frames, num_joints, 2], mode='trilinear', align_corners=True)[:,0] + delta_final = delta_expand + delta_noise + motion_2d = motion_2d + delta_final + dx = delta_final[:,:,:,0] + dy = delta_final[:,:,:,1] + dis2 = dx*dx+dy*dy + dis = torch.sqrt(dis2) + conf = self.dis2conf(dis, a, b, m, s).clip(0,1).reshape([batch_size, num_frames, num_joints, -1]) + return torch.cat((motion_2d, conf), dim=3) + + def add_mask(self, x): + ''' motion_2d: (N,T,17,3) + ''' + N,T,J,C = x.shape + mask = torch.rand(N,T,J,1, dtype=x.dtype, device=x.device) > self.mask_ratio + mask_T = torch.rand(1,T,1,1, dtype=x.dtype, device=x.device) > self.mask_T_ratio + x = x * mask * mask_T + return x + + def augment2D(self, motion_2d, mask=False, noise=False): + if noise: + motion_2d = self.add_noise(motion_2d) + if mask: + motion_2d = self.add_mask(motion_2d) + return motion_2d + +class Augmenter3D(object): + """ + Make 3D augmentations when dataloaders get items. NumPy single motion version. 
+ """ + def __init__(self, args): + self.flip = args.flip + if hasattr(args, "scale_range_pretrain"): + self.scale_range_pretrain = args.scale_range_pretrain + else: + self.scale_range_pretrain = None + + def augment3D(self, motion_3d): + if self.scale_range_pretrain: + motion_3d = crop_scale_3d(motion_3d, self.scale_range_pretrain) + if self.flip and random.random()>0.5: + motion_3d = flip_data(motion_3d) + return motion_3d \ No newline at end of file diff --git a/lib/data/datareader_h36m.py b/lib/data/datareader_h36m.py new file mode 100644 index 0000000..b0f20b6 --- /dev/null +++ b/lib/data/datareader_h36m.py @@ -0,0 +1,136 @@ +# Adapted from Optimizing Network Structure for 3D Human Pose Estimation (ICCV 2019) (https://github.com/CHUNYUWANG/lcn-pose/blob/master/tools/data.py) + +import numpy as np +import os, sys +import random +import copy +from lib.utils.tools import read_pkl +from lib.utils.utils_data import split_clips +random.seed(0) + +class DataReaderH36M(object): + def __init__(self, n_frames, sample_stride, data_stride_train, data_stride_test, read_confidence=True, dt_root = 'data/motion3d', dt_file = 'h36m_cpn_cam_source.pkl'): + self.gt_trainset = None + self.gt_testset = None + self.split_id_train = None + self.split_id_test = None + self.test_hw = None + self.dt_dataset = read_pkl('%s/%s' % (dt_root, dt_file)) + self.n_frames = n_frames + self.sample_stride = sample_stride + self.data_stride_train = data_stride_train + self.data_stride_test = data_stride_test + self.read_confidence = read_confidence + + def read_2d(self): + trainset = self.dt_dataset['train']['joint_2d'][::self.sample_stride, :, :2].astype(np.float32) # [N, 17, 2] + testset = self.dt_dataset['test']['joint_2d'][::self.sample_stride, :, :2].astype(np.float32) # [N, 17, 2] + # map to [-1, 1] + for idx, camera_name in enumerate(self.dt_dataset['train']['camera_name']): + if camera_name == '54138969' or camera_name == '60457274': + res_w, res_h = 1000, 1002 + elif camera_name == '55011271' or camera_name == '58860488': + res_w, res_h = 1000, 1000 + else: + assert 0, '%d data item has an invalid camera name' % idx + trainset[idx, :, :] = trainset[idx, :, :] / res_w * 2 - [1, res_h / res_w] + for idx, camera_name in enumerate(self.dt_dataset['test']['camera_name']): + if camera_name == '54138969' or camera_name == '60457274': + res_w, res_h = 1000, 1002 + elif camera_name == '55011271' or camera_name == '58860488': + res_w, res_h = 1000, 1000 + else: + assert 0, '%d data item has an invalid camera name' % idx + testset[idx, :, :] = testset[idx, :, :] / res_w * 2 - [1, res_h / res_w] + if self.read_confidence: + if 'confidence' in self.dt_dataset['train'].keys(): + train_confidence = self.dt_dataset['train']['confidence'][::self.sample_stride].astype(np.float32) + test_confidence = self.dt_dataset['test']['confidence'][::self.sample_stride].astype(np.float32) + if len(train_confidence.shape)==2: # (1559752, 17) + train_confidence = train_confidence[:,:,None] + test_confidence = test_confidence[:,:,None] + else: + # No conf provided, fill with 1. 
+ train_confidence = np.ones(trainset.shape)[:,:,0:1] + test_confidence = np.ones(testset.shape)[:,:,0:1] + trainset = np.concatenate((trainset, train_confidence), axis=2) # [N, 17, 3] + testset = np.concatenate((testset, test_confidence), axis=2) # [N, 17, 3] + return trainset, testset + + def read_3d(self): + train_labels = self.dt_dataset['train']['joint3d_image'][::self.sample_stride, :, :3].astype(np.float32) # [N, 17, 3] + test_labels = self.dt_dataset['test']['joint3d_image'][::self.sample_stride, :, :3].astype(np.float32) # [N, 17, 3] + # map to [-1, 1] + for idx, camera_name in enumerate(self.dt_dataset['train']['camera_name']): + if camera_name == '54138969' or camera_name == '60457274': + res_w, res_h = 1000, 1002 + elif camera_name == '55011271' or camera_name == '58860488': + res_w, res_h = 1000, 1000 + else: + assert 0, '%d data item has an invalid camera name' % idx + train_labels[idx, :, :2] = train_labels[idx, :, :2] / res_w * 2 - [1, res_h / res_w] + train_labels[idx, :, 2:] = train_labels[idx, :, 2:] / res_w * 2 + + for idx, camera_name in enumerate(self.dt_dataset['test']['camera_name']): + if camera_name == '54138969' or camera_name == '60457274': + res_w, res_h = 1000, 1002 + elif camera_name == '55011271' or camera_name == '58860488': + res_w, res_h = 1000, 1000 + else: + assert 0, '%d data item has an invalid camera name' % idx + test_labels[idx, :, :2] = test_labels[idx, :, :2] / res_w * 2 - [1, res_h / res_w] + test_labels[idx, :, 2:] = test_labels[idx, :, 2:] / res_w * 2 + + return train_labels, test_labels + def read_hw(self): + if self.test_hw is not None: + return self.test_hw + test_hw = np.zeros((len(self.dt_dataset['test']['camera_name']), 2)) + for idx, camera_name in enumerate(self.dt_dataset['test']['camera_name']): + if camera_name == '54138969' or camera_name == '60457274': + res_w, res_h = 1000, 1002 + elif camera_name == '55011271' or camera_name == '58860488': + res_w, res_h = 1000, 1000 + else: + assert 0, '%d data item has an invalid camera name' % idx + test_hw[idx] = res_w, res_h + self.test_hw = test_hw + return test_hw + + def get_split_id(self): + if self.split_id_train is not None and self.split_id_test is not None: + return self.split_id_train, self.split_id_test + vid_list_train = self.dt_dataset['train']['source'][::self.sample_stride] # (1559752,) + vid_list_test = self.dt_dataset['test']['source'][::self.sample_stride] # (566920,) + self.split_id_train = split_clips(vid_list_train, self.n_frames, data_stride=self.data_stride_train) + self.split_id_test = split_clips(vid_list_test, self.n_frames, data_stride=self.data_stride_test) + return self.split_id_train, self.split_id_test + + def get_hw(self): +# Only Testset HW is needed for denormalization + test_hw = self.read_hw() # train_data (1559752, 2) test_data (566920, 2) + split_id_train, split_id_test = self.get_split_id() + test_hw = test_hw[split_id_test][:,0,:] # (N, 2) + return test_hw + + def get_sliced_data(self): + train_data, test_data = self.read_2d() # train_data (1559752, 17, 3) test_data (566920, 17, 3) + train_labels, test_labels = self.read_3d() # train_labels (1559752, 17, 3) test_labels (566920, 17, 3) + split_id_train, split_id_test = self.get_split_id() + train_data, test_data = train_data[split_id_train], test_data[split_id_test] # (N, 27, 17, 3) + train_labels, test_labels = train_labels[split_id_train], test_labels[split_id_test] # (N, 27, 17, 3) + # ipdb.set_trace() + return train_data, test_data, train_labels, test_labels + + def denormalize(self, test_data): +# 
data: (N, n_frames, 51) or data: (N, n_frames, 17, 3) + n_clips = test_data.shape[0] + test_hw = self.get_hw() + data = test_data.reshape([n_clips, -1, 17, 3]) + assert len(data) == len(test_hw) + # denormalize (x,y,z) coordiantes for results + for idx, item in enumerate(data): + res_w, res_h = test_hw[idx] + data[idx, :, :, :2] = (data[idx, :, :, :2] + np.array([1, res_h / res_w])) * res_w / 2 + data[idx, :, :, 2:] = data[idx, :, :, 2:] * res_w / 2 + return data # [n_clips, -1, 17, 3] diff --git a/lib/data/datareader_mesh.py b/lib/data/datareader_mesh.py new file mode 100644 index 0000000..7cb1e87 --- /dev/null +++ b/lib/data/datareader_mesh.py @@ -0,0 +1,59 @@ +import numpy as np +import os, sys +import copy +from lib.utils.tools import read_pkl +from lib.utils.utils_data import split_clips + +class DataReaderMesh(object): + def __init__(self, n_frames, sample_stride, data_stride_train, data_stride_test, read_confidence=True, dt_root = 'data/mesh', dt_file = 'pw3d_det.pkl', res=[1920, 1920]): + self.split_id_train = None + self.split_id_test = None + self.dt_dataset = read_pkl('%s/%s' % (dt_root, dt_file)) + self.n_frames = n_frames + self.sample_stride = sample_stride + self.data_stride_train = data_stride_train + self.data_stride_test = data_stride_test + self.read_confidence = read_confidence + self.res = res + + def read_2d(self): + if self.res is not None: + res_w, res_h = self.res + offset = [1, res_h / res_w] + else: + res = np.array(self.dt_dataset['train']['img_hw'])[::self.sample_stride].astype(np.float32) + res_w, res_h = res.max(1)[:, None, None], res.max(1)[:, None, None] + offset = 1 + trainset = self.dt_dataset['train']['joint_2d'][::self.sample_stride, :, :2].astype(np.float32) # [N, 17, 2] + testset = self.dt_dataset['test']['joint_2d'][::self.sample_stride, :, :2].astype(np.float32) # [N, 17, 2] + # res_w, res_h = self.res + trainset = trainset / res_w * 2 - offset + testset = testset / res_w * 2 - offset + if self.read_confidence: + train_confidence = self.dt_dataset['train']['confidence'][::self.sample_stride].astype(np.float32) + test_confidence = self.dt_dataset['test']['confidence'][::self.sample_stride].astype(np.float32) + if len(train_confidence.shape)==2: + train_confidence = train_confidence[:,:,None] + test_confidence = test_confidence[:,:,None] + trainset = np.concatenate((trainset, train_confidence), axis=2) # [N, 17, 3] + testset = np.concatenate((testset, test_confidence), axis=2) # [N, 17, 3] + return trainset, testset + + def get_split_id(self): + if self.split_id_train is not None and self.split_id_test is not None: + return self.split_id_train, self.split_id_test + vid_list_train = self.dt_dataset['train']['source'][::self.sample_stride] + vid_list_test = self.dt_dataset['test']['source'][::self.sample_stride] + self.split_id_train = split_clips(vid_list_train, self.n_frames, self.data_stride_train) + self.split_id_test = split_clips(vid_list_test, self.n_frames, self.data_stride_test) + return self.split_id_train, self.split_id_test + + def get_sliced_data(self): + train_data, test_data = self.read_2d() + train_labels, test_labels = self.read_3d() + split_id_train, split_id_test = self.get_split_id() + train_data, test_data = train_data[split_id_train], test_data[split_id_test] # (N, 27, 17, 3) + train_labels, test_labels = train_labels[split_id_train], test_labels[split_id_test] # (N, 27, 17, 3) + return train_data, test_data, train_labels, test_labels + + \ No newline at end of file diff --git a/lib/data/dataset_action.py 
b/lib/data/dataset_action.py new file mode 100644 index 0000000..87bc5de --- /dev/null +++ b/lib/data/dataset_action.py @@ -0,0 +1,206 @@ +import torch +import numpy as np +import os +import random +import copy +from torch.utils.data import Dataset, DataLoader +from lib.utils.utils_data import crop_scale, resample +from lib.utils.tools import read_pkl + +def get_action_names(file_path = "data/action/ntu_actions.txt"): + f = open(file_path, "r") + s = f.read() + actions = s.split('\n') + action_names = [] + for a in actions: + action_names.append(a.split('.')[1][1:]) + return action_names + +def make_cam(x, img_shape): + ''' + Input: x (M x T x V x C) + img_shape (height, width) + ''' + h, w = img_shape + if w >= h: + x_cam = x / w * 2 - 1 + else: + x_cam = x / h * 2 - 1 + return x_cam + +def coco2h36m(x): + ''' + Input: x (M x T x V x C) + + COCO: {0-nose 1-Leye 2-Reye 3-Lear 4Rear 5-Lsho 6-Rsho 7-Lelb 8-Relb 9-Lwri 10-Rwri 11-Lhip 12-Rhip 13-Lkne 14-Rkne 15-Lank 16-Rank} + + H36M: + 0: 'root', + 1: 'rhip', + 2: 'rkne', + 3: 'rank', + 4: 'lhip', + 5: 'lkne', + 6: 'lank', + 7: 'belly', + 8: 'neck', + 9: 'nose', + 10: 'head', + 11: 'lsho', + 12: 'lelb', + 13: 'lwri', + 14: 'rsho', + 15: 'relb', + 16: 'rwri' + ''' + y = np.zeros(x.shape) + y[:,:,0,:] = (x[:,:,11,:] + x[:,:,12,:]) * 0.5 + y[:,:,1,:] = x[:,:,12,:] + y[:,:,2,:] = x[:,:,14,:] + y[:,:,3,:] = x[:,:,16,:] + y[:,:,4,:] = x[:,:,11,:] + y[:,:,5,:] = x[:,:,13,:] + y[:,:,6,:] = x[:,:,15,:] + y[:,:,8,:] = (x[:,:,5,:] + x[:,:,6,:]) * 0.5 + y[:,:,7,:] = (y[:,:,0,:] + y[:,:,8,:]) * 0.5 + y[:,:,9,:] = x[:,:,0,:] + y[:,:,10,:] = (x[:,:,1,:] + x[:,:,2,:]) * 0.5 + y[:,:,11,:] = x[:,:,5,:] + y[:,:,12,:] = x[:,:,7,:] + y[:,:,13,:] = x[:,:,9,:] + y[:,:,14,:] = x[:,:,6,:] + y[:,:,15,:] = x[:,:,8,:] + y[:,:,16,:] = x[:,:,10,:] + return y + +def random_move(data_numpy, + angle_range=[-10., 10.], + scale_range=[0.9, 1.1], + transform_range=[-0.1, 0.1], + move_time_candidate=[1]): + data_numpy = np.transpose(data_numpy, (3,1,2,0)) # M,T,V,C-> C,T,V,M + C, T, V, M = data_numpy.shape + move_time = random.choice(move_time_candidate) + node = np.arange(0, T, T * 1.0 / move_time).round().astype(int) + node = np.append(node, T) + num_node = len(node) + A = np.random.uniform(angle_range[0], angle_range[1], num_node) + S = np.random.uniform(scale_range[0], scale_range[1], num_node) + T_x = np.random.uniform(transform_range[0], transform_range[1], num_node) + T_y = np.random.uniform(transform_range[0], transform_range[1], num_node) + a = np.zeros(T) + s = np.zeros(T) + t_x = np.zeros(T) + t_y = np.zeros(T) + # linspace + for i in range(num_node - 1): + a[node[i]:node[i + 1]] = np.linspace( + A[i], A[i + 1], node[i + 1] - node[i]) * np.pi / 180 + s[node[i]:node[i + 1]] = np.linspace(S[i], S[i + 1], node[i + 1] - node[i]) + t_x[node[i]:node[i + 1]] = np.linspace(T_x[i], T_x[i + 1], node[i + 1] - node[i]) + t_y[node[i]:node[i + 1]] = np.linspace(T_y[i], T_y[i + 1], node[i + 1] - node[i]) + theta = np.array([[np.cos(a) * s, -np.sin(a) * s], + [np.sin(a) * s, np.cos(a) * s]]) + # perform transformation + for i_frame in range(T): + xy = data_numpy[0:2, i_frame, :, :] + new_xy = np.dot(theta[:, :, i_frame], xy.reshape(2, -1)) + new_xy[0] += t_x[i_frame] + new_xy[1] += t_y[i_frame] + data_numpy[0:2, i_frame, :, :] = new_xy.reshape(2, V, M) + data_numpy = np.transpose(data_numpy, (3,1,2,0)) # C,T,V,M -> M,T,V,C + return data_numpy + +def human_tracking(x): + M, T = x.shape[:2] + if M==1: + return x + else: + diff0 = np.sum(np.linalg.norm(x[0,1:] - x[0,:-1], axis=-1), 
axis=-1) # (T-1, V, C) -> (T-1) + diff1 = np.sum(np.linalg.norm(x[0,1:] - x[1,:-1], axis=-1), axis=-1) + x_new = np.zeros(x.shape) + sel = np.cumsum(diff0 > diff1) % 2 + sel = sel[:,None,None] + x_new[0][0] = x[0][0] + x_new[1][0] = x[1][0] + x_new[0,1:] = x[1,1:] * sel + x[0,1:] * (1-sel) + x_new[1,1:] = x[0,1:] * sel + x[1,1:] * (1-sel) + return x_new + +class ActionDataset(Dataset): + def __init__(self, data_path, data_split, n_frames=243, random_move=True, scale_range=[1,1], check_split=True): # data_split: train/test etc. + np.random.seed(0) + dataset = read_pkl(data_path) + if check_split: + assert data_split in dataset['split'].keys() + self.split = dataset['split'][data_split] + annotations = dataset['annotations'] + self.random_move = random_move + self.is_train = "train" in data_split or (check_split==False) + if "oneshot" in data_split: + self.is_train = False + self.scale_range = scale_range + motions = [] + labels = [] + for sample in annotations: + if check_split and (not sample['frame_dir'] in self.split): + continue + resample_id = resample(ori_len=sample['total_frames'], target_len=n_frames, randomness=self.is_train) + motion_cam = make_cam(x=sample['keypoint'], img_shape=sample['img_shape']) + motion_cam = human_tracking(motion_cam) + motion_cam = coco2h36m(motion_cam) + motion_conf = sample['keypoint_score'][..., None] + motion = np.concatenate((motion_cam[:,resample_id], motion_conf[:,resample_id]), axis=-1) + if motion.shape[0]==1: # Single person, make a fake zero person + fake = np.zeros(motion.shape) + motion = np.concatenate((motion, fake), axis=0) + motions.append(motion.astype(np.float32)) + labels.append(sample['label']) + self.motions = np.array(motions) + self.labels = np.array(labels) + + def __len__(self): + 'Denotes the total number of samples' + return len(self.motions) + + def __getitem__(self, index): + raise NotImplementedError + +class NTURGBD(ActionDataset): + def __init__(self, data_path, data_split, n_frames=243, random_move=True, scale_range=[1,1]): + super(NTURGBD, self).__init__(data_path, data_split, n_frames, random_move, scale_range) + + def __getitem__(self, idx): + 'Generates one sample of data' + motion, label = self.motions[idx], self.labels[idx] # (M,T,J,C) + if self.random_move: + motion = random_move(motion) + if self.scale_range: + result = crop_scale(motion, scale_range=self.scale_range) + else: + result = motion + return result.astype(np.float32), label + +class NTURGBD1Shot(ActionDataset): + def __init__(self, data_path, data_split, n_frames=243, random_move=True, scale_range=[1,1], check_split=False): + super(NTURGBD1Shot, self).__init__(data_path, data_split, n_frames, random_move, scale_range, check_split) + oneshot_classes = [0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108, 114] + new_classes = set(range(120)) - set(oneshot_classes) + old2new = {} + for i, cid in enumerate(new_classes): + old2new[cid] = i + filtered = [not (x in oneshot_classes) for x in self.labels] + self.motions = self.motions[filtered] + filtered_labels = self.labels[filtered] + self.labels = [old2new[x] for x in filtered_labels] + + def __getitem__(self, idx): + 'Generates one sample of data' + motion, label = self.motions[idx], self.labels[idx] # (M,T,J,C) + if self.random_move: + motion = random_move(motion) + if self.scale_range: + result = crop_scale(motion, scale_range=self.scale_range) + else: + result = motion + return result.astype(np.float32), label \ No newline at end of file diff --git a/lib/data/dataset_mesh.py 
b/lib/data/dataset_mesh.py new file mode 100644 index 0000000..c496a3a --- /dev/null +++ b/lib/data/dataset_mesh.py @@ -0,0 +1,97 @@ +import torch +import numpy as np +import glob +import os +import io +import random +import pickle +from torch.utils.data import Dataset, DataLoader +from lib.data.augmentation import Augmenter3D +from lib.utils.tools import read_pkl +from lib.utils.utils_data import flip_data, crop_scale +from lib.utils.utils_mesh import flip_thetas +from lib.utils.utils_smpl import SMPL +from torch.utils.data import Dataset, DataLoader +from lib.data.datareader_h36m import DataReaderH36M +from lib.data.datareader_mesh import DataReaderMesh +from lib.data.dataset_action import random_move + +class SMPLDataset(Dataset): + def __init__(self, args, data_split, dataset): # data_split: train/test; dataset: h36m, coco, pw3d + random.seed(0) + np.random.seed(0) + self.clip_len = args.clip_len + self.data_split = data_split + if dataset=="h36m": + datareader = DataReaderH36M(n_frames=self.clip_len, sample_stride=args.sample_stride, data_stride_train=args.data_stride, data_stride_test=self.clip_len, dt_root=args.data_root, dt_file=args.dt_file_h36m) + elif dataset=="coco": + datareader = DataReaderMesh(n_frames=1, sample_stride=args.sample_stride, data_stride_train=1, data_stride_test=1, dt_root=args.data_root, dt_file=args.dt_file_coco, res=[640, 640]) + elif dataset=="pw3d": + datareader = DataReaderMesh(n_frames=self.clip_len, sample_stride=args.sample_stride, data_stride_train=args.data_stride, data_stride_test=self.clip_len, dt_root=args.data_root, dt_file=args.dt_file_pw3d, res=[1920, 1920]) + else: + raise Exception("Mesh dataset undefined.") + + split_id_train, split_id_test = datareader.get_split_id() # Index of clips + train_data, test_data = datareader.read_2d() + train_data, test_data = train_data[split_id_train], test_data[split_id_test] # Input: (N, T, 17, 3) + self.motion_2d = {'train': train_data, 'test': test_data}[data_split] + + dt = datareader.dt_dataset + smpl_pose_train = dt['train']['smpl_pose'][split_id_train] # (N, T, 72) + smpl_shape_train = dt['train']['smpl_shape'][split_id_train] # (N, T, 10) + smpl_pose_test = dt['test']['smpl_pose'][split_id_test] # (N, T, 72) + smpl_shape_test = dt['test']['smpl_shape'][split_id_test] # (N, T, 10) + + self.motion_smpl_3d = {'train': {'pose': smpl_pose_train, 'shape': smpl_shape_train}, 'test': {'pose': smpl_pose_test, 'shape': smpl_shape_test}}[data_split] + self.smpl = SMPL( + args.data_root, + batch_size=1, + ) + + def __len__(self): + 'Denotes the total number of samples' + return len(self.motion_2d) + + def __getitem__(self, index): + raise NotImplementedError + +class MotionSMPL(SMPLDataset): + def __init__(self, args, data_split, dataset): + super(MotionSMPL, self).__init__(args, data_split, dataset) + self.flip = args.flip + + def __getitem__(self, index): + 'Generates one sample of data' + # Select sample + motion_2d = self.motion_2d[index] # motion_2d: (T,17,3) + motion_2d[:,:,2] = np.clip(motion_2d[:,:,2], 0, 1) + motion_smpl_pose = self.motion_smpl_3d['pose'][index].reshape(-1, 24, 3) # motion_smpl_3d: (T, 24, 3) + motion_smpl_shape = self.motion_smpl_3d['shape'][index] # motion_smpl_3d: (T,10) + + if self.data_split=="train": + if self.flip and random.random() > 0.5: # Training augmentation - random flipping + motion_2d = flip_data(motion_2d) + motion_smpl_pose = flip_thetas(motion_smpl_pose) + + + motion_smpl_pose = torch.from_numpy(motion_smpl_pose).reshape(-1, 72).float() + motion_smpl_shape = 
torch.from_numpy(motion_smpl_shape).reshape(-1, 10).float() + motion_smpl = self.smpl( + betas=motion_smpl_shape, + body_pose=motion_smpl_pose[:, 3:], + global_orient=motion_smpl_pose[:, :3], + pose2rot=True + ) + motion_verts = motion_smpl.vertices.detach()*1000.0 + J_regressor = self.smpl.J_regressor_h36m + J_regressor_batch = J_regressor[None, :].expand(motion_verts.shape[0], -1, -1).to(motion_verts.device) + motion_3d_reg = torch.matmul(J_regressor_batch, motion_verts) # motion_3d: (T,17,3) + motion_verts = motion_verts - motion_3d_reg[:, :1, :] + motion_3d_reg = motion_3d_reg - motion_3d_reg[:, :1, :] # motion_3d: (T,17,3) + motion_theta = torch.cat((motion_smpl_pose, motion_smpl_shape), -1) + motion_smpl_3d = { + 'theta': motion_theta, # smpl pose and shape + 'kp_3d': motion_3d_reg, # 3D keypoints + 'verts': motion_verts, # 3D mesh vertices + } + return motion_2d, motion_smpl_3d \ No newline at end of file diff --git a/lib/data/dataset_motion_2d.py b/lib/data/dataset_motion_2d.py new file mode 100644 index 0000000..b136f33 --- /dev/null +++ b/lib/data/dataset_motion_2d.py @@ -0,0 +1,148 @@ +import sys +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import Dataset, DataLoader +import numpy as np +import os +import random +import copy +import json +from collections import defaultdict +from lib.utils.utils_data import crop_scale, flip_data, resample, split_clips + +def posetrack2h36m(x): + ''' + Input: x (T x V x C) + + PoseTrack keypoints = [ 'nose', + 'head_bottom', + 'head_top', + 'left_ear', + 'right_ear', + 'left_shoulder', + 'right_shoulder', + 'left_elbow', + 'right_elbow', + 'left_wrist', + 'right_wrist', + 'left_hip', + 'right_hip', + 'left_knee', + 'right_knee', + 'left_ankle', + 'right_ankle'] + H36M: + 0: 'root', + 1: 'rhip', + 2: 'rkne', + 3: 'rank', + 4: 'lhip', + 5: 'lkne', + 6: 'lank', + 7: 'belly', + 8: 'neck', + 9: 'nose', + 10: 'head', + 11: 'lsho', + 12: 'lelb', + 13: 'lwri', + 14: 'rsho', + 15: 'relb', + 16: 'rwri' + ''' + y = np.zeros(x.shape) + y[:,0,:] = (x[:,11,:] + x[:,12,:]) * 0.5 + y[:,1,:] = x[:,12,:] + y[:,2,:] = x[:,14,:] + y[:,3,:] = x[:,16,:] + y[:,4,:] = x[:,11,:] + y[:,5,:] = x[:,13,:] + y[:,6,:] = x[:,15,:] + y[:,8,:] = x[:,1,:] + y[:,7,:] = (y[:,0,:] + y[:,8,:]) * 0.5 + y[:,9,:] = x[:,0,:] + y[:,10,:] = x[:,2,:] + y[:,11,:] = x[:,5,:] + y[:,12,:] = x[:,7,:] + y[:,13,:] = x[:,9,:] + y[:,14,:] = x[:,6,:] + y[:,15,:] = x[:,8,:] + y[:,16,:] = x[:,10,:] + y[:,0,2] = np.minimum(x[:,11,2], x[:,12,2]) + y[:,7,2] = np.minimum(y[:,0,2], y[:,8,2]) + return y + + +class PoseTrackDataset2D(Dataset): + def __init__(self, flip=True, scale_range=[0.25, 1]): + super(PoseTrackDataset2D, self).__init__() + self.flip = flip + data_root = "data/motion2d/posetrack18_annotations/train/" + file_list = sorted(os.listdir(data_root)) + all_motions = [] + all_motions_filtered = [] + self.scale_range = scale_range + for filename in file_list: + with open(os.path.join(data_root, filename), 'r') as file: + json_dict = json.load(file) + annots = json_dict['annotations'] + imgs = json_dict['images'] + motions = defaultdict(list) + for annot in annots: + tid = annot['track_id'] + pose2d = np.array(annot['keypoints']).reshape(-1,3) + motions[tid].append(pose2d) + all_motions += list(motions.values()) + for motion in all_motions: + if len(motion)<30: + continue + motion = np.array(motion[:30]) + if np.sum(motion[:,:,2]) <= 306: # Valid joint num threshold + continue + motion = crop_scale(motion, self.scale_range) + motion = 
posetrack2h36m(motion) + motion[motion[:,:,2]==0] = 0 + if np.sum(motion[:,0,2]) < 30: + continue # Root all visible (needed for framewise rootrel) + all_motions_filtered.append(motion) + all_motions_filtered = np.array(all_motions_filtered) + self.motions_2d = all_motions_filtered + + def __len__(self): + 'Denotes the total number of samples' + return len(self.motions_2d) + + def __getitem__(self, index): + 'Generates one sample of data' + motion_2d = torch.FloatTensor(self.motions_2d[index]) + if self.flip and random.random()>0.5: + motion_2d = flip_data(motion_2d) + return motion_2d, motion_2d + +class InstaVDataset2D(Dataset): + def __init__(self, n_frames=81, data_stride=27, flip=True, valid_threshold=0.0, scale_range=[0.25, 1]): + super(InstaVDataset2D, self).__init__() + self.flip = flip + self.scale_range = scale_range + motion_all = np.load('data/motion2d/InstaVariety/motion_all.npy') + id_all = np.load('data/motion2d/InstaVariety/id_all.npy') + split_id = split_clips(id_all, n_frames, data_stride) + motions_2d = motion_all[split_id] # [N, T, 17, 3] + valid_idx = (motions_2d[:,0,0,2] > valid_threshold) + self.motions_2d = motions_2d[valid_idx] + + def __len__(self): + 'Denotes the total number of samples' + return len(self.motions_2d) + + def __getitem__(self, index): + 'Generates one sample of data' + motion_2d = self.motions_2d[index] + motion_2d = crop_scale(motion_2d, self.scale_range) + motion_2d[motion_2d[:,:,2]==0] = 0 + if self.flip and random.random()>0.5: + motion_2d = flip_data(motion_2d) + motion_2d = torch.FloatTensor(motion_2d) + return motion_2d, motion_2d + \ No newline at end of file diff --git a/lib/data/dataset_motion_3d.py b/lib/data/dataset_motion_3d.py new file mode 100644 index 0000000..a2de10d --- /dev/null +++ b/lib/data/dataset_motion_3d.py @@ -0,0 +1,68 @@ +import torch +import numpy as np +import glob +import os +import io +import random +import pickle +from torch.utils.data import Dataset, DataLoader +from lib.data.augmentation import Augmenter3D +from lib.utils.tools import read_pkl +from lib.utils.utils_data import flip_data + +class MotionDataset(Dataset): + def __init__(self, args, subset_list, data_split): # data_split: train/test + np.random.seed(0) + self.data_root = args.data_root + self.subset_list = subset_list + self.data_split = data_split + file_list_all = [] + for subset in self.subset_list: + data_path = os.path.join(self.data_root, subset, self.data_split) + motion_list = sorted(os.listdir(data_path)) + for i in motion_list: + file_list_all.append(os.path.join(data_path, i)) + self.file_list = file_list_all + + def __len__(self): + 'Denotes the total number of samples' + return len(self.file_list) + + def __getitem__(self, index): + raise NotImplementedError + +class MotionDataset3D(MotionDataset): + def __init__(self, args, subset_list, data_split): + super(MotionDataset3D, self).__init__(args, subset_list, data_split) + self.flip = args.flip + self.synthetic = args.synthetic + self.aug = Augmenter3D(args) + self.gt_2d = args.gt_2d + + def __getitem__(self, index): + 'Generates one sample of data' + # Select sample + file_path = self.file_list[index] + motion_file = read_pkl(file_path) + motion_3d = motion_file["data_label"] + if self.data_split=="train": + if self.synthetic or self.gt_2d: + motion_3d = self.aug.augment3D(motion_3d) + motion_2d = np.zeros(motion_3d.shape, dtype=np.float32) + motion_2d[:,:,:2] = motion_3d[:,:,:2] + motion_2d[:,:,2] = 1 # No 2D detection, use GT xy and c=1. 
+ elif motion_file["data_input"] is not None: # Have 2D detection + motion_2d = motion_file["data_input"] + if self.flip and random.random() > 0.5: # Training augmentation - random flipping + motion_2d = flip_data(motion_2d) + motion_3d = flip_data(motion_3d) + else: + raise ValueError('Training illegal.') + elif self.data_split=="test": + motion_2d = motion_file["data_input"] + if self.gt_2d: + motion_2d[:,:,:2] = motion_3d[:,:,:2] + motion_2d[:,:,2] = 1 + else: + raise ValueError('Data split unknown.') + return torch.FloatTensor(motion_2d), torch.FloatTensor(motion_3d) \ No newline at end of file diff --git a/lib/data/dataset_wild.py b/lib/data/dataset_wild.py new file mode 100644 index 0000000..1176b4e --- /dev/null +++ b/lib/data/dataset_wild.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- +# @Author: Raphael +# @Date: 2024-10-09 11:02:29 +# @Last Modified by: Raphael +# @Last Modified time: 2024-10-14 15:26:52 +import torch +import numpy as np +import ipdb +import glob +import os +import io +import math +import random +import json +import pickle +import math +from torch.utils.data import Dataset, DataLoader +from lib.utils.utils_data import crop_scale + +def halpe2h36m(x): + ''' + Input: x (T x V x C) + //Halpe 26 body keypoints + {0, "Nose"}, + {1, "LEye"}, + {2, "REye"}, + {3, "LEar"}, + {4, "REar"}, + {5, "LShoulder"}, + {6, "RShoulder"}, + {7, "LElbow"}, + {8, "RElbow"}, + {9, "LWrist"}, + {10, "RWrist"}, + {11, "LHip"}, + {12, "RHip"}, + {13, "LKnee"}, + {14, "Rknee"}, + {15, "LAnkle"}, + {16, "RAnkle"}, + {17, "Head"}, + {18, "Neck"}, + {19, "Hip"}, + {20, "LBigToe"}, + {21, "RBigToe"}, + {22, "LSmallToe"}, + {23, "RSmallToe"}, + {24, "LHeel"}, + {25, "RHeel"}, + ''' + T, V, C = x.shape + y = np.zeros([T,17,C]) + y[:,0,:] = x[:,19,:] + y[:,1,:] = x[:,12,:] + y[:,2,:] = x[:,14,:] + y[:,3,:] = x[:,16,:] + y[:,4,:] = x[:,11,:] + y[:,5,:] = x[:,13,:] + y[:,6,:] = x[:,15,:] + y[:,7,:] = (x[:,18,:] + x[:,19,:]) * 0.5 + y[:,8,:] = x[:,18,:] + y[:,9,:] = x[:,0,:] + y[:,10,:] = x[:,17,:] + y[:,11,:] = x[:,5,:] + y[:,12,:] = x[:,7,:] + y[:,13,:] = x[:,9,:] + y[:,14,:] = x[:,6,:] + y[:,15,:] = x[:,8,:] + y[:,16,:] = x[:,10,:] + return y + + +def coco2h36m(x): + ''' + Input: x (M x T x V x C) + + COCO: {0-nose 1-Leye 2-Reye 3-Lear 4Rear 5-Lsho 6-Rsho 7-Lelb 8-Relb 9-Lwri 10-Rwri 11-Lhip 12-Rhip 13-Lkne 14-Rkne 15-Lank 16-Rank} + + H36M: + 0: 'root', + 1: 'rhip', + 2: 'rkne', + 3: 'rank', + 4: 'lhip', + 5: 'lkne', + 6: 'lank', + 7: 'belly', + 8: 'neck', + 9: 'nose', + 10: 'head', + 11: 'lsho', + 12: 'lelb', + 13: 'lwri', + 14: 'rsho', + 15: 'relb', + 16: 'rwri' + ''' + y = np.zeros(x.shape) + y[:,0,:] = (x[:,11,:] + x[:,12,:]) * 0.5 + y[:,1,:] = x[:,12,:] + y[:,2,:] = x[:,14,:] + y[:,3,:] = x[:,16,:] + y[:,4,:] = x[:,11,:] + y[:,5,:] = x[:,13,:] + y[:,6,:] = x[:,15,:] + y[:,8,:] = (x[:,5,:] + x[:,6,:]) * 0.5 + y[:,7,:] = (y[:,0,:] + y[:,8,:]) * 0.5 + y[:,9,:] = x[:,0,:] + y[:,10,:] = (x[:,1,:] + x[:,2,:]) * 0.5 + y[:,11,:] = x[:,5,:] + y[:,12,:] = x[:,7,:] + y[:,13,:] = x[:,9,:] + y[:,14,:] = x[:,6,:] + y[:,15,:] = x[:,8,:] + y[:,16,:] = x[:,10,:] + return y + + +def read_input(json_path, vid_size, scale_range, focus): + with open(json_path, "r") as read_file: + results = json.load(read_file) + kpts_all = [] + image_ids = [] + kpts_3d_all = [] + for item in results: + if focus!=None and item['idx']!=focus: + continue + kpts = np.array(item['keypoints']).reshape([-1,3]) + kpts_all.append(kpts) + image_ids.append(item["image_id"]) + if "keypoints_3d" in item.keys(): + kpts_3d = 
np.array(item['keypoints_3d']).reshape([-1,3]) + kpts_3d_all.append(kpts_3d) + + kpts_all = np.array(kpts_all) + kpts_3d_all = np.array(kpts_3d_all) + + print(kpts_all.shape) + + if kpts_all.shape[1] == 26: + kpts_all = halpe2h36m(kpts_all) + if len(kpts_3d_all) > 0: + assert(kpts_3d_all.shape[1] == 26) + kpts_3d_all = halpe2h36m(kpts_3d_all) + + elif kpts_all.shape[1] == 17: + print("WARNING : Using COCO17 input !") + kpts_all = coco2h36m(kpts_all) + if len(kpts_3d_all) > 0: + assert(kpts_3d_all.shape[1] == 17) + kpts_3d_all = coco2h36m(kpts_3d_all) + else: + print("Error, expecting kpts_all of shape [..., 17 or 26, ...]") + exit(0) + + + if vid_size: + w, h = vid_size + scale = min(w,h) / 2.0 + kpts_all[:,:,:2] = kpts_all[:,:,:2] - np.array([w, h]) / 2.0 + kpts_all[:,:,:2] = kpts_all[:,:,:2] / scale + motion = kpts_all + + if scale_range: + motion = crop_scale(kpts_all, scale_range) + + motion_3d = kpts_3d_all.astype(np.float32) + + return motion.astype(np.float32), image_ids, motion_3d + +class WildDetDataset(Dataset): + def __init__(self, json_path, clip_len=243, vid_size=None, scale_range=None, focus=None): + self.json_path = json_path + self.clip_len = clip_len + self.vid_all, self.image_ids, self.motion_3d = read_input(json_path, vid_size, scale_range, focus) + + def __len__(self): + 'Denotes the total number of samples' + return math.ceil(len(self.vid_all) / self.clip_len) + + def __getitem__(self, index): + 'Generates one sample of data' + st = index*self.clip_len + end = min((index+1)*self.clip_len, len(self.vid_all)) + return self.vid_all[st:end] \ No newline at end of file diff --git a/lib/model/DSTformer.py b/lib/model/DSTformer.py new file mode 100644 index 0000000..2af2388 --- /dev/null +++ b/lib/model/DSTformer.py @@ -0,0 +1,362 @@ +import torch +import torch.nn as nn +import math +import warnings +import random +import numpy as np +from collections import OrderedDict +from functools import partial +from itertools import repeat +from lib.model.drop import DropPath + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + # type: (Tensor, float, float, float, float) -> Tensor + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. 
The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +class MLP(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., st_mode='vanilla'): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim ** -0.5 + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.mode = st_mode + if self.mode == 'parallel': + self.ts_attn = nn.Linear(dim*2, dim*2) + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + else: + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.proj_drop = nn.Dropout(proj_drop) + + self.attn_count_s = None + self.attn_count_t = None + + def forward(self, x, seqlen=1): + B, N, C = x.shape + + if self.mode == 'series': + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + x = self.forward_spatial(q, k, v) + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + x = self.forward_temporal(q, k, v, seqlen=seqlen) + elif self.mode == 'parallel': + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + x_t = self.forward_temporal(q, k, v, seqlen=seqlen) + x_s = self.forward_spatial(q, k, v) + + alpha = torch.cat([x_s, x_t], dim=-1) + alpha = alpha.mean(dim=1, keepdim=True) + alpha = self.ts_attn(alpha).reshape(B, 1, C, 2) + alpha = alpha.softmax(dim=-1) + x = x_t * alpha[:,:,:,1] + x_s * alpha[:,:,:,0] + elif self.mode == 'coupling': + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + x = self.forward_coupling(q, k, v, seqlen=seqlen) + elif self.mode == 'vanilla': + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + x = self.forward_spatial(q, k, v) + elif self.mode == 
'temporal': + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + x = self.forward_temporal(q, k, v, seqlen=seqlen) + elif self.mode == 'spatial': + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + x = self.forward_spatial(q, k, v) + else: + raise NotImplementedError(self.mode) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def reshape_T(self, x, seqlen=1, inverse=False): + if not inverse: + N, C = x.shape[-2:] + x = x.reshape(-1, seqlen, self.num_heads, N, C).transpose(1,2) + x = x.reshape(-1, self.num_heads, seqlen*N, C) #(B, H, TN, c) + else: + TN, C = x.shape[-2:] + x = x.reshape(-1, self.num_heads, seqlen, TN // seqlen, C).transpose(1,2) + x = x.reshape(-1, self.num_heads, TN // seqlen, C) #(BT, H, N, C) + return x + + def forward_coupling(self, q, k, v, seqlen=8): + BT, _, N, C = q.shape + q = self.reshape_T(q, seqlen) + k = self.reshape_T(k, seqlen) + v = self.reshape_T(v, seqlen) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = attn @ v + x = self.reshape_T(x, seqlen, inverse=True) + x = x.transpose(1,2).reshape(BT, N, C*self.num_heads) + return x + + def forward_spatial(self, q, k, v): + B, _, N, C = q.shape + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = attn @ v + x = x.transpose(1,2).reshape(B, N, C*self.num_heads) + return x + + def forward_temporal(self, q, k, v, seqlen=8): + B, _, N, C = q.shape + qt = q.reshape(-1, seqlen, self.num_heads, N, C).permute(0, 2, 3, 1, 4) #(B, H, N, T, C) + kt = k.reshape(-1, seqlen, self.num_heads, N, C).permute(0, 2, 3, 1, 4) #(B, H, N, T, C) + vt = v.reshape(-1, seqlen, self.num_heads, N, C).permute(0, 2, 3, 1, 4) #(B, H, N, T, C) + + attn = (qt @ kt.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = attn @ vt #(B, H, N, T, C) + x = x.permute(0, 3, 2, 1, 4).reshape(B, N, C*self.num_heads) + return x + + def count_attn(self, attn): + attn = attn.detach().cpu().numpy() + attn = attn.mean(axis=1) + attn_t = attn[:, :, 1].mean(axis=1) + attn_s = attn[:, :, 0].mean(axis=1) + if self.attn_count_s is None: + self.attn_count_s = attn_s + self.attn_count_t = attn_t + else: + self.attn_count_s = np.concatenate([self.attn_count_s, attn_s], axis=0) + self.attn_count_t = np.concatenate([self.attn_count_t, attn_t], axis=0) + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., mlp_out_ratio=1., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, st_mode='stage_st', att_fuse=False): + super().__init__() + # assert 'stage' in st_mode + self.st_mode = st_mode + self.norm1_s = norm_layer(dim) + self.norm1_t = norm_layer(dim) + self.attn_s = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, st_mode="spatial") + self.attn_t = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, st_mode="temporal") + + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2_s = norm_layer(dim) + self.norm2_t = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + mlp_out_dim = int(dim * mlp_out_ratio) + self.mlp_s = MLP(in_features=dim, hidden_features=mlp_hidden_dim, out_features=mlp_out_dim, act_layer=act_layer, drop=drop) + self.mlp_t = MLP(in_features=dim, hidden_features=mlp_hidden_dim, out_features=mlp_out_dim, act_layer=act_layer, drop=drop) + self.att_fuse = att_fuse + if self.att_fuse: + self.ts_attn = nn.Linear(dim*2, dim*2) + def forward(self, x, seqlen=1): + if self.st_mode=='stage_st': + x = x + self.drop_path(self.attn_s(self.norm1_s(x), seqlen)) + x = x + self.drop_path(self.mlp_s(self.norm2_s(x))) + x = x + self.drop_path(self.attn_t(self.norm1_t(x), seqlen)) + x = x + self.drop_path(self.mlp_t(self.norm2_t(x))) + elif self.st_mode=='stage_ts': + x = x + self.drop_path(self.attn_t(self.norm1_t(x), seqlen)) + x = x + self.drop_path(self.mlp_t(self.norm2_t(x))) + x = x + self.drop_path(self.attn_s(self.norm1_s(x), seqlen)) + x = x + self.drop_path(self.mlp_s(self.norm2_s(x))) + elif self.st_mode=='stage_para': + x_t = x + self.drop_path(self.attn_t(self.norm1_t(x), seqlen)) + x_t = x_t + self.drop_path(self.mlp_t(self.norm2_t(x_t))) + x_s = x + self.drop_path(self.attn_s(self.norm1_s(x), seqlen)) + x_s = x_s + self.drop_path(self.mlp_s(self.norm2_s(x_s))) + if self.att_fuse: + # x_s, x_t: [BF, J, dim] + alpha = torch.cat([x_s, x_t], dim=-1) + BF, J = alpha.shape[:2] + # alpha = alpha.mean(dim=1, keepdim=True) + alpha = self.ts_attn(alpha).reshape(BF, J, -1, 2) + alpha = alpha.softmax(dim=-1) + x = x_t * alpha[:,:,:,1] + x_s * alpha[:,:,:,0] + else: + x = (x_s + x_t)*0.5 + else: + raise NotImplementedError(self.st_mode) + return x + +class DSTformer(nn.Module): + def __init__(self, dim_in=3, dim_out=3, dim_feat=256, dim_rep=512, + depth=5, num_heads=8, mlp_ratio=4, + num_joints=17, maxlen=243, + qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm, att_fuse=True): + super().__init__() + self.dim_out = dim_out + self.dim_feat = dim_feat + self.joints_embed = nn.Linear(dim_in, dim_feat) + self.pos_drop = nn.Dropout(p=drop_rate) + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks_st = nn.ModuleList([ + Block( + dim=dim_feat, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + st_mode="stage_st") + for i in range(depth)]) + self.blocks_ts = nn.ModuleList([ + Block( + dim=dim_feat, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + st_mode="stage_ts") + for i in range(depth)]) + self.norm = norm_layer(dim_feat) + if dim_rep: + self.pre_logits = nn.Sequential(OrderedDict([ + ('fc', nn.Linear(dim_feat, dim_rep)), + ('act', nn.Tanh()) + ])) + else: + self.pre_logits = nn.Identity() + self.head = nn.Linear(dim_rep, dim_out) if dim_out > 0 else nn.Identity() + self.temp_embed = nn.Parameter(torch.zeros(1, maxlen, 1, dim_feat)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_joints, dim_feat)) + trunc_normal_(self.temp_embed, std=.02) + trunc_normal_(self.pos_embed, std=.02) + self.apply(self._init_weights) + self.att_fuse = att_fuse + if self.att_fuse: + self.ts_attn = nn.ModuleList([nn.Linear(dim_feat*2, 2) for i in range(depth)]) + for i in range(depth): + 
self.ts_attn[i].weight.data.fill_(0) + self.ts_attn[i].bias.data.fill_(0.5) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def get_classifier(self): + return self.head + + def reset_classifier(self, dim_out, global_pool=''): + self.dim_out = dim_out + self.head = nn.Linear(self.dim_feat, dim_out) if dim_out > 0 else nn.Identity() + + def forward(self, x, return_rep=False): + B, F, J, C = x.shape + x = x.reshape(-1, J, C) + BF = x.shape[0] + x = self.joints_embed(x) + x = x + self.pos_embed + _, J, C = x.shape + x = x.reshape(-1, F, J, C) + self.temp_embed[:,:F,:,:] + x = x.reshape(BF, J, C) + x = self.pos_drop(x) + alphas = [] + for idx, (blk_st, blk_ts) in enumerate(zip(self.blocks_st, self.blocks_ts)): + x_st = blk_st(x, F) + x_ts = blk_ts(x, F) + if self.att_fuse: + att = self.ts_attn[idx] + alpha = torch.cat([x_st, x_ts], dim=-1) + BF, J = alpha.shape[:2] + alpha = att(alpha) + alpha = alpha.softmax(dim=-1) + x = x_st * alpha[:,:,0:1] + x_ts * alpha[:,:,1:2] + else: + x = (x_st + x_ts)*0.5 + x = self.norm(x) + x = x.reshape(B, F, J, -1) + x = self.pre_logits(x) # [B, F, J, dim_feat] + if return_rep: + return x + x = self.head(x) + return x + + def get_representation(self, x): + return self.forward(x, return_rep=True) + \ No newline at end of file diff --git a/lib/model/drop.py b/lib/model/drop.py new file mode 100644 index 0000000..efbed35 --- /dev/null +++ b/lib/model/drop.py @@ -0,0 +1,43 @@ +""" DropBlock, DropPath +PyTorch implementations of DropBlock and DropPath (Stochastic Depth) regularization layers. +Papers: +DropBlock: A regularization method for convolutional networks (https://arxiv.org/abs/1810.12890) +Deep Networks with Stochastic Depth (https://arxiv.org/abs/1603.09382) +Code: +DropBlock impl inspired by two Tensorflow impl that I liked: + - https://github.com/tensorflow/tpu/blob/master/models/official/resnet/resnet_model.py#L74 + - https://github.com/clovaai/assembled-cnn/blob/master/nets/blocks.py +Hacked together by / Copyright 2020 Ross Wightman +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +def drop_path(x, drop_prob: float = 0., training: bool = False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) \ No newline at end of file diff --git a/lib/model/loss.py b/lib/model/loss.py new file mode 100644 index 0000000..4397ce1 --- /dev/null +++ b/lib/model/loss.py @@ -0,0 +1,204 @@ +import torch +import torch.nn as nn +import numpy as np +import torch.nn.functional as F + +# Numpy-based errors + +def mpjpe(predicted, target): + """ + Mean per-joint position error (i.e. mean Euclidean distance), + often referred to as "Protocol #1" in many papers. + """ + assert predicted.shape == target.shape + return np.mean(np.linalg.norm(predicted - target, axis=len(target.shape)-1), axis=1) + +def p_mpjpe(predicted, target): + """ + Pose error: MPJPE after rigid alignment (scale, rotation, and translation), + often referred to as "Protocol #2" in many papers. + """ + assert predicted.shape == target.shape + + muX = np.mean(target, axis=1, keepdims=True) + muY = np.mean(predicted, axis=1, keepdims=True) + + X0 = target - muX + Y0 = predicted - muY + + normX = np.sqrt(np.sum(X0**2, axis=(1, 2), keepdims=True)) + normY = np.sqrt(np.sum(Y0**2, axis=(1, 2), keepdims=True)) + + X0 /= normX + Y0 /= normY + + H = np.matmul(X0.transpose(0, 2, 1), Y0) + U, s, Vt = np.linalg.svd(H) + V = Vt.transpose(0, 2, 1) + R = np.matmul(V, U.transpose(0, 2, 1)) + + # Avoid improper rotations (reflections), i.e. rotations with det(R) = -1 + sign_detR = np.sign(np.expand_dims(np.linalg.det(R), axis=1)) + V[:, :, -1] *= sign_detR + s[:, -1] *= sign_detR.flatten() + R = np.matmul(V, U.transpose(0, 2, 1)) # Rotation + tr = np.expand_dims(np.sum(s, axis=1, keepdims=True), axis=2) + a = tr * normX / normY # Scale + t = muX - a*np.matmul(muY, R) # Translation + # Perform rigid transformation on the input + predicted_aligned = a*np.matmul(predicted, R) + t + # Return MPJPE + return np.mean(np.linalg.norm(predicted_aligned - target, axis=len(target.shape)-1), axis=1) + + +# PyTorch-based errors (for losses) + +def loss_mpjpe(predicted, target): + """ + Mean per-joint position error (i.e. mean Euclidean distance), + often referred to as "Protocol #1" in many papers. + """ + assert predicted.shape == target.shape + return torch.mean(torch.norm(predicted - target, dim=len(target.shape)-1)) + +def weighted_mpjpe(predicted, target, w): + """ + Weighted mean per-joint position error (i.e. 
mean Euclidean distance) + """ + assert predicted.shape == target.shape + assert w.shape[0] == predicted.shape[0] + return torch.mean(w * torch.norm(predicted - target, dim=len(target.shape)-1)) + +def loss_2d_weighted(predicted, target, conf): + assert predicted.shape == target.shape + predicted_2d = predicted[:,:,:,:2] + target_2d = target[:,:,:,:2] + diff = (predicted_2d - target_2d) * conf + return torch.mean(torch.norm(diff, dim=-1)) + +def n_mpjpe(predicted, target): + """ + Normalized MPJPE (scale only), adapted from: + https://github.com/hrhodin/UnsupervisedGeometryAwareRepresentationLearning/blob/master/losses/poses.py + """ + assert predicted.shape == target.shape + norm_predicted = torch.mean(torch.sum(predicted**2, dim=3, keepdim=True), dim=2, keepdim=True) + norm_target = torch.mean(torch.sum(target*predicted, dim=3, keepdim=True), dim=2, keepdim=True) + scale = norm_target / norm_predicted + return loss_mpjpe(scale * predicted, target) + +def weighted_bonelen_loss(predict_3d_length, gt_3d_length): + loss_length = 0.001 * torch.pow(predict_3d_length - gt_3d_length, 2).mean() + return loss_length + +def weighted_boneratio_loss(predict_3d_length, gt_3d_length): + loss_length = 0.1 * torch.pow((predict_3d_length - gt_3d_length)/gt_3d_length, 2).mean() + return loss_length + +def get_limb_lens(x): + ''' + Input: (N, T, 17, 3) + Output: (N, T, 16) + ''' + limbs_id = [[0,1], [1,2], [2,3], + [0,4], [4,5], [5,6], + [0,7], [7,8], [8,9], [9,10], + [8,11], [11,12], [12,13], + [8,14], [14,15], [15,16] + ] + limbs = x[:,:,limbs_id,:] + limbs = limbs[:,:,:,0,:]-limbs[:,:,:,1,:] + limb_lens = torch.norm(limbs, dim=-1) + return limb_lens + +def loss_limb_var(x): + ''' + Input: (N, T, 17, 3) + ''' + if x.shape[1]<=1: + return torch.FloatTensor(1).fill_(0.)[0].to(x.device) + limb_lens = get_limb_lens(x) + limb_lens_var = torch.var(limb_lens, dim=1) + limb_loss_var = torch.mean(limb_lens_var) + return limb_loss_var + +def loss_limb_gt(x, gt): + ''' + Input: (N, T, 17, 3), (N, T, 17, 3) + ''' + limb_lens_x = get_limb_lens(x) + limb_lens_gt = get_limb_lens(gt) # (N, T, 16) + return nn.L1Loss()(limb_lens_x, limb_lens_gt) + +def loss_velocity(predicted, target): + """ + Mean per-joint velocity error (i.e. 
mean Euclidean distance of the 1st derivative) + """ + assert predicted.shape == target.shape + if predicted.shape[1]<=1: + return torch.FloatTensor(1).fill_(0.)[0].to(predicted.device) + velocity_predicted = predicted[:,1:] - predicted[:,:-1] + velocity_target = target[:,1:] - target[:,:-1] + return torch.mean(torch.norm(velocity_predicted - velocity_target, dim=-1)) + +def loss_joint(predicted, target): + assert predicted.shape == target.shape + return nn.L1Loss()(predicted, target) + +def get_angles(x): + ''' + Input: (N, T, 17, 3) + Output: (N, T, 16) + ''' + limbs_id = [[0,1], [1,2], [2,3], + [0,4], [4,5], [5,6], + [0,7], [7,8], [8,9], [9,10], + [8,11], [11,12], [12,13], + [8,14], [14,15], [15,16] + ] + angle_id = [[ 0, 3], + [ 0, 6], + [ 3, 6], + [ 0, 1], + [ 1, 2], + [ 3, 4], + [ 4, 5], + [ 6, 7], + [ 7, 10], + [ 7, 13], + [ 8, 13], + [10, 13], + [ 7, 8], + [ 8, 9], + [10, 11], + [11, 12], + [13, 14], + [14, 15] ] + eps = 1e-7 + limbs = x[:,:,limbs_id,:] + limbs = limbs[:,:,:,0,:]-limbs[:,:,:,1,:] + angles = limbs[:,:,angle_id,:] + angle_cos = F.cosine_similarity(angles[:,:,:,0,:], angles[:,:,:,1,:], dim=-1) + return torch.acos(angle_cos.clamp(-1+eps, 1-eps)) + +def loss_angle(x, gt): + ''' + Input: (N, T, 17, 3), (N, T, 17, 3) + ''' + limb_angles_x = get_angles(x) + limb_angles_gt = get_angles(gt) + return nn.L1Loss()(limb_angles_x, limb_angles_gt) + +def loss_angle_velocity(x, gt): + """ + Mean per-angle velocity error (i.e. mean Euclidean distance of the 1st derivative) + """ + assert x.shape == gt.shape + if x.shape[1]<=1: + return torch.FloatTensor(1).fill_(0.)[0].to(x.device) + x_a = get_angles(x) + gt_a = get_angles(gt) + x_av = x_a[:,1:] - x_a[:,:-1] + gt_av = gt_a[:,1:] - gt_a[:,:-1] + return nn.L1Loss()(x_av, gt_av) + diff --git a/lib/model/loss_mesh.py b/lib/model/loss_mesh.py new file mode 100644 index 0000000..82f615f --- /dev/null +++ b/lib/model/loss_mesh.py @@ -0,0 +1,68 @@ +import torch +import torch.nn as nn +import ipdb +from lib.utils.utils_mesh import batch_rodrigues +from lib.model.loss import * + +class MeshLoss(nn.Module): + def __init__( + self, + loss_type='MSE', + device='cuda', + ): + super(MeshLoss, self).__init__() + self.device = device + self.loss_type = loss_type + if loss_type == 'MSE': + self.criterion_keypoints = nn.MSELoss(reduction='none').to(self.device) + self.criterion_regr = nn.MSELoss().to(self.device) + elif loss_type == 'L1': + self.criterion_keypoints = nn.L1Loss(reduction='none').to(self.device) + self.criterion_regr = nn.L1Loss().to(self.device) + + def forward( + self, + smpl_output, + data_gt, + ): + # to reduce time dimension + reduce = lambda x: x.reshape((x.shape[0] * x.shape[1],) + x.shape[2:]) + data_3d_theta = reduce(data_gt['theta']) + + preds = smpl_output[-1] + pred_theta = preds['theta'] + theta_size = pred_theta.shape[:2] + pred_theta = reduce(pred_theta) + preds_local = preds['kp_3d'] - preds['kp_3d'][:, :, 0:1,:] # (N, T, 17, 3) + gt_local = data_gt['kp_3d'] - data_gt['kp_3d'][:, :, 0:1,:] + real_shape, pred_shape = data_3d_theta[:, 72:], pred_theta[:, 72:] + real_pose, pred_pose = data_3d_theta[:, :72], pred_theta[:, :72] + loss_dict = {} + loss_dict['loss_3d_pos'] = loss_mpjpe(preds_local, gt_local) + loss_dict['loss_3d_scale'] = n_mpjpe(preds_local, gt_local) + loss_dict['loss_3d_velocity'] = loss_velocity(preds_local, gt_local) + loss_dict['loss_lv'] = loss_limb_var(preds_local) + loss_dict['loss_lg'] = loss_limb_gt(preds_local, gt_local) + loss_dict['loss_a'] = loss_angle(preds_local, gt_local) + 
loss_dict['loss_av'] = loss_angle_velocity(preds_local, gt_local) + + if pred_theta.shape[0] > 0: + loss_pose, loss_shape = self.smpl_losses(pred_pose, pred_shape, real_pose, real_shape) + loss_norm = torch.norm(pred_theta, dim=-1).mean() + loss_dict['loss_shape'] = loss_shape + loss_dict['loss_pose'] = loss_pose + loss_dict['loss_norm'] = loss_norm + return loss_dict + + def smpl_losses(self, pred_rotmat, pred_betas, gt_pose, gt_betas): + pred_rotmat_valid = batch_rodrigues(pred_rotmat.reshape(-1,3)).reshape(-1, 24, 3, 3) + gt_rotmat_valid = batch_rodrigues(gt_pose.reshape(-1,3)).reshape(-1, 24, 3, 3) + pred_betas_valid = pred_betas + gt_betas_valid = gt_betas + if len(pred_rotmat_valid) > 0: + loss_regr_pose = self.criterion_regr(pred_rotmat_valid, gt_rotmat_valid) + loss_regr_betas = self.criterion_regr(pred_betas_valid, gt_betas_valid) + else: + loss_regr_pose = torch.FloatTensor(1).fill_(0.).to(self.device) + loss_regr_betas = torch.FloatTensor(1).fill_(0.).to(self.device) + return loss_regr_pose, loss_regr_betas diff --git a/lib/model/loss_supcon.py b/lib/model/loss_supcon.py new file mode 100644 index 0000000..17117d4 --- /dev/null +++ b/lib/model/loss_supcon.py @@ -0,0 +1,98 @@ +""" +Author: Yonglong Tian (yonglong@mit.edu) +Date: May 07, 2020 +""" +from __future__ import print_function + +import torch +import torch.nn as nn + + +class SupConLoss(nn.Module): + """Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf. + It also supports the unsupervised contrastive loss in SimCLR""" + def __init__(self, temperature=0.07, contrast_mode='all', + base_temperature=0.07): + super(SupConLoss, self).__init__() + self.temperature = temperature + self.contrast_mode = contrast_mode + self.base_temperature = base_temperature + + def forward(self, features, labels=None, mask=None): + """Compute loss for model. If both `labels` and `mask` are None, + it degenerates to SimCLR unsupervised loss: + https://arxiv.org/pdf/2002.05709.pdf + + Args: + features: hidden vector of shape [bsz, n_views, ...]. + labels: ground truth of shape [bsz]. + mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j + has the same class as sample i. Can be asymmetric. + Returns: + A loss scalar. 
+ """ + device = (torch.device('cuda') + if features.is_cuda + else torch.device('cpu')) + + if len(features.shape) < 3: + raise ValueError('`features` needs to be [bsz, n_views, ...],' + 'at least 3 dimensions are required') + if len(features.shape) > 3: + features = features.view(features.shape[0], features.shape[1], -1) + + batch_size = features.shape[0] + if labels is not None and mask is not None: + raise ValueError('Cannot define both `labels` and `mask`') + elif labels is None and mask is None: + mask = torch.eye(batch_size, dtype=torch.float32).to(device) + elif labels is not None: + labels = labels.contiguous().view(-1, 1) + if labels.shape[0] != batch_size: + raise ValueError('Num of labels does not match num of features') + mask = torch.eq(labels, labels.T).float().to(device) + else: + mask = mask.float().to(device) + + contrast_count = features.shape[1] + contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0) + if self.contrast_mode == 'one': + anchor_feature = features[:, 0] + anchor_count = 1 + elif self.contrast_mode == 'all': + anchor_feature = contrast_feature + anchor_count = contrast_count + else: + raise ValueError('Unknown mode: {}'.format(self.contrast_mode)) + + # compute logits + anchor_dot_contrast = torch.div( + torch.matmul(anchor_feature, contrast_feature.T), + self.temperature) + # for numerical stability + logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True) + logits = anchor_dot_contrast - logits_max.detach() + + # tile mask + mask = mask.repeat(anchor_count, contrast_count) + # mask-out self-contrast cases + logits_mask = torch.scatter( + torch.ones_like(mask), + 1, + torch.arange(batch_size * anchor_count).view(-1, 1).to(device), + 0 + ) + mask = mask * logits_mask + + # compute log_prob + exp_logits = torch.exp(logits) * logits_mask + log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True)) + + # compute mean of log-likelihood over positive + mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1) + + # loss + loss = - (self.temperature / self.base_temperature) * mean_log_prob_pos + loss = loss.view(anchor_count, batch_size).mean() + + return loss diff --git a/lib/model/model_action.py b/lib/model/model_action.py new file mode 100644 index 0000000..785ec26 --- /dev/null +++ b/lib/model/model_action.py @@ -0,0 +1,71 @@ +import sys +import torch +import torch.nn as nn +import torch.nn.functional as F + +class ActionHeadClassification(nn.Module): + def __init__(self, dropout_ratio=0., dim_rep=512, num_classes=60, num_joints=17, hidden_dim=2048): + super(ActionHeadClassification, self).__init__() + self.dropout = nn.Dropout(p=dropout_ratio) + self.bn = nn.BatchNorm1d(hidden_dim, momentum=0.1) + self.relu = nn.ReLU(inplace=True) + self.fc1 = nn.Linear(dim_rep*num_joints, hidden_dim) + self.fc2 = nn.Linear(hidden_dim, num_classes) + + def forward(self, feat): + ''' + Input: (N, M, T, J, C) + ''' + N, M, T, J, C = feat.shape + feat = self.dropout(feat) + feat = feat.permute(0, 1, 3, 4, 2) # (N, M, T, J, C) -> (N, M, J, C, T) + feat = feat.mean(dim=-1) + feat = feat.reshape(N, M, -1) # (N, M, J*C) + feat = feat.mean(dim=1) + feat = self.fc1(feat) + feat = self.bn(feat) + feat = self.relu(feat) + feat = self.fc2(feat) + return feat + +class ActionHeadEmbed(nn.Module): + def __init__(self, dropout_ratio=0., dim_rep=512, num_joints=17, hidden_dim=2048): + super(ActionHeadEmbed, self).__init__() + self.dropout = nn.Dropout(p=dropout_ratio) + self.fc1 = nn.Linear(dim_rep*num_joints, hidden_dim) + def forward(self, feat): + ''' + 
Input: (N, M, T, J, C) + ''' + N, M, T, J, C = feat.shape + feat = self.dropout(feat) + feat = feat.permute(0, 1, 3, 4, 2) # (N, M, T, J, C) -> (N, M, J, C, T) + feat = feat.mean(dim=-1) + feat = feat.reshape(N, M, -1) # (N, M, J*C) + feat = feat.mean(dim=1) + feat = self.fc1(feat) + feat = F.normalize(feat, dim=-1) + return feat + +class ActionNet(nn.Module): + def __init__(self, backbone, dim_rep=512, num_classes=60, dropout_ratio=0., version='class', hidden_dim=2048, num_joints=17): + super(ActionNet, self).__init__() + self.backbone = backbone + self.feat_J = num_joints + if version=='class': + self.head = ActionHeadClassification(dropout_ratio=dropout_ratio, dim_rep=dim_rep, num_classes=num_classes, num_joints=num_joints) + elif version=='embed': + self.head = ActionHeadEmbed(dropout_ratio=dropout_ratio, dim_rep=dim_rep, hidden_dim=hidden_dim, num_joints=num_joints) + else: + raise Exception('Version Error.') + + def forward(self, x): + ''' + Input: (N, M x T x 17 x 3) + ''' + N, M, T, J, C = x.shape + x = x.reshape(N*M, T, J, C) + feat = self.backbone.get_representation(x) + feat = feat.reshape([N, M, T, self.feat_J, -1]) # (N, M, T, J, C) + out = self.head(feat) + return out \ No newline at end of file diff --git a/lib/model/model_mesh.py b/lib/model/model_mesh.py new file mode 100644 index 0000000..dff579d --- /dev/null +++ b/lib/model/model_mesh.py @@ -0,0 +1,101 @@ +import sys +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from lib.utils.utils_smpl import SMPL +from lib.utils.utils_mesh import rotation_matrix_to_angle_axis, rot6d_to_rotmat + +class SMPLRegressor(nn.Module): + def __init__(self, args, dim_rep=512, num_joints=17, hidden_dim=2048, dropout_ratio=0.): + super(SMPLRegressor, self).__init__() + param_pose_dim = 24 * 6 + self.dropout = nn.Dropout(p=dropout_ratio) + self.fc1 = nn.Linear(num_joints*dim_rep, hidden_dim) + self.pool2 = nn.AdaptiveAvgPool2d((None, 1)) + self.fc2 = nn.Linear(num_joints*dim_rep, hidden_dim) + self.bn1 = nn.BatchNorm1d(hidden_dim, momentum=0.1) + self.bn2 = nn.BatchNorm1d(hidden_dim, momentum=0.1) + self.relu1 = nn.ReLU(inplace=True) + self.relu2 = nn.ReLU(inplace=True) + self.head_pose = nn.Linear(hidden_dim, param_pose_dim) + self.head_shape = nn.Linear(hidden_dim, 10) + nn.init.xavier_uniform_(self.head_pose.weight, gain=0.01) + nn.init.xavier_uniform_(self.head_shape.weight, gain=0.01) + self.smpl = SMPL( + args.data_root, + batch_size=64, + create_transl=False, + ) + mean_params = np.load(self.smpl.smpl_mean_params) + init_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0) + init_shape = torch.from_numpy(mean_params['shape'][:].astype('float32')).unsqueeze(0) + self.register_buffer('init_pose', init_pose) + self.register_buffer('init_shape', init_shape) + self.J_regressor = self.smpl.J_regressor_h36m + + def forward(self, feat, init_pose=None, init_shape=None): + N, T, J, C = feat.shape + NT = N * T + feat = feat.reshape(N, T, -1) + + feat_pose = feat.reshape(NT, -1) # (N*T, J*C) + + feat_pose = self.dropout(feat_pose) + feat_pose = self.fc1(feat_pose) + feat_pose = self.bn1(feat_pose) + feat_pose = self.relu1(feat_pose) # (NT, C) + + feat_shape = feat.permute(0,2,1) # (N, T, J*C) -> (N, J*C, T) + feat_shape = self.pool2(feat_shape).reshape(N, -1) # (N, J*C) + + feat_shape = self.dropout(feat_shape) + feat_shape = self.fc2(feat_shape) + feat_shape = self.bn2(feat_shape) + feat_shape = self.relu2(feat_shape) # (N, C) + + pred_pose = self.init_pose.expand(NT, -1) # (NT, C) + pred_shape = 
self.init_shape.expand(N, -1) # (N, C) + + pred_pose = self.head_pose(feat_pose) + pred_pose + pred_shape = self.head_shape(feat_shape) + pred_shape + pred_shape = pred_shape.expand(T, N, -1).permute(1, 0, 2).reshape(NT, -1) + pred_rotmat = rot6d_to_rotmat(pred_pose).view(-1, 24, 3, 3) + pred_output = self.smpl( + betas=pred_shape, + body_pose=pred_rotmat[:, 1:], + global_orient=pred_rotmat[:, 0].unsqueeze(1), + pose2rot=False + ) + pred_vertices = pred_output.vertices*1000.0 + assert self.J_regressor is not None + J_regressor_batch = self.J_regressor[None, :].expand(pred_vertices.shape[0], -1, -1).to(pred_vertices.device) + pred_joints = torch.matmul(J_regressor_batch, pred_vertices) + pose = rotation_matrix_to_angle_axis(pred_rotmat.reshape(-1, 3, 3)).reshape(-1, 72) + output = [{ + 'theta' : torch.cat([pose, pred_shape], dim=1), # (N*T, 72+10) + 'verts' : pred_vertices, # (N*T, 6890, 3) + 'kp_3d' : pred_joints, # (N*T, 17, 3) + }] + return output + +class MeshRegressor(nn.Module): + def __init__(self, args, backbone, dim_rep=512, num_joints=17, hidden_dim=2048, dropout_ratio=0.5): + super(MeshRegressor, self).__init__() + self.backbone = backbone + self.feat_J = num_joints + self.head = SMPLRegressor(args, dim_rep, num_joints, hidden_dim, dropout_ratio) + + def forward(self, x, init_pose=None, init_shape=None, n_iter=3): + ''' + Input: (N x T x 17 x 3) + ''' + N, T, J, C = x.shape + feat = self.backbone.get_representation(x) + feat = feat.reshape([N, T, self.feat_J, -1]) # (N, T, J, C) + smpl_output = self.head(feat) + for s in smpl_output: + s['theta'] = s['theta'].reshape(N, T, -1) + s['verts'] = s['verts'].reshape(N, T, -1, 3) + s['kp_3d'] = s['kp_3d'].reshape(N, T, -1, 3) + return smpl_output \ No newline at end of file diff --git a/lib/utils/learning.py b/lib/utils/learning.py new file mode 100644 index 0000000..191e669 --- /dev/null +++ b/lib/utils/learning.py @@ -0,0 +1,102 @@ +import os +import numpy as np +import torch +import torch.nn as nn +from functools import partial +from lib.model.DSTformer import DSTformer + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + +def load_pretrained_weights(model, checkpoint): + """Load pretrianed weights to model + Incompatible layers (unmatched in name or size) will be ignored + Args: + - model (nn.Module): network model, which must not be nn.DataParallel + - weight_path (str): path to pretrained weights + """ + import collections + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + model_dict = model.state_dict() + new_state_dict = collections.OrderedDict() + matched_layers, discarded_layers = [], [] + for k, v in state_dict.items(): + # If the pretrained state_dict was saved as nn.DataParallel, + # keys would contain "module.", which 
should be ignored. + if k.startswith('module.'): + k = k[7:] + if k in model_dict and model_dict[k].size() == v.size(): + new_state_dict[k] = v + matched_layers.append(k) + else: + discarded_layers.append(k) + model_dict.update(new_state_dict) + model.load_state_dict(model_dict, strict=True) + print('load_weight', len(matched_layers)) + return model + +def partial_train_layers(model, partial_list): + """Train partial layers of a given model.""" + for name, p in model.named_parameters(): + p.requires_grad = False + for trainable in partial_list: + if trainable in name: + p.requires_grad = True + break + return model + +def load_backbone(args): + if not(hasattr(args, "backbone")): + args.backbone = 'DSTformer' # Default + if args.backbone=='DSTformer': + model_backbone = DSTformer(dim_in=3, dim_out=3, dim_feat=args.dim_feat, dim_rep=args.dim_rep, + depth=args.depth, num_heads=args.num_heads, mlp_ratio=args.mlp_ratio, norm_layer=partial(nn.LayerNorm, eps=1e-6), + maxlen=args.maxlen, num_joints=args.num_joints) + elif args.backbone=='TCN': + from lib.model.model_tcn import PoseTCN + model_backbone = PoseTCN() + elif args.backbone=='poseformer': + from lib.model.model_poseformer import PoseTransformer + model_backbone = PoseTransformer(num_frame=args.maxlen, num_joints=args.num_joints, in_chans=3, embed_dim_ratio=32, depth=4, + num_heads=8, mlp_ratio=2., qkv_bias=True, qk_scale=None,drop_path_rate=0, attn_mask=None) + elif args.backbone=='mixste': + from lib.model.model_mixste import MixSTE2 + model_backbone = MixSTE2(num_frame=args.maxlen, num_joints=args.num_joints, in_chans=3, embed_dim_ratio=512, depth=8, + num_heads=8, mlp_ratio=2., qkv_bias=True, qk_scale=None,drop_path_rate=0) + elif args.backbone=='stgcn': + from lib.model.model_stgcn import Model as STGCN + model_backbone = STGCN() + else: + raise Exception("Undefined backbone type.") + return model_backbone \ No newline at end of file diff --git a/lib/utils/tools.py b/lib/utils/tools.py new file mode 100644 index 0000000..b2b780f --- /dev/null +++ b/lib/utils/tools.py @@ -0,0 +1,69 @@ +import numpy as np +import os, sys +import pickle +import yaml +from easydict import EasyDict as edict +from typing import Any, IO + +ROOT_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..') + +class TextLogger: + def __init__(self, log_path): + self.log_path = log_path + with open(self.log_path, "w") as f: + f.write("") + def log(self, log): + with open(self.log_path, "a+") as f: + f.write(log + "\n") + +class Loader(yaml.SafeLoader): + """YAML Loader with `!include` constructor.""" + + def __init__(self, stream: IO) -> None: + """Initialise Loader.""" + + try: + self._root = os.path.split(stream.name)[0] + except AttributeError: + self._root = os.path.curdir + + super().__init__(stream) + +def construct_include(loader: Loader, node: yaml.Node) -> Any: + """Include file referenced at node.""" + + filename = os.path.abspath(os.path.join(loader._root, loader.construct_scalar(node))) + extension = os.path.splitext(filename)[1].lstrip('.') + + with open(filename, 'r') as f: + if extension in ('yaml', 'yml'): + return yaml.load(f, Loader) + elif extension in ('json', ): + return json.load(f) + else: + return ''.join(f.readlines()) + +def get_config(config_path): + yaml.add_constructor('!include', construct_include, Loader) + with open(config_path, 'r') as stream: + config = yaml.load(stream, Loader=Loader) + config = edict(config) + _, config_filename = os.path.split(config_path) + config_name, _ = os.path.splitext(config_filename) + 
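Putting the helpers in learning.py and tools.py together, a typical fine-tuning run loads a YAML config (get_config also resolves !include tags relative to the including file), builds the default DSTformer backbone, restores matching pretrained weights, and freezes everything except a few named layers. The file paths and layer-name substrings below are placeholders, not artifacts shipped with this patch:

import torch
from lib.utils.tools import get_config
from lib.utils.learning import load_backbone, load_pretrained_weights, partial_train_layers

args = get_config('path/to/experiment.yaml')                            # placeholder config path
model = load_backbone(args)                                             # DSTformer unless args.backbone says otherwise

checkpoint = torch.load('path/to/pretrained.bin', map_location='cpu')   # placeholder checkpoint
load_pretrained_weights(model, checkpoint)                              # drops unmatched keys/sizes, loads the rest

# Keep only parameters whose name contains one of these (illustrative) substrings trainable.
model = partial_train_layers(model, ['head', 'pos_embed'])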
config.name = config_name + return config + +def ensure_dir(path): + """ + create path by first checking its existence, + :param paths: path + :return: + """ + if not os.path.exists(path): + os.makedirs(path) + +def read_pkl(data_url): + file = open(data_url,'rb') + content = pickle.load(file) + file.close() + return content \ No newline at end of file diff --git a/lib/utils/utils_data.py b/lib/utils/utils_data.py new file mode 100644 index 0000000..df7b61e --- /dev/null +++ b/lib/utils/utils_data.py @@ -0,0 +1,112 @@ +import os +import torch +import torch.nn.functional as F +import numpy as np +import copy + +def crop_scale(motion, scale_range=[1, 1]): + ''' + Motion: [(M), T, 17, 3]. + Normalize to [-1, 1] + ''' + result = copy.deepcopy(motion) + valid_coords = motion[motion[..., 2]!=0][:,:2] + if len(valid_coords) < 4: + return np.zeros(motion.shape) + xmin = min(valid_coords[:,0]) + xmax = max(valid_coords[:,0]) + ymin = min(valid_coords[:,1]) + ymax = max(valid_coords[:,1]) + ratio = np.random.uniform(low=scale_range[0], high=scale_range[1], size=1)[0] + scale = max(xmax-xmin, ymax-ymin) * ratio + if scale==0: + return np.zeros(motion.shape) + xs = (xmin+xmax-scale) / 2 + ys = (ymin+ymax-scale) / 2 + result[...,:2] = (motion[..., :2]- [xs,ys]) / scale + result[...,:2] = (result[..., :2] - 0.5) * 2 + result = np.clip(result, -1, 1) + return result + +def crop_scale_3d(motion, scale_range=[1, 1]): + ''' + Motion: [T, 17, 3]. (x, y, z) + Normalize to [-1, 1] + Z is relative to the first frame's root. + ''' + result = copy.deepcopy(motion) + result[:,:,2] = result[:,:,2] - result[0,0,2] + xmin = np.min(motion[...,0]) + xmax = np.max(motion[...,0]) + ymin = np.min(motion[...,1]) + ymax = np.max(motion[...,1]) + ratio = np.random.uniform(low=scale_range[0], high=scale_range[1], size=1)[0] + scale = max(xmax-xmin, ymax-ymin) / ratio + if scale==0: + return np.zeros(motion.shape) + xs = (xmin+xmax-scale) / 2 + ys = (ymin+ymax-scale) / 2 + result[...,:2] = (motion[..., :2]- [xs,ys]) / scale + result[...,2] = result[...,2] / scale + result = (result - 0.5) * 2 + return result + +def flip_data(data): + """ + horizontal flip + data: [N, F, 17, D] or [F, 17, D]. X (horizontal coordinate) is the first channel in D. 
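crop_scale above rescales a 2D sequence (with an optional confidence channel) into the [-1, 1] square the model expects, using only keypoints with non-zero confidence to determine the bounding square. A minimal sketch on synthetic pixel-space input:

import numpy as np

motion = np.zeros((4, 17, 3), dtype=np.float32)               # 4 frames, 17 joints, (x, y, conf)
motion[..., 0] = np.random.uniform(100, 500, size=(4, 17))    # x in pixels
motion[..., 1] = np.random.uniform(200, 600, size=(4, 17))    # y in pixels
motion[..., 2] = 1.0                                          # all keypoints visible

normed = crop_scale(motion)                                   # scale_range defaults to [1, 1]
print(normed[..., :2].min(), normed[..., :2].max())           # both within [-1, 1]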
+ Return + result: same + """ + left_joints = [4, 5, 6, 11, 12, 13] + right_joints = [1, 2, 3, 14, 15, 16] + flipped_data = copy.deepcopy(data) + flipped_data[..., 0] *= -1 # flip x of all joints + flipped_data[..., left_joints+right_joints, :] = flipped_data[..., right_joints+left_joints, :] + return flipped_data + +def resample(ori_len, target_len, replay=False, randomness=True): + if replay: + if ori_len > target_len: + st = np.random.randint(ori_len-target_len) + return range(st, st+target_len) # Random clipping from sequence + else: + return np.array(range(target_len)) % ori_len # Replay padding + else: + if randomness: + even = np.linspace(0, ori_len, num=target_len, endpoint=False) + if ori_len < target_len: + low = np.floor(even) + high = np.ceil(even) + sel = np.random.randint(2, size=even.shape) + result = np.sort(sel*low+(1-sel)*high) + else: + interval = even[1] - even[0] + result = np.random.random(even.shape)*interval + even + result = np.clip(result, a_min=0, a_max=ori_len-1).astype(np.uint32) + else: + result = np.linspace(0, ori_len, num=target_len, endpoint=False, dtype=int) + return result + +def split_clips(vid_list, n_frames, data_stride): + result = [] + n_clips = 0 + st = 0 + i = 0 + saved = set() + while i(w, x, y, z) + Returns: + Rotation matrix corresponding to the quaternion -- size = [batch_size, 3, 3] + """ + norm_quat = quat + norm_quat = norm_quat / norm_quat.norm(p=2, dim=1, keepdim=True) + w, x, y, z = norm_quat[:, 0], norm_quat[:, 1], norm_quat[:, + 2], norm_quat[:, + 3] + + batch_size = quat.size(0) + + w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2) + wx, wy, wz = w * x, w * y, w * z + xy, xz, yz = x * y, x * z, y * z + + rotMat = torch.stack([ + w2 + x2 - y2 - z2, 2 * xy - 2 * wz, 2 * wy + 2 * xz, 2 * wz + 2 * xy, + w2 - x2 + y2 - z2, 2 * yz - 2 * wx, 2 * xz - 2 * wy, 2 * wx + 2 * yz, + w2 - x2 - y2 + z2 + ], + dim=1).view(batch_size, 3, 3) + return rotMat + + +def rotation_matrix_to_angle_axis(rotation_matrix): + """ + This function is borrowed from https://github.com/kornia/kornia + + Convert 3x4 rotation matrix to Rodrigues vector + + Args: + rotation_matrix (Tensor): rotation matrix. + + Returns: + Tensor: Rodrigues vector transformation. + + Shape: + - Input: :math:`(N, 3, 4)` + - Output: :math:`(N, 3)` + + Example: + >>> input = torch.rand(2, 3, 4) # Nx4x4 + >>> output = tgm.rotation_matrix_to_angle_axis(input) # Nx3 + """ + if rotation_matrix.shape[1:] == (3,3): + rot_mat = rotation_matrix.reshape(-1, 3, 3) + hom = torch.tensor([0, 0, 1], dtype=torch.float32, + device=rotation_matrix.device).reshape(1, 3, 1).expand(rot_mat.shape[0], -1, -1) + rotation_matrix = torch.cat([rot_mat, hom], dim=-1) + + quaternion = rotation_matrix_to_quaternion(rotation_matrix) + aa = quaternion_to_angle_axis(quaternion) + aa[torch.isnan(aa)] = 0.0 + return aa + + +def quaternion_to_angle_axis(quaternion: torch.Tensor) -> torch.Tensor: + """ + This function is borrowed from https://github.com/kornia/kornia + + Convert quaternion vector to angle axis of rotation. + + Adapted from ceres C++ library: ceres-solver/include/ceres/rotation.h + + Args: + quaternion (torch.Tensor): tensor with quaternions. + + Return: + torch.Tensor: tensor with angle axis of rotation. 
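flip_data mirrors an H36M-style 17-joint sequence by negating x and swapping the left/right joint indices listed above, while resample maps a clip of arbitrary length onto a fixed number of frames. A quick sketch with illustrative shapes:

import numpy as np

clip = np.random.randn(300, 17, 3).astype(np.float32)           # (F, 17, D)
mirrored = flip_data(clip)                                       # x negated, left/right joints exchanged

idx = resample(ori_len=300, target_len=243, randomness=False)    # 243 evenly spaced frame indices
fixed_len = clip[idx]                                            # (243, 17, 3)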
+ + Shape: + - Input: :math:`(*, 4)` where `*` means, any number of dimensions + - Output: :math:`(*, 3)` + + Example: + >>> quaternion = torch.rand(2, 4) # Nx4 + >>> angle_axis = tgm.quaternion_to_angle_axis(quaternion) # Nx3 + """ + if not torch.is_tensor(quaternion): + raise TypeError("Input type is not a torch.Tensor. Got {}".format( + type(quaternion))) + + if not quaternion.shape[-1] == 4: + raise ValueError("Input must be a tensor of shape Nx4 or 4. Got {}" + .format(quaternion.shape)) + # unpack input and compute conversion + q1: torch.Tensor = quaternion[..., 1] + q2: torch.Tensor = quaternion[..., 2] + q3: torch.Tensor = quaternion[..., 3] + sin_squared_theta: torch.Tensor = q1 * q1 + q2 * q2 + q3 * q3 + + sin_theta: torch.Tensor = torch.sqrt(sin_squared_theta) + cos_theta: torch.Tensor = quaternion[..., 0] + two_theta: torch.Tensor = 2.0 * torch.where( + cos_theta < 0.0, + torch.atan2(-sin_theta, -cos_theta), + torch.atan2(sin_theta, cos_theta)) + + k_pos: torch.Tensor = two_theta / sin_theta + k_neg: torch.Tensor = 2.0 * torch.ones_like(sin_theta) + k: torch.Tensor = torch.where(sin_squared_theta > 0.0, k_pos, k_neg) + + angle_axis: torch.Tensor = torch.zeros_like(quaternion)[..., :3] + angle_axis[..., 0] += q1 * k + angle_axis[..., 1] += q2 * k + angle_axis[..., 2] += q3 * k + return angle_axis + + +def rotation_matrix_to_quaternion(rotation_matrix, eps=1e-6): + """ + This function is borrowed from https://github.com/kornia/kornia + + Convert 3x4 rotation matrix to 4d quaternion vector + + This algorithm is based on algorithm described in + https://github.com/KieranWynn/pyquaternion/blob/master/pyquaternion/quaternion.py#L201 + + Args: + rotation_matrix (Tensor): the rotation matrix to convert. + + Return: + Tensor: the rotation in quaternion + + Shape: + - Input: :math:`(N, 3, 4)` + - Output: :math:`(N, 4)` + + Example: + >>> input = torch.rand(4, 3, 4) # Nx3x4 + >>> output = tgm.rotation_matrix_to_quaternion(input) # Nx4 + """ + if not torch.is_tensor(rotation_matrix): + raise TypeError("Input type is not a torch.Tensor. Got {}".format( + type(rotation_matrix))) + + if len(rotation_matrix.shape) > 3: + raise ValueError( + "Input size must be a three dimensional tensor. Got {}".format( + rotation_matrix.shape)) + if not rotation_matrix.shape[-2:] == (3, 4): + raise ValueError( + "Input size must be a N x 3 x 4 tensor. 
Got {}".format( + rotation_matrix.shape)) + + rmat_t = torch.transpose(rotation_matrix, 1, 2) + + mask_d2 = rmat_t[:, 2, 2] < eps + + mask_d0_d1 = rmat_t[:, 0, 0] > rmat_t[:, 1, 1] + mask_d0_nd1 = rmat_t[:, 0, 0] < -rmat_t[:, 1, 1] + + t0 = 1 + rmat_t[:, 0, 0] - rmat_t[:, 1, 1] - rmat_t[:, 2, 2] + q0 = torch.stack([rmat_t[:, 1, 2] - rmat_t[:, 2, 1], + t0, rmat_t[:, 0, 1] + rmat_t[:, 1, 0], + rmat_t[:, 2, 0] + rmat_t[:, 0, 2]], -1) + t0_rep = t0.repeat(4, 1).t() + + t1 = 1 - rmat_t[:, 0, 0] + rmat_t[:, 1, 1] - rmat_t[:, 2, 2] + q1 = torch.stack([rmat_t[:, 2, 0] - rmat_t[:, 0, 2], + rmat_t[:, 0, 1] + rmat_t[:, 1, 0], + t1, rmat_t[:, 1, 2] + rmat_t[:, 2, 1]], -1) + t1_rep = t1.repeat(4, 1).t() + + t2 = 1 - rmat_t[:, 0, 0] - rmat_t[:, 1, 1] + rmat_t[:, 2, 2] + q2 = torch.stack([rmat_t[:, 0, 1] - rmat_t[:, 1, 0], + rmat_t[:, 2, 0] + rmat_t[:, 0, 2], + rmat_t[:, 1, 2] + rmat_t[:, 2, 1], t2], -1) + t2_rep = t2.repeat(4, 1).t() + + t3 = 1 + rmat_t[:, 0, 0] + rmat_t[:, 1, 1] + rmat_t[:, 2, 2] + q3 = torch.stack([t3, rmat_t[:, 1, 2] - rmat_t[:, 2, 1], + rmat_t[:, 2, 0] - rmat_t[:, 0, 2], + rmat_t[:, 0, 1] - rmat_t[:, 1, 0]], -1) + t3_rep = t3.repeat(4, 1).t() + + mask_c0 = mask_d2 * mask_d0_d1 + mask_c1 = mask_d2 * ~mask_d0_d1 + mask_c2 = ~mask_d2 * mask_d0_nd1 + mask_c3 = ~mask_d2 * ~mask_d0_nd1 + mask_c0 = mask_c0.view(-1, 1).type_as(q0) + mask_c1 = mask_c1.view(-1, 1).type_as(q1) + mask_c2 = mask_c2.view(-1, 1).type_as(q2) + mask_c3 = mask_c3.view(-1, 1).type_as(q3) + + q = q0 * mask_c0 + q1 * mask_c1 + q2 * mask_c2 + q3 * mask_c3 + q /= torch.sqrt(t0_rep * mask_c0 + t1_rep * mask_c1 + # noqa + t2_rep * mask_c2 + t3_rep * mask_c3) # noqa + q *= 0.5 + return q + + +def estimate_translation_np(S, joints_2d, joints_conf, focal_length=5000., img_size=224.): + """ + This function is borrowed from https://github.com/nkolot/SPIN/utils/geometry.py + + Find camera translation that brings 3D joints S closest to 2D the corresponding joints_2d. + Input: + S: (25, 3) 3D joint locations + joints: (25, 3) 2D joint locations and confidence + Returns: + (3,) camera translation vector + """ + + num_joints = S.shape[0] + # focal length + f = np.array([focal_length,focal_length]) + # optical center + center = np.array([img_size/2., img_size/2.]) + + # transformations + Z = np.reshape(np.tile(S[:,2],(2,1)).T,-1) + XY = np.reshape(S[:,0:2],-1) + O = np.tile(center,num_joints) + F = np.tile(f,num_joints) + weight2 = np.reshape(np.tile(np.sqrt(joints_conf),(2,1)).T,-1) + + # least squares + Q = np.array([F*np.tile(np.array([1,0]),num_joints), F*np.tile(np.array([0,1]),num_joints), O-np.reshape(joints_2d,-1)]).T + c = (np.reshape(joints_2d,-1)-O)*Z - F*XY + + # weighted least squares + W = np.diagflat(weight2) + Q = np.dot(W,Q) + c = np.dot(W,c) + + # square matrix + A = np.dot(Q.T,Q) + b = np.dot(Q.T,c) + + # solution + trans = np.linalg.solve(A, b) + + return trans + + +def estimate_translation(S, joints_2d, focal_length=5000., img_size=224.): + """ + This function is borrowed from https://github.com/nkolot/SPIN/utils/geometry.py + + Find camera translation that brings 3D joints S closest to 2D the corresponding joints_2d. 
+ Input: + S: (B, 49, 3) 3D joint locations + joints: (B, 49, 3) 2D joint locations and confidence + Returns: + (B, 3) camera translation vectors + """ + + device = S.device + # Use only joints 25:49 (GT joints) + S = S[:, 25:, :].cpu().numpy() + joints_2d = joints_2d[:, 25:, :].cpu().numpy() + joints_conf = joints_2d[:, :, -1] + joints_2d = joints_2d[:, :, :-1] + trans = np.zeros((S.shape[0], 3), dtype=np.float32) + # Find the translation for each example in the batch + for i in range(S.shape[0]): + S_i = S[i] + joints_i = joints_2d[i] + conf_i = joints_conf[i] + trans[i] = estimate_translation_np(S_i, joints_i, conf_i, focal_length=focal_length, img_size=img_size) + return torch.from_numpy(trans).to(device) + + +def rot6d_to_rotmat_spin(x): + """Convert 6D rotation representation to 3x3 rotation matrix. + Based on Zhou et al., "On the Continuity of Rotation Representations in Neural Networks", CVPR 2019 + Input: + (B,6) Batch of 6-D rotation representations + Output: + (B,3,3) Batch of corresponding rotation matrices + """ + x = x.view(-1,3,2) + a1 = x[:, :, 0] + a2 = x[:, :, 1] + b1 = F.normalize(a1) + b2 = F.normalize(a2 - torch.einsum('bi,bi->b', b1, a2).unsqueeze(-1) * b1) + + # inp = a2 - torch.einsum('bi,bi->b', b1, a2).unsqueeze(-1) * b1 + # denom = inp.pow(2).sum(dim=1).sqrt().unsqueeze(-1) + 1e-8 + # b2 = inp / denom + + b3 = torch.cross(b1, b2) + return torch.stack((b1, b2, b3), dim=-1) + + +def rot6d_to_rotmat(x): + x = x.view(-1,3,2) + + # Normalize the first vector + b1 = F.normalize(x[:, :, 0], dim=1, eps=1e-6) + + dot_prod = torch.sum(b1 * x[:, :, 1], dim=1, keepdim=True) + # Compute the second vector by finding the orthogonal complement to it + b2 = F.normalize(x[:, :, 1] - dot_prod * b1, dim=-1, eps=1e-6) + + # Finish building the basis by taking the cross product + b3 = torch.cross(b1, b2, dim=1) + rot_mats = torch.stack([b1, b2, b3], dim=-1) + + return rot_mats + + +def rigid_transform_3D(A, B): + n, dim = A.shape + centroid_A = np.mean(A, axis = 0) + centroid_B = np.mean(B, axis = 0) + H = np.dot(np.transpose(A - centroid_A), B - centroid_B) / n + U, s, V = np.linalg.svd(H) + R = np.dot(np.transpose(V), np.transpose(U)) + if np.linalg.det(R) < 0: + s[-1] = -s[-1] + V[2] = -V[2] + R = np.dot(np.transpose(V), np.transpose(U)) + + varP = np.var(A, axis=0).sum() + c = 1/varP * np.sum(s) + + t = -np.dot(c*R, np.transpose(centroid_A)) + np.transpose(centroid_B) + return c, R, t + + +def rigid_align(A, B): + c, R, t = rigid_transform_3D(A, B) + A2 = np.transpose(np.dot(c*R, np.transpose(A))) + t + return A2 + +def compute_error(output, target): + with torch.no_grad(): + pred_verts = output[0]['verts'].reshape(-1, 6890, 3) + target_verts = target['verts'].reshape(-1, 6890, 3) + + pred_j3ds = output[0]['kp_3d'].reshape(-1, 17, 3) + target_j3ds = target['kp_3d'].reshape(-1, 17, 3) + + # mpve + pred_verts = pred_verts - pred_j3ds[:, :1, :] + target_verts = target_verts - target_j3ds[:, :1, :] + mpves = torch.sqrt(((pred_verts - target_verts) ** 2).sum(dim=-1)).mean(dim=-1).cpu() + + # mpjpe + pred_j3ds = pred_j3ds - pred_j3ds[:, :1, :] + target_j3ds = target_j3ds - target_j3ds[:, :1, :] + mpjpes = torch.sqrt(((pred_j3ds - target_j3ds) ** 2).sum(dim=-1)).mean(dim=-1).cpu() + return mpjpes.mean(), mpves.mean() + +def compute_error_frames(output, target): + with torch.no_grad(): + pred_verts = output[0]['verts'].reshape(-1, 6890, 3) + target_verts = target['verts'].reshape(-1, 6890, 3) + + pred_j3ds = output[0]['kp_3d'].reshape(-1, 17, 3) + target_j3ds = target['kp_3d'].reshape(-1, 
17, 3) + + # mpve + pred_verts = pred_verts - pred_j3ds[:, :1, :] + target_verts = target_verts - target_j3ds[:, :1, :] + mpves = torch.sqrt(((pred_verts - target_verts) ** 2).sum(dim=-1)).mean(dim=-1).cpu() + + # mpjpe + pred_j3ds = pred_j3ds - pred_j3ds[:, :1, :] + target_j3ds = target_j3ds - target_j3ds[:, :1, :] + mpjpes = torch.sqrt(((pred_j3ds - target_j3ds) ** 2).sum(dim=-1)).mean(dim=-1).cpu() + return mpjpes, mpves + +def evaluate_mesh(results): + pred_verts = results['verts'].reshape(-1, 6890, 3) + target_verts = results['verts_gt'].reshape(-1, 6890, 3) + + pred_j3ds = results['kp_3d'].reshape(-1, 17, 3) + target_j3ds = results['kp_3d_gt'].reshape(-1, 17, 3) + num_samples = pred_j3ds.shape[0] + + # mpve + pred_verts = pred_verts - pred_j3ds[:, :1, :] + target_verts = target_verts - target_j3ds[:, :1, :] + mpve = np.mean(np.mean(np.sqrt(np.square(pred_verts - target_verts).sum(axis=2)), axis=1)) + + + # mpjpe-17 & mpjpe-14 + h36m_17_to_14 = (1, 2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 14, 15, 16) + pred_j3ds_17j = (pred_j3ds - pred_j3ds[:, :1, :]) + target_j3ds_17j = (target_j3ds - target_j3ds[:, :1, :]) + + pred_j3ds = pred_j3ds_17j[:, h36m_17_to_14, :].copy() + target_j3ds = target_j3ds_17j[:, h36m_17_to_14, :].copy() + + mpjpe = np.mean(np.sqrt(np.square(pred_j3ds - target_j3ds).sum(axis=2)), axis=1) # (N, ) + mpjpe_17j = np.mean(np.sqrt(np.square(pred_j3ds_17j - target_j3ds_17j).sum(axis=2)), axis=1) # (N, ) + + pred_j3ds_pa, pred_j3ds_pa_17j = [], [] + for n in range(num_samples): + pred_j3ds_pa.append(rigid_align(pred_j3ds[n], target_j3ds[n])) + pred_j3ds_pa_17j.append(rigid_align(pred_j3ds_17j[n], target_j3ds_17j[n])) + pred_j3ds_pa = np.array(pred_j3ds_pa) + pred_j3ds_pa_17j = np.array(pred_j3ds_pa_17j) + + pa_mpjpe = np.mean(np.sqrt(np.square(pred_j3ds_pa - target_j3ds).sum(axis=2)), axis=1) # (N, ) + pa_mpjpe_17j = np.mean(np.sqrt(np.square(pred_j3ds_pa_17j - target_j3ds_17j).sum(axis=2)), axis=1) # (N, ) + + + error_dict = { + 'mpve': mpve.mean(), + 'mpjpe': mpjpe.mean(), + 'pa_mpjpe': pa_mpjpe.mean(), + 'mpjpe_17j': mpjpe_17j.mean(), + 'pa_mpjpe_17j': pa_mpjpe_17j.mean(), + } + return error_dict + + +def rectify_pose(pose): + """ + Rectify "upside down" people in global coord + + Args: + pose (72,): Pose. + + Returns: + Rotated pose. + """ + pose = pose.copy() + R_mod = cv2.Rodrigues(np.array([np.pi, 0, 0]))[0] + R_root = cv2.Rodrigues(pose[:3])[0] + new_root = R_root.dot(R_mod) + pose[:3] = cv2.Rodrigues(new_root)[0].reshape(3) + return pose + +def flip_thetas(thetas): + """Flip thetas. + + Parameters + ---------- + thetas : numpy.ndarray + Joints in shape (F, num_thetas, 3) + theta_pairs : list + List of theta pairs. + + Returns + ------- + numpy.ndarray + Flipped thetas with shape (F, num_thetas, 3) + + """ + #Joint pairs which defines the pairs of joint to be swapped when the image is flipped horizontally. + theta_pairs = ((1, 2), (4, 5), (7, 8), (10, 11), (13, 14), (16, 17), (18, 19), (20, 21), (22, 23)) + thetas_flip = thetas.copy() + # reflect horizontally + thetas_flip[:, :, 1] = -1 * thetas_flip[:, :, 1] + thetas_flip[:, :, 2] = -1 * thetas_flip[:, :, 2] + # change left-right parts + for pair in theta_pairs: + thetas_flip[:, pair[0], :], thetas_flip[:, pair[1], :] = \ + thetas_flip[:, pair[1], :], thetas_flip[:, pair[0], :].copy() + return thetas_flip + +def flip_thetas_batch(thetas): + """Flip thetas in batch. + + Parameters + ---------- + thetas : numpy.array + Joints in shape (N, F, num_thetas*3) + theta_pairs : list + List of theta pairs. 
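evaluate_mesh above reports PA-MPJPE by Procrustes-aligning each predicted skeleton to its ground truth with rigid_align (a similarity transform: scale, rotation, and translation estimated by rigid_transform_3D). A standalone sketch on random joints:

import numpy as np

pred_j3d = np.random.randn(17, 3)
gt_j3d = np.random.randn(17, 3)

aligned = rigid_align(pred_j3d, gt_j3d)                           # best-fit s, R, t applied to the prediction
mpjpe = np.sqrt(((pred_j3d - gt_j3d) ** 2).sum(axis=-1)).mean()
pa_mpjpe = np.sqrt(((aligned - gt_j3d) ** 2).sum(axis=-1)).mean()
print('MPJPE %.3f  PA-MPJPE %.3f' % (mpjpe, pa_mpjpe))            # PA-MPJPE is typically the smaller of the two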
+ + Returns + ------- + numpy.array + Flipped thetas with shape (N, F, num_thetas*3) + + """ + #Joint pairs which defines the pairs of joint to be swapped when the image is flipped horizontally. + theta_pairs = ((1, 2), (4, 5), (7, 8), (10, 11), (13, 14), (16, 17), (18, 19), (20, 21), (22, 23)) + thetas_flip = copy.deepcopy(thetas).reshape(*thetas.shape[:2], 24, 3) + # reflect horizontally + thetas_flip[:, :, :, 1] = -1 * thetas_flip[:, :, :, 1] + thetas_flip[:, :, :, 2] = -1 * thetas_flip[:, :, :, 2] + # change left-right parts + for pair in theta_pairs: + thetas_flip[:, :, pair[0], :], thetas_flip[:, :, pair[1], :] = \ + thetas_flip[:, :, pair[1], :], thetas_flip[:, :, pair[0], :].clone() + + return thetas_flip.reshape(*thetas.shape[:2], -1) + +# def smpl_aa_to_ortho6d(smpl_aa): +# # [...,72] -> [...,144] +# rot_aa = smpl_aa.reshape([-1,24,3]) +# rotmat = axis_angle_to_matrix(rot_aa) +# rot6d = matrix_to_rotation_6d(rotmat) +# rot6d = rot6d.reshape(-1,24*6) +# return rot6d \ No newline at end of file diff --git a/lib/utils/utils_smpl.py b/lib/utils/utils_smpl.py new file mode 100644 index 0000000..2215dd8 --- /dev/null +++ b/lib/utils/utils_smpl.py @@ -0,0 +1,88 @@ +# This script is borrowed and extended from https://github.com/nkolot/SPIN/blob/master/models/hmr.py +# Adhere to their licence to use this script + +import torch +import numpy as np +import os.path as osp +from smplx import SMPL as _SMPL +from smplx.utils import ModelOutput, SMPLOutput +from smplx.lbs import vertices2joints + + +# Map joints to SMPL joints +JOINT_MAP = { + 'OP Nose': 24, 'OP Neck': 12, 'OP RShoulder': 17, + 'OP RElbow': 19, 'OP RWrist': 21, 'OP LShoulder': 16, + 'OP LElbow': 18, 'OP LWrist': 20, 'OP MidHip': 0, + 'OP RHip': 2, 'OP RKnee': 5, 'OP RAnkle': 8, + 'OP LHip': 1, 'OP LKnee': 4, 'OP LAnkle': 7, + 'OP REye': 25, 'OP LEye': 26, 'OP REar': 27, + 'OP LEar': 28, 'OP LBigToe': 29, 'OP LSmallToe': 30, + 'OP LHeel': 31, 'OP RBigToe': 32, 'OP RSmallToe': 33, 'OP RHeel': 34, + 'Right Ankle': 8, 'Right Knee': 5, 'Right Hip': 45, + 'Left Hip': 46, 'Left Knee': 4, 'Left Ankle': 7, + 'Right Wrist': 21, 'Right Elbow': 19, 'Right Shoulder': 17, + 'Left Shoulder': 16, 'Left Elbow': 18, 'Left Wrist': 20, + 'Neck (LSP)': 47, 'Top of Head (LSP)': 48, + 'Pelvis (MPII)': 49, 'Thorax (MPII)': 50, + 'Spine (H36M)': 51, 'Jaw (H36M)': 52, + 'Head (H36M)': 53, 'Nose': 24, 'Left Eye': 26, + 'Right Eye': 25, 'Left Ear': 28, 'Right Ear': 27 +} +JOINT_NAMES = [ + 'OP Nose', 'OP Neck', 'OP RShoulder', + 'OP RElbow', 'OP RWrist', 'OP LShoulder', + 'OP LElbow', 'OP LWrist', 'OP MidHip', + 'OP RHip', 'OP RKnee', 'OP RAnkle', + 'OP LHip', 'OP LKnee', 'OP LAnkle', + 'OP REye', 'OP LEye', 'OP REar', + 'OP LEar', 'OP LBigToe', 'OP LSmallToe', + 'OP LHeel', 'OP RBigToe', 'OP RSmallToe', 'OP RHeel', + 'Right Ankle', 'Right Knee', 'Right Hip', + 'Left Hip', 'Left Knee', 'Left Ankle', + 'Right Wrist', 'Right Elbow', 'Right Shoulder', + 'Left Shoulder', 'Left Elbow', 'Left Wrist', + 'Neck (LSP)', 'Top of Head (LSP)', + 'Pelvis (MPII)', 'Thorax (MPII)', + 'Spine (H36M)', 'Jaw (H36M)', + 'Head (H36M)', 'Nose', 'Left Eye', + 'Right Eye', 'Left Ear', 'Right Ear' +] + +JOINT_IDS = {JOINT_NAMES[i]: i for i in range(len(JOINT_NAMES))} +SMPL_MODEL_DIR = 'data/mesh' +H36M_TO_J17 = [6, 5, 4, 1, 2, 3, 16, 15, 14, 11, 12, 13, 8, 10, 0, 7, 9] +H36M_TO_J14 = H36M_TO_J17[:14] + + +class SMPL(_SMPL): + """ Extension of the official SMPL implementation to support more joints """ + + def __init__(self, *args, **kwargs): + super(SMPL, self).__init__(*args, 
**kwargs) + joints = [JOINT_MAP[i] for i in JOINT_NAMES] + self.smpl_mean_params = osp.join(args[0], 'smpl_mean_params.npz') + J_regressor_extra = np.load(osp.join(args[0], 'J_regressor_extra.npy')) + self.register_buffer('J_regressor_extra', torch.tensor(J_regressor_extra, dtype=torch.float32)) + J_regressor_h36m = np.load(osp.join(args[0], 'J_regressor_h36m_correct.npy')) + self.register_buffer('J_regressor_h36m', torch.tensor(J_regressor_h36m, dtype=torch.float32)) + self.joint_map = torch.tensor(joints, dtype=torch.long) + + def forward(self, *args, **kwargs): + kwargs['get_skin'] = True + smpl_output = super(SMPL, self).forward(*args, **kwargs) + extra_joints = vertices2joints(self.J_regressor_extra, smpl_output.vertices) + joints = torch.cat([smpl_output.joints, extra_joints], dim=1) + joints = joints[:, self.joint_map, :] + output = SMPLOutput(vertices=smpl_output.vertices, + global_orient=smpl_output.global_orient, + body_pose=smpl_output.body_pose, + joints=joints, + betas=smpl_output.betas, + full_pose=smpl_output.full_pose) + return output + + +def get_smpl_faces(): + smpl = SMPL(SMPL_MODEL_DIR, batch_size=1, create_transl=False) + return smpl.faces \ No newline at end of file diff --git a/lib/utils/vismo.py b/lib/utils/vismo.py new file mode 100644 index 0000000..456c3d7 --- /dev/null +++ b/lib/utils/vismo.py @@ -0,0 +1,347 @@ +import numpy as np +import os +import cv2 +import math +import copy +import imageio +import io +from tqdm import tqdm +from PIL import Image +from lib.utils.tools import ensure_dir +import matplotlib +import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D +from lib.utils.utils_smpl import * +import ipdb + +def render_and_save(motion_input, save_path, keep_imgs=False, fps=25, color="#F96706#FB8D43#FDB381", with_conf=False, draw_face=False): + ensure_dir(os.path.dirname(save_path)) + motion = copy.deepcopy(motion_input) + if motion.shape[-1]==2 or motion.shape[-1]==3: + motion = np.transpose(motion, (1,2,0)) #(T,17,D) -> (17,D,T) + if motion.shape[1]==2 or with_conf: + colors = hex2rgb(color) + if not with_conf: + J, D, T = motion.shape + motion_full = np.ones([J,3,T]) + motion_full[:,:2,:] = motion + else: + motion_full = motion + motion_full[:,:2,:] = pixel2world_vis_motion(motion_full[:,:2,:]) + motion2video(motion_full, save_path=save_path, colors=colors, fps=fps) + elif motion.shape[0]==6890: + # motion_world = pixel2world_vis_motion(motion, dim=3) + motion2video_mesh(motion, save_path=save_path, keep_imgs=keep_imgs, fps=fps, draw_face=draw_face) + else: + motion_world = pixel2world_vis_motion(motion, dim=3) + motion2video_3d(motion_world, save_path=save_path, keep_imgs=keep_imgs, fps=fps) + +def pixel2world_vis(pose): +# pose: (17,2) + return (pose + [1, 1]) * 512 / 2 + +def pixel2world_vis_motion(motion, dim=2, is_tensor=False): +# pose: (17,2,N) + N = motion.shape[-1] + if dim==2: + offset = np.ones([2,N]).astype(np.float32) + else: + offset = np.ones([3,N]).astype(np.float32) + offset[2,:] = 0 + if is_tensor: + offset = torch.tensor(offset) + return (motion + offset) * 512 / 2 + +def vis_data_batch(data_input, data_label, n_render=10, save_path='doodle/vis_train_data/'): + ''' + data_input: [N,T,17,2/3] + data_label: [N,T,17,3] + ''' + pathlib.Path(save_path).mkdir(parents=True, exist_ok=True) + for i in range(min(len(data_input), n_render)): + render_and_save(data_input[i][:,:,:2], '%s/input_%d.mp4' % (save_path, i)) + render_and_save(data_label[i], '%s/gt_%d.mp4' % (save_path, i)) + +def get_img_from_fig(fig, dpi=120): + buf = 
io.BytesIO() + fig.savefig(buf, format="png", dpi=dpi, bbox_inches="tight", pad_inches=0) + buf.seek(0) + img_arr = np.frombuffer(buf.getvalue(), dtype=np.uint8) + buf.close() + img = cv2.imdecode(img_arr, 1) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGBA) + return img + +def rgb2rgba(color): + return (color[0], color[1], color[2], 255) + +def hex2rgb(hex, number_of_colors=3): + h = hex + rgb = [] + for i in range(number_of_colors): + h = h.lstrip('#') + hex_color = h[0:6] + rgb_color = [int(hex_color[i:i+2], 16) for i in (0, 2 ,4)] + rgb.append(rgb_color) + h = h[6:] + return rgb + +def joints2image(joints_position, colors, transparency=False, H=1000, W=1000, nr_joints=49, imtype=np.uint8, grayscale=False, bg_color=(255, 255, 255)): +# joints_position: [17*2] + nr_joints = joints_position.shape[0] + + if nr_joints == 49: # full joints(49): basic(15) + eyes(2) + toes(2) + hands(30) + limbSeq = [[0, 1], [1, 2], [1, 5], [1, 8], [2, 3], [3, 4], [5, 6], [6, 7], \ + [8, 9], [8, 13], [9, 10], [10, 11], [11, 12], [13, 14], [14, 15], [15, 16], + ]#[0, 17], [0, 18]] #ignore eyes + + L = rgb2rgba(colors[0]) if transparency else colors[0] + M = rgb2rgba(colors[1]) if transparency else colors[1] + R = rgb2rgba(colors[2]) if transparency else colors[2] + + colors_joints = [M, M, L, L, L, R, R, + R, M, L, L, L, L, R, R, R, + R, R, L] + [L] * 15 + [R] * 15 + + colors_limbs = [M, L, R, M, L, L, R, + R, L, R, L, L, L, R, R, R, + R, R] + elif nr_joints == 15: # basic joints(15) + (eyes(2)) + limbSeq = [[0, 1], [1, 2], [1, 5], [1, 8], [2, 3], [3, 4], [5, 6], [6, 7], + [8, 9], [8, 12], [9, 10], [10, 11], [12, 13], [13, 14]] + # [0, 15], [0, 16] two eyes are not drawn + + L = rgb2rgba(colors[0]) if transparency else colors[0] + M = rgb2rgba(colors[1]) if transparency else colors[1] + R = rgb2rgba(colors[2]) if transparency else colors[2] + + colors_joints = [M, M, L, L, L, R, R, + R, M, L, L, L, R, R, R] + + colors_limbs = [M, L, R, M, L, L, R, + R, L, R, L, L, R, R] + elif nr_joints == 17: # H36M, 0: 'root', + # 1: 'rhip', + # 2: 'rkne', + # 3: 'rank', + # 4: 'lhip', + # 5: 'lkne', + # 6: 'lank', + # 7: 'belly', + # 8: 'neck', + # 9: 'nose', + # 10: 'head', + # 11: 'lsho', + # 12: 'lelb', + # 13: 'lwri', + # 14: 'rsho', + # 15: 'relb', + # 16: 'rwri' + limbSeq = [[0, 1], [1, 2], [2, 3], [0, 4], [4, 5], [5, 6], [0, 7], [7, 8], [8, 9], [8, 11], [8, 14], [9, 10], [11, 12], [12, 13], [14, 15], [15, 16]] + + L = rgb2rgba(colors[0]) if transparency else colors[0] + M = rgb2rgba(colors[1]) if transparency else colors[1] + R = rgb2rgba(colors[2]) if transparency else colors[2] + + colors_joints = [M, R, R, R, L, L, L, M, M, M, M, L, L, L, R, R, R] + colors_limbs = [R, R, R, L, L, L, M, M, M, L, R, M, L, L, R, R] + + else: + raise ValueError("Only support number of joints be 49 or 17 or 15") + + if transparency: + canvas = np.zeros(shape=(H, W, 4)) + else: + canvas = np.ones(shape=(H, W, 3)) * np.array(bg_color).reshape([1, 1, 3]) + hips = joints_position[0] + neck = joints_position[8] + torso_length = ((hips[1] - neck[1]) ** 2 + (hips[0] - neck[0]) ** 2) ** 0.5 + head_radius = int(torso_length/4.5) + end_effectors_radius = int(torso_length/15) + end_effectors_radius = 7 + joints_radius = 7 + for i in range(0, len(colors_joints)): + if i in (17, 18): + continue + elif i > 18: + radius = 2 + else: + radius = joints_radius + if len(joints_position[i])==3: # If there is confidence, weigh by confidence + weight = joints_position[i][2] + if weight==0: + continue + cv2.circle(canvas, 
(int(joints_position[i][0]),int(joints_position[i][1])), radius, colors_joints[i], thickness=-1) + + stickwidth = 2 + for i in range(len(limbSeq)): + limb = limbSeq[i] + cur_canvas = canvas.copy() + point1_index = limb[0] + point2_index = limb[1] + point1 = joints_position[point1_index] + point2 = joints_position[point2_index] + if len(point1)==3: # If there is confidence, weigh by confidence + limb_weight = min(point1[2], point2[2]) + if limb_weight==0: + bb = bounding_box(canvas) + canvas_cropped = canvas[:,bb[2]:bb[3], :] + continue + X = [point1[1], point2[1]] + Y = [point1[0], point2[0]] + mX = np.mean(X) + mY = np.mean(Y) + length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5 + alpha = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) + polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(alpha), 0, 360, 1) + cv2.fillConvexPoly(cur_canvas, polygon, colors_limbs[i]) + canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) + bb = bounding_box(canvas) + canvas_cropped = canvas[:,bb[2]:bb[3], :] + canvas = canvas.astype(imtype) + canvas_cropped = canvas_cropped.astype(imtype) + if grayscale: + if transparency: + canvas = cv2.cvtColor(canvas, cv2.COLOR_RGBA2GRAY) + canvas_cropped = cv2.cvtColor(canvas_cropped, cv2.COLOR_RGBA2GRAY) + else: + canvas = cv2.cvtColor(canvas, cv2.COLOR_RGB2GRAY) + canvas_cropped = cv2.cvtColor(canvas_cropped, cv2.COLOR_RGB2GRAY) + return [canvas, canvas_cropped] + + +def motion2video(motion, save_path, colors, h=512, w=512, bg_color=(255, 255, 255), transparency=False, motion_tgt=None, fps=25, save_frame=False, grayscale=False, show_progress=True, as_array=False): + nr_joints = motion.shape[0] +# as_array = save_path.endswith(".npy") + vlen = motion.shape[-1] + + out_array = np.zeros([vlen, h, w, 3]) if as_array else None + videowriter = None if as_array else imageio.get_writer(save_path, fps=fps) + + if save_frame: + frames_dir = save_path[:-4] + '-frames' + ensure_dir(frames_dir) + + iterator = range(vlen) + if show_progress: iterator = tqdm(iterator) + for i in iterator: + [img, img_cropped] = joints2image(motion[:, :, i], colors, transparency=transparency, bg_color=bg_color, H=h, W=w, nr_joints=nr_joints, grayscale=grayscale) + if motion_tgt is not None: + [img_tgt, img_tgt_cropped] = joints2image(motion_tgt[:, :, i], colors, transparency=transparency, bg_color=bg_color, H=h, W=w, nr_joints=nr_joints, grayscale=grayscale) + img_ori = img.copy() + img = cv2.addWeighted(img_tgt, 0.3, img_ori, 0.7, 0) + img_cropped = cv2.addWeighted(img_tgt, 0.3, img_ori, 0.7, 0) + bb = bounding_box(img_cropped) + img_cropped = img_cropped[:, bb[2]:bb[3], :] + if save_frame: + save_image(img_cropped, os.path.join(frames_dir, "%04d.png" % i)) + if as_array: out_array[i] = img + else: videowriter.append_data(img) + + if not as_array: + videowriter.close() + + return out_array + +def motion2video_3d(motion, save_path, fps=25, keep_imgs = False): +# motion: (17,3,N) + videowriter = imageio.get_writer(save_path, fps=fps) + vlen = motion.shape[-1] + save_name = save_path.split('.')[0] + frames = [] + joint_pairs = [[0, 1], [1, 2], [2, 3], [0, 4], [4, 5], [5, 6], [0, 7], [7, 8], [8, 9], [8, 11], [8, 14], [9, 10], [11, 12], [12, 13], [14, 15], [15, 16]] + joint_pairs_left = [[8, 11], [11, 12], [12, 13], [0, 4], [4, 5], [5, 6]] + joint_pairs_right = [[8, 14], [14, 15], [15, 16], [0, 1], [1, 2], [2, 3]] + + color_mid = "#00457E" + color_left = "#02315E" + color_right = "#2F70AF" + for f in tqdm(range(vlen)): + j3d = motion[:,:,f] + fig = 
plt.figure(0, figsize=(10, 10)) + ax = plt.axes(projection="3d") + ax.set_xlim(-512, 0) + ax.set_ylim(-256, 256) + ax.set_zlim(-512, 0) + # ax.set_xlabel('X') + # ax.set_ylabel('Y') + # ax.set_zlabel('Z') + ax.view_init(elev=12., azim=80) + plt.tick_params(left = False, right = False , labelleft = False , + labelbottom = False, bottom = False) + for i in range(len(joint_pairs)): + limb = joint_pairs[i] + xs, ys, zs = [np.array([j3d[limb[0], j], j3d[limb[1], j]]) for j in range(3)] + if joint_pairs[i] in joint_pairs_left: + ax.plot(-xs, -zs, -ys, color=color_left, lw=3, marker='o', markerfacecolor='w', markersize=3, markeredgewidth=2) # axis transformation for visualization + elif joint_pairs[i] in joint_pairs_right: + ax.plot(-xs, -zs, -ys, color=color_right, lw=3, marker='o', markerfacecolor='w', markersize=3, markeredgewidth=2) # axis transformation for visualization + else: + ax.plot(-xs, -zs, -ys, color=color_mid, lw=3, marker='o', markerfacecolor='w', markersize=3, markeredgewidth=2) # axis transformation for visualization + + frame_vis = get_img_from_fig(fig) + videowriter.append_data(frame_vis) + plt.close() + videowriter.close() + +def motion2video_mesh(motion, save_path, fps=25, keep_imgs = False, draw_face=True): + videowriter = imageio.get_writer(save_path, fps=fps) + vlen = motion.shape[-1] + draw_skele = (motion.shape[0]==17) + save_name = save_path.split('.')[0] + smpl_faces = get_smpl_faces() + frames = [] + joint_pairs = [[0, 1], [1, 2], [2, 3], [0, 4], [4, 5], [5, 6], [0, 7], [7, 8], [8, 9], [8, 11], [8, 14], [9, 10], [11, 12], [12, 13], [14, 15], [15, 16]] + + + X, Y, Z = motion[:, 0], motion[:, 1], motion[:, 2] + max_range = np.array([X.max()-X.min(), Y.max()-Y.min(), Z.max()-Z.min()]).max() / 2.0 + mid_x = (X.max()+X.min()) * 0.5 + mid_y = (Y.max()+Y.min()) * 0.5 + mid_z = (Z.max()+Z.min()) * 0.5 + + for f in tqdm(range(vlen)): + j3d = motion[:,:,f] + plt.gca().set_axis_off() + plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0) + plt.gca().xaxis.set_major_locator(plt.NullLocator()) + plt.gca().yaxis.set_major_locator(plt.NullLocator()) + fig = plt.figure(0, figsize=(8, 8)) + ax = plt.axes(projection="3d", proj_type = 'ortho') + ax.set_xlim(mid_x - max_range, mid_x + max_range) + ax.set_ylim(mid_y - max_range, mid_y + max_range) + ax.set_zlim(mid_z - max_range, mid_z + max_range) + ax.view_init(elev=-90, azim=-90) + plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0) + plt.margins(0, 0, 0) + plt.gca().xaxis.set_major_locator(plt.NullLocator()) + plt.gca().yaxis.set_major_locator(plt.NullLocator()) + plt.axis('off') + plt.xticks([]) + plt.yticks([]) + + # plt.savefig("filename.png", transparent=True, bbox_inches="tight", pad_inches=0) + + if draw_skele: + for i in range(len(joint_pairs)): + limb = joint_pairs[i] + xs, ys, zs = [np.array([j3d[limb[0], j], j3d[limb[1], j]]) for j in range(3)] + ax.plot(-xs, -zs, -ys, c=[0,0,0], lw=3, marker='o', markerfacecolor='w', markersize=3, markeredgewidth=2) # axis transformation for visualization + elif draw_face: + ax.plot_trisurf(j3d[:, 0], j3d[:, 1], triangles=smpl_faces, Z=j3d[:, 2], color=(166/255.0,188/255.0,218/255.0,0.9)) + else: + ax.scatter(j3d[:, 0], j3d[:, 1], j3d[:, 2], s=3, c='w', edgecolors='grey') + frame_vis = get_img_from_fig(fig, dpi=128) + plt.cla() + videowriter.append_data(frame_vis) + plt.close() + videowriter.close() + +def save_image(image_numpy, image_path): + image_pil = Image.fromarray(image_numpy) + image_pil.save(image_path) + +def bounding_box(img): + a = 
np.where(img != 0) + bbox = np.min(a[0]), np.max(a[0]), np.min(a[1]), np.max(a[1]) + return bbox diff --git a/mmpose/.mim/configs b/mmpose/.mim/configs new file mode 120000 index 0000000..5992d10 --- /dev/null +++ b/mmpose/.mim/configs @@ -0,0 +1 @@ +../../configs \ No newline at end of file diff --git a/mmpose/.mim/demo b/mmpose/.mim/demo new file mode 120000 index 0000000..bf71256 --- /dev/null +++ b/mmpose/.mim/demo @@ -0,0 +1 @@ +../../demo \ No newline at end of file diff --git a/mmpose/.mim/model-index.yml b/mmpose/.mim/model-index.yml new file mode 120000 index 0000000..a18c0b3 --- /dev/null +++ b/mmpose/.mim/model-index.yml @@ -0,0 +1 @@ +../../model-index.yml \ No newline at end of file diff --git a/mmpose/.mim/tools b/mmpose/.mim/tools new file mode 120000 index 0000000..31941e9 --- /dev/null +++ b/mmpose/.mim/tools @@ -0,0 +1 @@ +../../tools \ No newline at end of file diff --git a/mmpose/__init__.py b/mmpose/__init__.py new file mode 100644 index 0000000..e52beb9 --- /dev/null +++ b/mmpose/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv + +from .version import __version__, short_version + + +def digit_version(version_str): + digit_version = [] + for x in version_str.split('.'): + if x.isdigit(): + digit_version.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + digit_version.append(int(patch_version[0]) - 1) + digit_version.append(int(patch_version[1])) + return digit_version + + +mmcv_minimum_version = '1.3.8' +mmcv_maximum_version = '1.5.0' +mmcv_version = digit_version(mmcv.__version__) + + +assert (mmcv_version >= digit_version(mmcv_minimum_version) + and mmcv_version <= digit_version(mmcv_maximum_version)), \ + f'MMCV=={mmcv.__version__} is used but incompatible. ' \ + f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.' + +__all__ = ['__version__', 'short_version'] diff --git a/mmpose/apis/__init__.py b/mmpose/apis/__init__.py new file mode 100644 index 0000000..0e263ed --- /dev/null +++ b/mmpose/apis/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .inference import (inference_bottom_up_pose_model, + inference_top_down_pose_model, init_pose_model, + process_mmdet_results, vis_pose_result) +from .inference_3d import (extract_pose_sequence, inference_interhand_3d_model, + inference_mesh_model, inference_pose_lifter_model, + vis_3d_mesh_result, vis_3d_pose_result) +from .inference_tracking import get_track_id, vis_pose_tracking_result +from .test import multi_gpu_test, single_gpu_test +from .train import init_random_seed, train_model + +__all__ = [ + 'train_model', 'init_pose_model', 'inference_top_down_pose_model', + 'inference_bottom_up_pose_model', 'multi_gpu_test', 'single_gpu_test', + 'vis_pose_result', 'get_track_id', 'vis_pose_tracking_result', + 'inference_pose_lifter_model', 'vis_3d_pose_result', + 'inference_interhand_3d_model', 'extract_pose_sequence', + 'inference_mesh_model', 'vis_3d_mesh_result', 'process_mmdet_results', + 'init_random_seed' +] diff --git a/mmpose/apis/inference.py b/mmpose/apis/inference.py new file mode 100644 index 0000000..5363d40 --- /dev/null +++ b/mmpose/apis/inference.py @@ -0,0 +1,833 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
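digit_version above turns a version string into a list of ints that compares correctly under Python's lexicographic list ordering, treating an rc tag as a pre-release of the preceding patch level; that is what makes the mmcv range assertion work. For example:

print(digit_version('1.3.8'))     # [1, 3, 8]
print(digit_version('1.5.0rc1'))  # [1, 5, -1, 1] -> sorts below the final 1.5.0 release
print(digit_version('1.3.8') <= digit_version('1.4.2') <= digit_version('1.5.0'))  # True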
+import os +import warnings + +import mmcv +import numpy as np +import torch +from mmcv.parallel import collate, scatter +from mmcv.runner import load_checkpoint +from PIL import Image + +from mmpose.core.post_processing import oks_nms +from mmpose.datasets.dataset_info import DatasetInfo +from mmpose.datasets.pipelines import Compose +from mmpose.models import build_posenet +from mmpose.utils.hooks import OutputHook + +os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE' + + +def init_pose_model(config, checkpoint=None, device='cuda:0'): + """Initialize a pose model from config file. + + Args: + config (str or :obj:`mmcv.Config`): Config file path or the config + object. + checkpoint (str, optional): Checkpoint path. If left as None, the model + will not load any weights. + + Returns: + nn.Module: The constructed detector. + """ + if isinstance(config, str): + config = mmcv.Config.fromfile(config) + elif not isinstance(config, mmcv.Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + config.model.pretrained = None + model = build_posenet(config.model) + if checkpoint is not None: + # load model checkpoint + load_checkpoint(model, checkpoint, map_location='cpu') + # save the config in the model for convenience + model.cfg = config + model.to(device) + model.eval() + return model + + +def _xyxy2xywh(bbox_xyxy): + """Transform the bbox format from x1y1x2y2 to xywh. + + Args: + bbox_xyxy (np.ndarray): Bounding boxes (with scores), shaped (n, 4) or + (n, 5). (left, top, right, bottom, [score]) + + Returns: + np.ndarray: Bounding boxes (with scores), + shaped (n, 4) or (n, 5). (left, top, width, height, [score]) + """ + bbox_xywh = bbox_xyxy.copy() + bbox_xywh[:, 2] = bbox_xywh[:, 2] - bbox_xywh[:, 0] + 1 + bbox_xywh[:, 3] = bbox_xywh[:, 3] - bbox_xywh[:, 1] + 1 + + return bbox_xywh + + +def _xywh2xyxy(bbox_xywh): + """Transform the bbox format from xywh to x1y1x2y2. + + Args: + bbox_xywh (ndarray): Bounding boxes (with scores), + shaped (n, 4) or (n, 5). (left, top, width, height, [score]) + Returns: + np.ndarray: Bounding boxes (with scores), shaped (n, 4) or + (n, 5). (left, top, right, bottom, [score]) + """ + bbox_xyxy = bbox_xywh.copy() + bbox_xyxy[:, 2] = bbox_xyxy[:, 2] + bbox_xyxy[:, 0] - 1 + bbox_xyxy[:, 3] = bbox_xyxy[:, 3] + bbox_xyxy[:, 1] - 1 + + return bbox_xyxy + + +def _box2cs(cfg, box): + """This encodes bbox(x,y,w,h) into (center, scale) + + Args: + x, y, w, h + + Returns: + tuple: A tuple containing center and scale. + + - np.ndarray[float32](2,): Center of the bbox (x, y). + - np.ndarray[float32](2,): Scale of the bbox w & h. + """ + + x, y, w, h = box[:4] + input_size = cfg.data_cfg['image_size'] + aspect_ratio = input_size[0] / input_size[1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + scale = scale * 1.25 + + return center, scale + + +def _inference_single_pose_model(model, + img_or_path, + bboxes, + dataset='TopDownCocoDataset', + dataset_info=None, + return_heatmap=False): + """Inference human bounding boxes. + + Note: + - num_bboxes: N + - num_keypoints: K + + Args: + model (nn.Module): The loaded pose model. + img_or_path (str | np.ndarray): Image filename or loaded image. + bboxes (list | np.ndarray): All bounding boxes (with scores), + shaped (N, 4) or (N, 5). 
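The two private helpers convert between the detector's corner format (x1, y1, x2, y2) and the (x, y, w, h) format used internally, and _box2cs then turns a box into the center/scale pair consumed by the top-down pipeline (pixel std 200, scale padded by 1.25). A small round-trip sketch:

import numpy as np

boxes_xyxy = np.array([[100., 150., 300., 550., 0.98]])   # one detection with score
boxes_xywh = _xyxy2xywh(boxes_xyxy)                        # [[100., 150., 201., 401., 0.98]]
boxes_back = _xywh2xyxy(boxes_xywh)                        # recovers the original corners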
(left, top, width, height, [score]) + where N is number of bounding boxes. + dataset (str): Dataset name. Deprecated. + dataset_info (DatasetInfo): A class containing all dataset info. + outputs (list[str] | tuple[str]): Names of layers whose output is + to be returned, default: None + + Returns: + ndarray[NxKx3]: Predicted pose x, y, score. + heatmap[N, K, H, W]: Model output heatmap. + """ + + cfg = model.cfg + device = next(model.parameters()).device + if device.type == 'cpu': + device = -1 + + # build the data pipeline + test_pipeline = Compose(cfg.test_pipeline) + + assert len(bboxes[0]) in [4, 5] + + if dataset_info is not None: + dataset_name = dataset_info.dataset_name + flip_pairs = dataset_info.flip_pairs + else: + warnings.warn( + 'dataset is deprecated.' + 'Please set `dataset_info` in the config.' + 'Check https://github.com/open-mmlab/mmpose/pull/663 for details.', + DeprecationWarning) + # TODO: These will be removed in the later versions. + if dataset in ('TopDownCocoDataset', 'TopDownOCHumanDataset', + 'AnimalMacaqueDataset'): + flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], + [13, 14], [15, 16]] + elif dataset == 'TopDownCocoWholeBodyDataset': + body = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], + [13, 14], [15, 16]] + foot = [[17, 20], [18, 21], [19, 22]] + + face = [[23, 39], [24, 38], [25, 37], [26, 36], [27, 35], [28, 34], + [29, 33], [30, 32], [40, 49], [41, 48], [42, 47], [43, 46], + [44, 45], [54, 58], [55, 57], [59, 68], [60, 67], [61, 66], + [62, 65], [63, 70], [64, 69], [71, 77], [72, 76], [73, 75], + [78, 82], [79, 81], [83, 87], [84, 86], [88, 90]] + + hand = [[91, 112], [92, 113], [93, 114], [94, 115], [95, 116], + [96, 117], [97, 118], [98, 119], [99, 120], [100, 121], + [101, 122], [102, 123], [103, 124], [104, 125], [105, 126], + [106, 127], [107, 128], [108, 129], [109, 130], [110, 131], + [111, 132]] + flip_pairs = body + foot + face + hand + elif dataset == 'TopDownAicDataset': + flip_pairs = [[0, 3], [1, 4], [2, 5], [6, 9], [7, 10], [8, 11]] + elif dataset == 'TopDownMpiiDataset': + flip_pairs = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13]] + elif dataset == 'TopDownMpiiTrbDataset': + flip_pairs = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], + [14, 15], [16, 22], [28, 34], [17, 23], [29, 35], + [18, 24], [30, 36], [19, 25], [31, 37], [20, 26], + [32, 38], [21, 27], [33, 39]] + elif dataset in ('OneHand10KDataset', 'FreiHandDataset', + 'PanopticDataset', 'InterHand2DDataset'): + flip_pairs = [] + elif dataset in 'Face300WDataset': + flip_pairs = [[0, 16], [1, 15], [2, 14], [3, 13], [4, 12], [5, 11], + [6, 10], [7, 9], [17, 26], [18, 25], [19, 24], + [20, 23], [21, 22], [31, 35], [32, 34], [36, 45], + [37, 44], [38, 43], [39, 42], [40, 47], [41, 46], + [48, 54], [49, 53], [50, 52], [61, 63], [60, 64], + [67, 65], [58, 56], [59, 55]] + + elif dataset in 'FaceAFLWDataset': + flip_pairs = [[0, 5], [1, 4], [2, 3], [6, 11], [7, 10], [8, 9], + [12, 14], [15, 17]] + + elif dataset in 'FaceCOFWDataset': + flip_pairs = [[0, 1], [4, 6], [2, 3], [5, 7], [8, 9], [10, 11], + [12, 14], [16, 17], [13, 15], [18, 19], [22, 23]] + + elif dataset in 'FaceWFLWDataset': + flip_pairs = [[0, 32], [1, 31], [2, 30], [3, 29], [4, 28], [5, 27], + [6, 26], [7, 25], [8, 24], [9, 23], [10, 22], + [11, 21], [12, 20], [13, 19], [14, 18], [15, 17], + [33, 46], [34, 45], [35, 44], [36, 43], [37, 42], + [38, 50], [39, 49], [40, 48], [41, 47], [60, 72], + [61, 71], [62, 70], [63, 69], [64, 68], [65, 75], + [66, 74], [67, 73], [55, 59], [56, 58], 
[76, 82], + [77, 81], [78, 80], [87, 83], [86, 84], [88, 92], + [89, 91], [95, 93], [96, 97]] + + elif dataset in 'AnimalFlyDataset': + flip_pairs = [[1, 2], [6, 18], [7, 19], [8, 20], [9, 21], [10, 22], + [11, 23], [12, 24], [13, 25], [14, 26], [15, 27], + [16, 28], [17, 29], [30, 31]] + elif dataset in 'AnimalHorse10Dataset': + flip_pairs = [] + + elif dataset in 'AnimalLocustDataset': + flip_pairs = [[5, 20], [6, 21], [7, 22], [8, 23], [9, 24], + [10, 25], [11, 26], [12, 27], [13, 28], [14, 29], + [15, 30], [16, 31], [17, 32], [18, 33], [19, 34]] + + elif dataset in 'AnimalZebraDataset': + flip_pairs = [[3, 4], [5, 6]] + + elif dataset in 'AnimalPoseDataset': + flip_pairs = [[0, 1], [2, 3], [8, 9], [10, 11], [12, 13], [14, 15], + [16, 17], [18, 19]] + else: + raise NotImplementedError() + dataset_name = dataset + + batch_data = [] + for bbox in bboxes: + center, scale = _box2cs(cfg, bbox) + + # prepare data + data = { + 'center': + center, + 'scale': + scale, + 'bbox_score': + bbox[4] if len(bbox) == 5 else 1, + 'bbox_id': + 0, # need to be assigned if batch_size > 1 + 'dataset': + dataset_name, + 'joints_3d': + np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32), + 'joints_3d_visible': + np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32), + 'rotation': + 0, + 'ann_info': { + 'image_size': np.array(cfg.data_cfg['image_size']), + 'num_joints': cfg.data_cfg['num_joints'], + 'flip_pairs': flip_pairs + } + } + if isinstance(img_or_path, np.ndarray): + data['img'] = img_or_path + else: + data['image_file'] = img_or_path + + data = test_pipeline(data) + batch_data.append(data) + + batch_data = collate(batch_data, samples_per_gpu=len(batch_data)) + batch_data = scatter(batch_data, [device])[0] + + # forward the model + with torch.no_grad(): + result = model( + img=batch_data['img'], + img_metas=batch_data['img_metas'], + return_loss=False, + return_heatmap=return_heatmap) + + return result['preds'], result['output_heatmap'] + + +def inference_top_down_pose_model(model, + img_or_path, + person_results=None, + bbox_thr=None, + format='xywh', + dataset='TopDownCocoDataset', + dataset_info=None, + return_heatmap=False, + outputs=None): + """Inference a single image with a list of person bounding boxes. + + Note: + - num_people: P + - num_keypoints: K + - bbox height: H + - bbox width: W + + Args: + model (nn.Module): The loaded pose model. + img_or_path (str| np.ndarray): Image filename or loaded image. + person_results (list(dict), optional): a list of detected persons that + contains ``bbox`` and/or ``track_id``: + + - ``bbox`` (4, ) or (5, ): The person bounding box, which contains + 4 box coordinates (and score). + - ``track_id`` (int): The unique id for each human instance. If + not provided, a dummy person result with a bbox covering + the entire image will be used. Default: None. + bbox_thr (float | None): Threshold for bounding boxes. Only bboxes + with higher scores will be fed into the pose detector. + If bbox_thr is None, all boxes will be used. + format (str): bbox format ('xyxy' | 'xywh'). Default: 'xywh'. + + - `xyxy` means (left, top, right, bottom), + - `xywh` means (left, top, width, height). + dataset (str): Dataset name, e.g. 'TopDownCocoDataset'. + It is deprecated. Please use dataset_info instead. + dataset_info (DatasetInfo): A class containing all dataset info. + return_heatmap (bool) : Flag to return heatmap, default: False + outputs (list(str) | tuple(str)) : Names of layers whose outputs + need to be returned. Default: None. 
+ + Returns: + tuple: + - pose_results (list[dict]): The bbox & pose info. \ + Each item in the list is a dictionary, \ + containing the bbox: (left, top, right, bottom, [score]) \ + and the pose (ndarray[Kx3]): x, y, score. + - returned_outputs (list[dict[np.ndarray[N, K, H, W] | \ + torch.Tensor[N, K, H, W]]]): \ + Output feature maps from layers specified in `outputs`. \ + Includes 'heatmap' if `return_heatmap` is True. + """ + # get dataset info + if (dataset_info is None and hasattr(model, 'cfg') + and 'dataset_info' in model.cfg): + dataset_info = DatasetInfo(model.cfg.dataset_info) + if dataset_info is None: + warnings.warn( + 'dataset is deprecated.' + 'Please set `dataset_info` in the config.' + 'Check https://github.com/open-mmlab/mmpose/pull/663' + ' for details.', DeprecationWarning) + + # only two kinds of bbox format is supported. + assert format in ['xyxy', 'xywh'] + + pose_results = [] + returned_outputs = [] + + if person_results is None: + # create dummy person results + if isinstance(img_or_path, str): + width, height = Image.open(img_or_path).size + else: + height, width = img_or_path.shape[:2] + person_results = [{'bbox': np.array([0, 0, width, height])}] + + if len(person_results) == 0: + return pose_results, returned_outputs + + # Change for-loop preprocess each bbox to preprocess all bboxes at once. + bboxes = np.array([box['bbox'] for box in person_results]) + + # Select bboxes by score threshold + if bbox_thr is not None: + assert bboxes.shape[1] == 5 + valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0] + bboxes = bboxes[valid_idx] + person_results = [person_results[i] for i in valid_idx] + + if format == 'xyxy': + bboxes_xyxy = bboxes + bboxes_xywh = _xyxy2xywh(bboxes) + else: + # format is already 'xywh' + bboxes_xywh = bboxes + bboxes_xyxy = _xywh2xyxy(bboxes) + + # if bbox_thr remove all bounding box + if len(bboxes_xywh) == 0: + return [], [] + + with OutputHook(model, outputs=outputs, as_tensor=False) as h: + # poses is results['pred'] # N x 17x 3 + poses, heatmap = _inference_single_pose_model( + model, + img_or_path, + bboxes_xywh, + dataset=dataset, + dataset_info=dataset_info, + return_heatmap=return_heatmap) + + if return_heatmap: + h.layer_outputs['heatmap'] = heatmap + + returned_outputs.append(h.layer_outputs) + + assert len(poses) == len(person_results), print( + len(poses), len(person_results), len(bboxes_xyxy)) + for pose, person_result, bbox_xyxy in zip(poses, person_results, + bboxes_xyxy): + pose_result = person_result.copy() + pose_result['keypoints'] = pose + pose_result['bbox'] = bbox_xyxy + pose_results.append(pose_result) + + return pose_results, returned_outputs + + +def inference_bottom_up_pose_model(model, + img_or_path, + dataset='BottomUpCocoDataset', + dataset_info=None, + pose_nms_thr=0.9, + return_heatmap=False, + outputs=None): + """Inference a single image with a bottom-up pose model. + + Note: + - num_people: P + - num_keypoints: K + - bbox height: H + - bbox width: W + + Args: + model (nn.Module): The loaded pose model. + img_or_path (str| np.ndarray): Image filename or loaded image. + dataset (str): Dataset name, e.g. 'BottomUpCocoDataset'. + It is deprecated. Please use dataset_info instead. + dataset_info (DatasetInfo): A class containing all dataset info. + pose_nms_thr (float): retain oks overlap < pose_nms_thr, default: 0.9. + return_heatmap (bool) : Flag to return heatmap, default: False. + outputs (list(str) | tuple(str)) : Names of layers whose outputs + need to be returned, default: None. 
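+
+    Example (illustrative sketch; the config, checkpoint and image paths are
+        placeholders)::
+
+        from mmpose.apis import (init_pose_model,
+                                 inference_bottom_up_pose_model)
+
+        pose_model = init_pose_model(
+            'path/to/bottom_up_config.py',       # placeholder config
+            'path/to/bottom_up_checkpoint.pth',  # placeholder checkpoint
+            device='cuda:0')
+        pose_results, _ = inference_bottom_up_pose_model(
+            pose_model, 'demo.jpg', pose_nms_thr=0.9)
+        # pose_results[i] holds 'keypoints' (ndarray[K, 3]), 'score' and 'area'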
+ + Returns: + tuple: + - pose_results (list[np.ndarray]): The predicted pose info. \ + The length of the list is the number of people (P). \ + Each item in the list is a ndarray, containing each \ + person's pose (np.ndarray[Kx3]): x, y, score. + - returned_outputs (list[dict[np.ndarray[N, K, H, W] | \ + torch.Tensor[N, K, H, W]]]): \ + Output feature maps from layers specified in `outputs`. \ + Includes 'heatmap' if `return_heatmap` is True. + """ + # get dataset info + if (dataset_info is None and hasattr(model, 'cfg') + and 'dataset_info' in model.cfg): + dataset_info = DatasetInfo(model.cfg.dataset_info) + + if dataset_info is not None: + dataset_name = dataset_info.dataset_name + flip_index = dataset_info.flip_index + sigmas = getattr(dataset_info, 'sigmas', None) + else: + warnings.warn( + 'dataset is deprecated.' + 'Please set `dataset_info` in the config.' + 'Check https://github.com/open-mmlab/mmpose/pull/663 for details.', + DeprecationWarning) + assert (dataset == 'BottomUpCocoDataset') + dataset_name = dataset + flip_index = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15] + sigmas = None + + pose_results = [] + returned_outputs = [] + + cfg = model.cfg + device = next(model.parameters()).device + if device.type == 'cpu': + device = -1 + + # build the data pipeline + test_pipeline = Compose(cfg.test_pipeline) + + # prepare data + data = { + 'dataset': dataset_name, + 'ann_info': { + 'image_size': np.array(cfg.data_cfg['image_size']), + 'num_joints': cfg.data_cfg['num_joints'], + 'flip_index': flip_index, + } + } + if isinstance(img_or_path, np.ndarray): + data['img'] = img_or_path + else: + data['image_file'] = img_or_path + + data = test_pipeline(data) + data = collate([data], samples_per_gpu=1) + data = scatter(data, [device])[0] + + with OutputHook(model, outputs=outputs, as_tensor=False) as h: + # forward the model + with torch.no_grad(): + result = model( + img=data['img'], + img_metas=data['img_metas'], + return_loss=False, + return_heatmap=return_heatmap) + + if return_heatmap: + h.layer_outputs['heatmap'] = result['output_heatmap'] + + returned_outputs.append(h.layer_outputs) + + for idx, pred in enumerate(result['preds']): + area = (np.max(pred[:, 0]) - np.min(pred[:, 0])) * ( + np.max(pred[:, 1]) - np.min(pred[:, 1])) + pose_results.append({ + 'keypoints': pred[:, :3], + 'score': result['scores'][idx], + 'area': area, + }) + + # pose nms + score_per_joint = cfg.model.test_cfg.get('score_per_joint', False) + keep = oks_nms( + pose_results, + pose_nms_thr, + sigmas, + score_per_joint=score_per_joint) + pose_results = [pose_results[_keep] for _keep in keep] + + return pose_results, returned_outputs + + +def vis_pose_result(model, + img, + result, + radius=4, + thickness=1, + kpt_score_thr=0.3, + bbox_color='green', + dataset='TopDownCocoDataset', + dataset_info=None, + show=False, + out_file=None): + """Visualize the detection results on the image. + + Args: + model (nn.Module): The loaded detector. + img (str | np.ndarray): Image filename or loaded image. + result (list[dict]): The results to draw over `img` + (bbox_result, pose_result). + radius (int): Radius of circles. + thickness (int): Thickness of lines. + kpt_score_thr (float): The threshold to visualize the keypoints. + skeleton (list[tuple()]): Default None. + show (bool): Whether to show the image. Default True. + out_file (str|None): The filename of the output visualization image. 
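+
+    Example (illustrative sketch; assumes ``pose_model`` and ``pose_results``
+        come from ``inference_top_down_pose_model`` above, and the image and
+        output paths are placeholders)::
+
+        vis_pose_result(
+            pose_model, 'demo.jpg', pose_results,
+            radius=4, thickness=1, kpt_score_thr=0.3,
+            show=False, out_file='vis_demo.jpg')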
+ """ + + # get dataset info + if (dataset_info is None and hasattr(model, 'cfg') + and 'dataset_info' in model.cfg): + dataset_info = DatasetInfo(model.cfg.dataset_info) + + if dataset_info is not None: + skeleton = dataset_info.skeleton + pose_kpt_color = dataset_info.pose_kpt_color + pose_link_color = dataset_info.pose_link_color + else: + warnings.warn( + 'dataset is deprecated.' + 'Please set `dataset_info` in the config.' + 'Check https://github.com/open-mmlab/mmpose/pull/663 for details.', + DeprecationWarning) + # TODO: These will be removed in the later versions. + palette = np.array([[255, 128, 0], [255, 153, 51], [255, 178, 102], + [230, 230, 0], [255, 153, 255], [153, 204, 255], + [255, 102, 255], [255, 51, 255], [102, 178, 255], + [51, 153, 255], [255, 153, 153], [255, 102, 102], + [255, 51, 51], [153, 255, 153], [102, 255, 102], + [51, 255, 51], [0, 255, 0], [0, 0, 255], + [255, 0, 0], [255, 255, 255]]) + + if dataset in ('TopDownCocoDataset', 'BottomUpCocoDataset', + 'TopDownOCHumanDataset', 'AnimalMacaqueDataset'): + # show the results + skeleton = [[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], + [5, 11], [6, 12], [5, 6], [5, 7], [6, 8], [7, 9], + [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], + [3, 5], [4, 6]] + + pose_link_color = palette[[ + 0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16 + ]] + pose_kpt_color = palette[[ + 16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0 + ]] + + elif dataset == 'TopDownCocoWholeBodyDataset': + # show the results + skeleton = [[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], + [5, 11], [6, 12], [5, 6], [5, 7], [6, 8], [7, 9], + [8, 10], [1, 2], [0, 1], [0, 2], + [1, 3], [2, 4], [3, 5], [4, 6], [15, 17], [15, 18], + [15, 19], [16, 20], [16, 21], [16, 22], [91, 92], + [92, 93], [93, 94], [94, 95], [91, 96], [96, 97], + [97, 98], [98, 99], [91, 100], [100, 101], [101, 102], + [102, 103], [91, 104], [104, 105], [105, 106], + [106, 107], [91, 108], [108, 109], [109, 110], + [110, 111], [112, 113], [113, 114], [114, 115], + [115, 116], [112, 117], [117, 118], [118, 119], + [119, 120], [112, 121], [121, 122], [122, 123], + [123, 124], [112, 125], [125, 126], [126, 127], + [127, 128], [112, 129], [129, 130], [130, 131], + [131, 132]] + + pose_link_color = palette[[ + 0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16 + ] + [16, 16, 16, 16, 16, 16] + [ + 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16, 16, + 16 + ] + [ + 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16, 16, + 16 + ]] + pose_kpt_color = palette[ + [16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0] + + [0, 0, 0, 0, 0, 0] + [19] * (68 + 42)] + + elif dataset == 'TopDownAicDataset': + skeleton = [[2, 1], [1, 0], [0, 13], [13, 3], [3, 4], [4, 5], + [8, 7], [7, 6], [6, 9], [9, 10], [10, 11], [12, 13], + [0, 6], [3, 9]] + + pose_link_color = palette[[ + 9, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 0, 7, 7 + ]] + pose_kpt_color = palette[[ + 9, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 0, 0 + ]] + + elif dataset == 'TopDownMpiiDataset': + skeleton = [[0, 1], [1, 2], [2, 6], [6, 3], [3, 4], [4, 5], [6, 7], + [7, 8], [8, 9], [8, 12], [12, 11], [11, 10], [8, 13], + [13, 14], [14, 15]] + + pose_link_color = palette[[ + 16, 16, 16, 16, 16, 16, 7, 7, 0, 9, 9, 9, 9, 9, 9 + ]] + pose_kpt_color = palette[[ + 16, 16, 16, 16, 16, 16, 7, 7, 0, 0, 9, 9, 9, 9, 9, 9 + ]] + + elif dataset == 'TopDownMpiiTrbDataset': + skeleton = [[12, 13], [13, 0], [13, 1], [0, 2], [1, 3], [2, 4], + [3, 5], [0, 6], [1, 7], [6, 7], [6, 8], [7, + 9], [8, 10], + [9, 
11], [14, 15], [16, 17], [18, 19], [20, 21], + [22, 23], [24, 25], [26, 27], [28, 29], [30, 31], + [32, 33], [34, 35], [36, 37], [38, 39]] + + pose_link_color = palette[[16] * 14 + [19] * 13] + pose_kpt_color = palette[[16] * 14 + [0] * 26] + + elif dataset in ('OneHand10KDataset', 'FreiHandDataset', + 'PanopticDataset'): + skeleton = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], + [7, 8], [0, 9], [9, 10], [10, 11], [11, 12], [0, 13], + [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], + [18, 19], [19, 20]] + + pose_link_color = palette[[ + 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16, 16, + 16 + ]] + pose_kpt_color = palette[[ + 0, 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16, + 16, 16 + ]] + + elif dataset == 'InterHand2DDataset': + skeleton = [[0, 1], [1, 2], [2, 3], [4, 5], [5, 6], [6, 7], [8, 9], + [9, 10], [10, 11], [12, 13], [13, 14], [14, 15], + [16, 17], [17, 18], [18, 19], [3, 20], [7, 20], + [11, 20], [15, 20], [19, 20]] + + pose_link_color = palette[[ + 0, 0, 0, 4, 4, 4, 8, 8, 8, 12, 12, 12, 16, 16, 16, 0, 4, 8, 12, + 16 + ]] + pose_kpt_color = palette[[ + 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16, 16, + 16, 0 + ]] + + elif dataset == 'Face300WDataset': + # show the results + skeleton = [] + + pose_link_color = palette[[]] + pose_kpt_color = palette[[19] * 68] + kpt_score_thr = 0 + + elif dataset == 'FaceAFLWDataset': + # show the results + skeleton = [] + + pose_link_color = palette[[]] + pose_kpt_color = palette[[19] * 19] + kpt_score_thr = 0 + + elif dataset == 'FaceCOFWDataset': + # show the results + skeleton = [] + + pose_link_color = palette[[]] + pose_kpt_color = palette[[19] * 29] + kpt_score_thr = 0 + + elif dataset == 'FaceWFLWDataset': + # show the results + skeleton = [] + + pose_link_color = palette[[]] + pose_kpt_color = palette[[19] * 98] + kpt_score_thr = 0 + + elif dataset == 'AnimalHorse10Dataset': + skeleton = [[0, 1], [1, 12], [12, 16], [16, 21], [21, 17], + [17, 11], [11, 10], [10, 8], [8, 9], [9, 12], [2, 3], + [3, 4], [5, 6], [6, 7], [13, 14], [14, 15], [18, 19], + [19, 20]] + + pose_link_color = palette[[4] * 10 + [6] * 2 + [6] * 2 + [7] * 2 + + [7] * 2] + pose_kpt_color = palette[[ + 4, 4, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 7, 7, 7, 4, 4, 7, 7, 7, + 4 + ]] + + elif dataset == 'AnimalFlyDataset': + skeleton = [[1, 0], [2, 0], [3, 0], [4, 3], [5, 4], [7, 6], [8, 7], + [9, 8], [11, 10], [12, 11], [13, 12], [15, 14], + [16, 15], [17, 16], [19, 18], [20, 19], [21, 20], + [23, 22], [24, 23], [25, 24], [27, 26], [28, 27], + [29, 28], [30, 3], [31, 3]] + + pose_link_color = palette[[0] * 25] + pose_kpt_color = palette[[0] * 32] + + elif dataset == 'AnimalLocustDataset': + skeleton = [[1, 0], [2, 1], [3, 2], [4, 3], [6, 5], [7, 6], [9, 8], + [10, 9], [11, 10], [13, 12], [14, 13], [15, 14], + [17, 16], [18, 17], [19, 18], [21, 20], [22, 21], + [24, 23], [25, 24], [26, 25], [28, 27], [29, 28], + [30, 29], [32, 31], [33, 32], [34, 33]] + + pose_link_color = palette[[0] * 26] + pose_kpt_color = palette[[0] * 35] + + elif dataset == 'AnimalZebraDataset': + skeleton = [[1, 0], [2, 1], [3, 2], [4, 2], [5, 7], [6, 7], [7, 2], + [8, 7]] + + pose_link_color = palette[[0] * 8] + pose_kpt_color = palette[[0] * 9] + + elif dataset in 'AnimalPoseDataset': + skeleton = [[0, 1], [0, 2], [1, 3], [0, 4], [1, 4], [4, 5], [5, 7], + [6, 7], [5, 8], [8, 12], [12, 16], [5, 9], [9, 13], + [13, 17], [6, 10], [10, 14], [14, 18], [6, 11], + [11, 15], [15, 19]] + + pose_link_color = palette[[0] * 20] + pose_kpt_color = palette[[0] * 20] 
+ else: + NotImplementedError() + + if hasattr(model, 'module'): + model = model.module + + img = model.show_result( + img, + result, + skeleton, + radius=radius, + thickness=thickness, + pose_kpt_color=pose_kpt_color, + pose_link_color=pose_link_color, + kpt_score_thr=kpt_score_thr, + bbox_color=bbox_color, + show=show, + out_file=out_file) + + return img + + +def process_mmdet_results(mmdet_results, cat_id=1): + """Process mmdet results, and return a list of bboxes. + + Args: + mmdet_results (list|tuple): mmdet results. + cat_id (int): category id (default: 1 for human) + + Returns: + person_results (list): a list of detected bounding boxes + """ + if isinstance(mmdet_results, tuple): + det_results = mmdet_results[0] + else: + det_results = mmdet_results + + bboxes = det_results[cat_id - 1] + + person_results = [] + for bbox in bboxes: + person = {} + person['bbox'] = bbox + person_results.append(person) + + return person_results diff --git a/mmpose/apis/inference_3d.py b/mmpose/apis/inference_3d.py new file mode 100644 index 0000000..f59f20a --- /dev/null +++ b/mmpose/apis/inference_3d.py @@ -0,0 +1,791 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np +import torch +from mmcv.parallel import collate, scatter + +from mmpose.datasets.pipelines import Compose +from .inference import _box2cs, _xywh2xyxy, _xyxy2xywh + + +def extract_pose_sequence(pose_results, frame_idx, causal, seq_len, step=1): + """Extract the target frame from 2D pose results, and pad the sequence to a + fixed length. + + Args: + pose_results (list[list[dict]]): Multi-frame pose detection results + stored in a nested list. Each element of the outer list is the + pose detection results of a single frame, and each element of the + inner list is the pose information of one person, which contains: + + - keypoints (ndarray[K, 2 or 3]): x, y, [score] + - track_id (int): unique id of each person, required \ + when ``with_track_id==True``. + - bbox ((4, ) or (5, )): left, right, top, bottom, [score] + + frame_idx (int): The index of the frame in the original video. + causal (bool): If True, the target frame is the last frame in + a sequence. Otherwise, the target frame is in the middle of + a sequence. + seq_len (int): The number of frames in the input sequence. + step (int): Step size to extract frames from the video. + + Returns: + list[list[dict]]: Multi-frame pose detection results stored \ + in a nested list with a length of seq_len. + """ + + if causal: + frames_left = seq_len - 1 + frames_right = 0 + else: + frames_left = (seq_len - 1) // 2 + frames_right = frames_left + num_frames = len(pose_results) + + # get the padded sequence + pad_left = max(0, frames_left - frame_idx // step) + pad_right = max(0, frames_right - (num_frames - 1 - frame_idx) // step) + start = max(frame_idx % step, frame_idx - frames_left * step) + end = min(num_frames - (num_frames - 1 - frame_idx) % step, + frame_idx + frames_right * step + 1) + pose_results_seq = [pose_results[0]] * pad_left + \ + pose_results[start:end:step] + [pose_results[-1]] * pad_right + return pose_results_seq + + +def _gather_pose_lifter_inputs(pose_results, + bbox_center, + bbox_scale, + norm_pose_2d=False): + """Gather input data (keypoints and track_id) for pose lifter model. 
+ + Note: + - The temporal length of the pose detection results: T + - The number of the person instances: N + - The number of the keypoints: K + - The channel number of each keypoint: C + + Args: + pose_results (List[List[Dict]]): Multi-frame pose detection results + stored in a nested list. Each element of the outer list is the + pose detection results of a single frame, and each element of the + inner list is the pose information of one person, which contains: + + - keypoints (ndarray[K, 2 or 3]): x, y, [score] + - track_id (int): unique id of each person, required when + ``with_track_id==True``` + - bbox ((4, ) or (5, )): left, right, top, bottom, [score] + + bbox_center (ndarray[1, 2]): x, y. The average center coordinate of the + bboxes in the dataset. + bbox_scale (int|float): The average scale of the bboxes in the dataset. + norm_pose_2d (bool): If True, scale the bbox (along with the 2D + pose) to bbox_scale, and move the bbox (along with the 2D pose) to + bbox_center. Default: False. + + Returns: + list[list[dict]]: Multi-frame pose detection results + stored in a nested list. Each element of the outer list is the + pose detection results of a single frame, and each element of the + inner list is the pose information of one person, which contains: + + - keypoints (ndarray[K, 2 or 3]): x, y, [score] + - track_id (int): unique id of each person, required when + ``with_track_id==True`` + """ + sequence_inputs = [] + for frame in pose_results: + frame_inputs = [] + for res in frame: + inputs = dict() + + if norm_pose_2d: + bbox = res['bbox'] + center = np.array([[(bbox[0] + bbox[2]) / 2, + (bbox[1] + bbox[3]) / 2]]) + scale = max(bbox[2] - bbox[0], bbox[3] - bbox[1]) + inputs['keypoints'] = (res['keypoints'][:, :2] - center) \ + / scale * bbox_scale + bbox_center + else: + inputs['keypoints'] = res['keypoints'][:, :2] + + if res['keypoints'].shape[1] == 3: + inputs['keypoints'] = np.concatenate( + [inputs['keypoints'], res['keypoints'][:, 2:]], axis=1) + + if 'track_id' in res: + inputs['track_id'] = res['track_id'] + frame_inputs.append(inputs) + sequence_inputs.append(frame_inputs) + return sequence_inputs + + +def _collate_pose_sequence(pose_results, with_track_id=True, target_frame=-1): + """Reorganize multi-frame pose detection results into individual pose + sequences. + + Note: + - The temporal length of the pose detection results: T + - The number of the person instances: N + - The number of the keypoints: K + - The channel number of each keypoint: C + + Args: + pose_results (List[List[Dict]]): Multi-frame pose detection results + stored in a nested list. Each element of the outer list is the + pose detection results of a single frame, and each element of the + inner list is the pose information of one person, which contains: + + - keypoints (ndarray[K, 2 or 3]): x, y, [score] + - track_id (int): unique id of each person, required when + ``with_track_id==True``` + + with_track_id (bool): If True, the element in pose_results is expected + to contain "track_id", which will be used to gather the pose + sequence of a person from multiple frames. Otherwise, the pose + results in each frame are expected to have a consistent number and + order of identities. Default is True. + target_frame (int): The index of the target frame. Default: -1. 
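+
+    Returns:
+        list[dict]: Per-person pose sequences. Each element keeps the static
+        information of that person in the target frame and adds a
+        "keypoints" array of shape [T, K, C] gathered across the frames.
+
+    Example (illustrative sketch with toy values)::
+
+        # two frames, one person tracked with id 7
+        frame0 = [{'track_id': 7, 'keypoints': np.zeros((17, 3))}]
+        frame1 = [{'track_id': 7, 'keypoints': np.ones((17, 3))}]
+        seqs = _collate_pose_sequence([frame0, frame1],
+                                      with_track_id=True, target_frame=-1)
+        # seqs[0]['keypoints'].shape == (2, 17, 3)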
+ """ + T = len(pose_results) + assert T > 0 + + target_frame = (T + target_frame) % T # convert negative index to positive + + N = len(pose_results[target_frame]) # use identities in the target frame + if N == 0: + return [] + + K, C = pose_results[target_frame][0]['keypoints'].shape + + track_ids = None + if with_track_id: + track_ids = [res['track_id'] for res in pose_results[target_frame]] + + pose_sequences = [] + for idx in range(N): + pose_seq = dict() + # gather static information + for k, v in pose_results[target_frame][idx].items(): + if k != 'keypoints': + pose_seq[k] = v + # gather keypoints + if not with_track_id: + pose_seq['keypoints'] = np.stack( + [frame[idx]['keypoints'] for frame in pose_results]) + else: + keypoints = np.zeros((T, K, C), dtype=np.float32) + keypoints[target_frame] = pose_results[target_frame][idx][ + 'keypoints'] + # find the left most frame containing track_ids[idx] + for frame_idx in range(target_frame - 1, -1, -1): + contains_idx = False + for res in pose_results[frame_idx]: + if res['track_id'] == track_ids[idx]: + keypoints[frame_idx] = res['keypoints'] + contains_idx = True + break + if not contains_idx: + # replicate the left most frame + keypoints[:frame_idx + 1] = keypoints[frame_idx + 1] + break + # find the right most frame containing track_idx[idx] + for frame_idx in range(target_frame + 1, T): + contains_idx = False + for res in pose_results[frame_idx]: + if res['track_id'] == track_ids[idx]: + keypoints[frame_idx] = res['keypoints'] + contains_idx = True + break + if not contains_idx: + # replicate the right most frame + keypoints[frame_idx + 1:] = keypoints[frame_idx] + break + pose_seq['keypoints'] = keypoints + pose_sequences.append(pose_seq) + + return pose_sequences + + +def inference_pose_lifter_model(model, + pose_results_2d, + dataset=None, + dataset_info=None, + with_track_id=True, + image_size=None, + norm_pose_2d=False): + """Inference 3D pose from 2D pose sequences using a pose lifter model. + + Args: + model (nn.Module): The loaded pose lifter model + pose_results_2d (list[list[dict]]): The 2D pose sequences stored in a + nested list. Each element of the outer list is the 2D pose results + of a single frame, and each element of the inner list is the 2D + pose of one person, which contains: + + - "keypoints" (ndarray[K, 2 or 3]): x, y, [score] + - "track_id" (int) + dataset (str): Dataset name, e.g. 'Body3DH36MDataset' + with_track_id: If True, the element in pose_results_2d is expected to + contain "track_id", which will be used to gather the pose sequence + of a person from multiple frames. Otherwise, the pose results in + each frame are expected to have a consistent number and order of + identities. Default is True. + image_size (tuple|list): image width, image height. If None, image size + will not be contained in dict ``data``. + norm_pose_2d (bool): If True, scale the bbox (along with the 2D + pose) to the average bbox scale of the dataset, and move the bbox + (along with the 2D pose) to the average bbox center of the dataset. + + Returns: + list[dict]: 3D pose inference results. Each element is the result of \ + an instance, which contains: + + - "keypoints_3d" (ndarray[K, 3]): predicted 3D keypoints + - "keypoints" (ndarray[K, 2 or 3]): from the last frame in \ + ``pose_results_2d``. + - "track_id" (int): from the last frame in ``pose_results_2d``. \ + If there is no valid instance, an empty list will be \ + returned. 
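+
+    Example (illustrative sketch; ``lift_model`` and the per-frame 2D pose
+        results are assumed to come from a pose lifter checkpoint and the 2D
+        inference/tracking functions, and are placeholders here)::
+
+        pose_seq_2d = extract_pose_sequence(
+            pose_results_2d_all_frames, frame_idx=10,
+            causal=lift_model.causal, seq_len=27, step=1)
+        pose_results_3d = inference_pose_lifter_model(
+            lift_model, pose_seq_2d, dataset='Body3DH36MDataset',
+            with_track_id=True, image_size=(1920, 1080),
+            norm_pose_2d=True)
+        # each result carries 'keypoints_3d' (ndarray[K, 4]): x, y, z, score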
+ """ + cfg = model.cfg + test_pipeline = Compose(cfg.test_pipeline) + + device = next(model.parameters()).device + if device.type == 'cpu': + device = -1 + + if dataset_info is not None: + flip_pairs = dataset_info.flip_pairs + assert 'stats_info' in dataset_info._dataset_info + bbox_center = dataset_info._dataset_info['stats_info']['bbox_center'] + bbox_scale = dataset_info._dataset_info['stats_info']['bbox_scale'] + else: + warnings.warn( + 'dataset is deprecated.' + 'Please set `dataset_info` in the config.' + 'Check https://github.com/open-mmlab/mmpose/pull/663 for details.', + DeprecationWarning) + # TODO: These will be removed in the later versions. + if dataset == 'Body3DH36MDataset': + flip_pairs = [[1, 4], [2, 5], [3, 6], [11, 14], [12, 15], [13, 16]] + bbox_center = np.array([[528, 427]], dtype=np.float32) + bbox_scale = 400 + else: + raise NotImplementedError() + + target_idx = -1 if model.causal else len(pose_results_2d) // 2 + pose_lifter_inputs = _gather_pose_lifter_inputs(pose_results_2d, + bbox_center, bbox_scale, + norm_pose_2d) + pose_sequences_2d = _collate_pose_sequence(pose_lifter_inputs, + with_track_id, target_idx) + + if not pose_sequences_2d: + return [] + + batch_data = [] + for seq in pose_sequences_2d: + pose_2d = seq['keypoints'].astype(np.float32) + T, K, C = pose_2d.shape + + input_2d = pose_2d[..., :2] + input_2d_visible = pose_2d[..., 2:3] + if C > 2: + input_2d_visible = pose_2d[..., 2:3] + else: + input_2d_visible = np.ones((T, K, 1), dtype=np.float32) + + # TODO: Will be removed in the later versions + # Dummy 3D input + # This is for compatibility with configs in mmpose<=v0.14.0, where a + # 3D input is required to generate denormalization parameters. This + # part will be removed in the future. + target = np.zeros((K, 3), dtype=np.float32) + target_visible = np.ones((K, 1), dtype=np.float32) + + # Dummy image path + # This is for compatibility with configs in mmpose<=v0.14.0, where + # target_image_path is required. This part will be removed in the + # future. + target_image_path = None + + data = { + 'input_2d': input_2d, + 'input_2d_visible': input_2d_visible, + 'target': target, + 'target_visible': target_visible, + 'target_image_path': target_image_path, + 'ann_info': { + 'num_joints': K, + 'flip_pairs': flip_pairs + } + } + + if image_size is not None: + assert len(image_size) == 2 + data['image_width'] = image_size[0] + data['image_height'] = image_size[1] + + data = test_pipeline(data) + batch_data.append(data) + + batch_data = collate(batch_data, samples_per_gpu=len(batch_data)) + batch_data = scatter(batch_data, target_gpus=[device])[0] + + with torch.no_grad(): + result = model( + input=batch_data['input'], + metas=batch_data['metas'], + return_loss=False) + + poses_3d = result['preds'] + if poses_3d.shape[-1] != 4: + assert poses_3d.shape[-1] == 3 + dummy_score = np.ones( + poses_3d.shape[:-1] + (1, ), dtype=poses_3d.dtype) + poses_3d = np.concatenate((poses_3d, dummy_score), axis=-1) + pose_results = [] + for pose_2d, pose_3d in zip(pose_sequences_2d, poses_3d): + pose_result = pose_2d.copy() + pose_result['keypoints_3d'] = pose_3d + pose_results.append(pose_result) + + return pose_results + + +def vis_3d_pose_result(model, + result, + img=None, + dataset='Body3DH36MDataset', + dataset_info=None, + kpt_score_thr=0.3, + radius=8, + thickness=2, + num_instances=-1, + show=False, + out_file=None): + """Visualize the 3D pose estimation results. + + Args: + model (nn.Module): The loaded model. 
+ result (list[dict]) + """ + + if dataset_info is not None: + skeleton = dataset_info.skeleton + pose_kpt_color = dataset_info.pose_kpt_color + pose_link_color = dataset_info.pose_link_color + else: + warnings.warn( + 'dataset is deprecated.' + 'Please set `dataset_info` in the config.' + 'Check https://github.com/open-mmlab/mmpose/pull/663 for details.', + DeprecationWarning) + # TODO: These will be removed in the later versions. + palette = np.array([[255, 128, 0], [255, 153, 51], [255, 178, 102], + [230, 230, 0], [255, 153, 255], [153, 204, 255], + [255, 102, 255], [255, 51, 255], [102, 178, 255], + [51, 153, 255], [255, 153, 153], [255, 102, 102], + [255, 51, 51], [153, 255, 153], [102, 255, 102], + [51, 255, 51], [0, 255, 0], [0, 0, 255], + [255, 0, 0], [255, 255, 255]]) + + if dataset == 'Body3DH36MDataset': + skeleton = [[0, 1], [1, 2], [2, 3], [0, 4], [4, 5], [5, 6], [0, 7], + [7, 8], [8, 9], [9, 10], [8, 11], [11, 12], [12, 13], + [8, 14], [14, 15], [15, 16]] + + pose_kpt_color = palette[[ + 9, 0, 0, 0, 16, 16, 16, 9, 9, 9, 9, 16, 16, 16, 0, 0, 0 + ]] + pose_link_color = palette[[ + 0, 0, 0, 16, 16, 16, 9, 9, 9, 9, 16, 16, 16, 0, 0, 0 + ]] + + elif dataset == 'InterHand3DDataset': + skeleton = [[0, 1], [1, 2], [2, 3], [3, 20], [4, 5], [5, 6], + [6, 7], [7, 20], [8, 9], [9, 10], [10, 11], [11, 20], + [12, 13], [13, 14], [14, 15], [15, 20], [16, 17], + [17, 18], [18, 19], [19, 20], [21, 22], [22, 23], + [23, 24], [24, 41], [25, 26], [26, 27], [27, 28], + [28, 41], [29, 30], [30, 31], [31, 32], [32, 41], + [33, 34], [34, 35], [35, 36], [36, 41], [37, 38], + [38, 39], [39, 40], [40, 41]] + + pose_kpt_color = [[14, 128, 250], [14, 128, 250], [14, 128, 250], + [14, 128, 250], [80, 127, 255], [80, 127, 255], + [80, 127, 255], [80, 127, 255], [71, 99, 255], + [71, 99, 255], [71, 99, 255], [71, 99, 255], + [0, 36, 255], [0, 36, 255], [0, 36, 255], + [0, 36, 255], [0, 0, 230], [0, 0, 230], + [0, 0, 230], [0, 0, 230], [0, 0, 139], + [237, 149, 100], [237, 149, 100], + [237, 149, 100], [237, 149, 100], [230, 128, 77], + [230, 128, 77], [230, 128, 77], [230, 128, 77], + [255, 144, 30], [255, 144, 30], [255, 144, 30], + [255, 144, 30], [153, 51, 0], [153, 51, 0], + [153, 51, 0], [153, 51, 0], [255, 51, 13], + [255, 51, 13], [255, 51, 13], [255, 51, 13], + [103, 37, 8]] + + pose_link_color = [[14, 128, 250], [14, 128, 250], [14, 128, 250], + [14, 128, 250], [80, 127, 255], [80, 127, 255], + [80, 127, 255], [80, 127, 255], [71, 99, 255], + [71, 99, 255], [71, 99, 255], [71, 99, 255], + [0, 36, 255], [0, 36, 255], [0, 36, 255], + [0, 36, 255], [0, 0, 230], [0, 0, 230], + [0, 0, 230], [0, 0, 230], [237, 149, 100], + [237, 149, 100], [237, 149, 100], + [237, 149, 100], [230, 128, 77], [230, 128, 77], + [230, 128, 77], [230, 128, 77], [255, 144, 30], + [255, 144, 30], [255, 144, 30], [255, 144, 30], + [153, 51, 0], [153, 51, 0], [153, 51, 0], + [153, 51, 0], [255, 51, 13], [255, 51, 13], + [255, 51, 13], [255, 51, 13]] + else: + raise NotImplementedError + + if hasattr(model, 'module'): + model = model.module + + img = model.show_result( + result, + img, + skeleton, + radius=radius, + thickness=thickness, + pose_kpt_color=pose_kpt_color, + pose_link_color=pose_link_color, + num_instances=num_instances, + show=show, + out_file=out_file) + + return img + + +def inference_interhand_3d_model(model, + img_or_path, + det_results, + bbox_thr=None, + format='xywh', + dataset='InterHand3DDataset'): + """Inference a single image with a list of hand bounding boxes. 
+ + Note: + - num_bboxes: N + - num_keypoints: K + + Args: + model (nn.Module): The loaded pose model. + img_or_path (str | np.ndarray): Image filename or loaded image. + det_results (list[dict]): The 2D bbox sequences stored in a list. + Each each element of the list is the bbox of one person, whose + shape is (ndarray[4 or 5]), containing 4 box coordinates + (and score). + dataset (str): Dataset name. + format: bbox format ('xyxy' | 'xywh'). Default: 'xywh'. + 'xyxy' means (left, top, right, bottom), + 'xywh' means (left, top, width, height). + + Returns: + list[dict]: 3D pose inference results. Each element is the result \ + of an instance, which contains the predicted 3D keypoints with \ + shape (ndarray[K,3]). If there is no valid instance, an \ + empty list will be returned. + """ + + assert format in ['xyxy', 'xywh'] + + pose_results = [] + + if len(det_results) == 0: + return pose_results + + # Change for-loop preprocess each bbox to preprocess all bboxes at once. + bboxes = np.array([box['bbox'] for box in det_results]) + + # Select bboxes by score threshold + if bbox_thr is not None: + assert bboxes.shape[1] == 5 + valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0] + bboxes = bboxes[valid_idx] + det_results = [det_results[i] for i in valid_idx] + + if format == 'xyxy': + bboxes_xyxy = bboxes + bboxes_xywh = _xyxy2xywh(bboxes) + else: + # format is already 'xywh' + bboxes_xywh = bboxes + bboxes_xyxy = _xywh2xyxy(bboxes) + + # if bbox_thr remove all bounding box + if len(bboxes_xywh) == 0: + return [] + + cfg = model.cfg + device = next(model.parameters()).device + if device.type == 'cpu': + device = -1 + + # build the data pipeline + test_pipeline = Compose(cfg.test_pipeline) + + assert len(bboxes[0]) in [4, 5] + + if dataset == 'InterHand3DDataset': + flip_pairs = [[i, 21 + i] for i in range(21)] + else: + raise NotImplementedError() + + batch_data = [] + for bbox in bboxes: + center, scale = _box2cs(cfg, bbox) + + # prepare data + data = { + 'center': + center, + 'scale': + scale, + 'bbox_score': + bbox[4] if len(bbox) == 5 else 1, + 'bbox_id': + 0, # need to be assigned if batch_size > 1 + 'dataset': + dataset, + 'joints_3d': + np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32), + 'joints_3d_visible': + np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32), + 'rotation': + 0, + 'ann_info': { + 'image_size': np.array(cfg.data_cfg['image_size']), + 'num_joints': cfg.data_cfg['num_joints'], + 'flip_pairs': flip_pairs, + 'heatmap3d_depth_bound': cfg.data_cfg['heatmap3d_depth_bound'], + 'heatmap_size_root': cfg.data_cfg['heatmap_size_root'], + 'root_depth_bound': cfg.data_cfg['root_depth_bound'] + } + } + + if isinstance(img_or_path, np.ndarray): + data['img'] = img_or_path + else: + data['image_file'] = img_or_path + + data = test_pipeline(data) + batch_data.append(data) + + batch_data = collate(batch_data, samples_per_gpu=len(batch_data)) + batch_data = scatter(batch_data, [device])[0] + + # forward the model + with torch.no_grad(): + result = model( + img=batch_data['img'], + img_metas=batch_data['img_metas'], + return_loss=False) + + poses_3d = result['preds'] + rel_root_depth = result['rel_root_depth'] + hand_type = result['hand_type'] + if poses_3d.shape[-1] != 4: + assert poses_3d.shape[-1] == 3 + dummy_score = np.ones( + poses_3d.shape[:-1] + (1, ), dtype=poses_3d.dtype) + poses_3d = np.concatenate((poses_3d, dummy_score), axis=-1) + + # add relative root depth to left hand joints + poses_3d[:, 21:, 2] += rel_root_depth + + # set joint scores according to hand type + 
poses_3d[:, :21, 3] *= hand_type[:, [0]] + poses_3d[:, 21:, 3] *= hand_type[:, [1]] + + pose_results = [] + for pose_3d, person_res, bbox_xyxy in zip(poses_3d, det_results, + bboxes_xyxy): + pose_res = person_res.copy() + pose_res['keypoints_3d'] = pose_3d + pose_res['bbox'] = bbox_xyxy + pose_results.append(pose_res) + + return pose_results + + +def inference_mesh_model(model, + img_or_path, + det_results, + bbox_thr=None, + format='xywh', + dataset='MeshH36MDataset'): + """Inference a single image with a list of bounding boxes. + + Note: + - num_bboxes: N + - num_keypoints: K + - num_vertices: V + - num_faces: F + + Args: + model (nn.Module): The loaded pose model. + img_or_path (str | np.ndarray): Image filename or loaded image. + det_results (list[dict]): The 2D bbox sequences stored in a list. + Each element of the list is the bbox of one person. + "bbox" (ndarray[4 or 5]): The person bounding box, + which contains 4 box coordinates (and score). + bbox_thr (float | None): Threshold for bounding boxes. + Only bboxes with higher scores will be fed into the pose + detector. If bbox_thr is None, all boxes will be used. + format (str): bbox format ('xyxy' | 'xywh'). Default: 'xywh'. + + - 'xyxy' means (left, top, right, bottom), + - 'xywh' means (left, top, width, height). + dataset (str): Dataset name. + + Returns: + list[dict]: 3D pose inference results. Each element \ + is the result of an instance, which contains: + + - 'bbox' (ndarray[4]): instance bounding bbox + - 'center' (ndarray[2]): bbox center + - 'scale' (ndarray[2]): bbox scale + - 'keypoints_3d' (ndarray[K,3]): predicted 3D keypoints + - 'camera' (ndarray[3]): camera parameters + - 'vertices' (ndarray[V, 3]): predicted 3D vertices + - 'faces' (ndarray[F, 3]): mesh faces + + If there is no valid instance, an empty list + will be returned. + """ + + assert format in ['xyxy', 'xywh'] + + pose_results = [] + + if len(det_results) == 0: + return pose_results + + # Change for-loop preprocess each bbox to preprocess all bboxes at once. 
+ bboxes = np.array([box['bbox'] for box in det_results]) + + # Select bboxes by score threshold + if bbox_thr is not None: + assert bboxes.shape[1] == 5 + valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0] + bboxes = bboxes[valid_idx] + det_results = [det_results[i] for i in valid_idx] + + if format == 'xyxy': + bboxes_xyxy = bboxes + bboxes_xywh = _xyxy2xywh(bboxes) + else: + # format is already 'xywh' + bboxes_xywh = bboxes + bboxes_xyxy = _xywh2xyxy(bboxes) + + # if bbox_thr remove all bounding box + if len(bboxes_xywh) == 0: + return [] + + cfg = model.cfg + device = next(model.parameters()).device + if device.type == 'cpu': + device = -1 + + # build the data pipeline + test_pipeline = Compose(cfg.test_pipeline) + + assert len(bboxes[0]) in [4, 5] + + if dataset == 'MeshH36MDataset': + flip_pairs = [[0, 5], [1, 4], [2, 3], [6, 11], [7, 10], [8, 9], + [20, 21], [22, 23]] + else: + raise NotImplementedError() + + batch_data = [] + for bbox in bboxes: + center, scale = _box2cs(cfg, bbox) + + # prepare data + data = { + 'image_file': + img_or_path, + 'center': + center, + 'scale': + scale, + 'rotation': + 0, + 'bbox_score': + bbox[4] if len(bbox) == 5 else 1, + 'dataset': + dataset, + 'joints_2d': + np.zeros((cfg.data_cfg.num_joints, 2), dtype=np.float32), + 'joints_2d_visible': + np.zeros((cfg.data_cfg.num_joints, 1), dtype=np.float32), + 'joints_3d': + np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32), + 'joints_3d_visible': + np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32), + 'pose': + np.zeros(72, dtype=np.float32), + 'beta': + np.zeros(10, dtype=np.float32), + 'has_smpl': + 0, + 'ann_info': { + 'image_size': np.array(cfg.data_cfg['image_size']), + 'num_joints': cfg.data_cfg['num_joints'], + 'flip_pairs': flip_pairs, + } + } + + data = test_pipeline(data) + batch_data.append(data) + + batch_data = collate(batch_data, samples_per_gpu=len(batch_data)) + batch_data = scatter(batch_data, target_gpus=[device])[0] + + # forward the model + with torch.no_grad(): + preds = model( + img=batch_data['img'], + img_metas=batch_data['img_metas'], + return_loss=False, + return_vertices=True, + return_faces=True) + + for idx in range(len(det_results)): + pose_res = det_results[idx].copy() + pose_res['bbox'] = bboxes_xyxy[idx] + pose_res['center'] = batch_data['img_metas'][idx]['center'] + pose_res['scale'] = batch_data['img_metas'][idx]['scale'] + pose_res['keypoints_3d'] = preds['keypoints_3d'][idx] + pose_res['camera'] = preds['camera'][idx] + pose_res['vertices'] = preds['vertices'][idx] + pose_res['faces'] = preds['faces'] + pose_results.append(pose_res) + return pose_results + + +def vis_3d_mesh_result(model, result, img=None, show=False, out_file=None): + """Visualize the 3D mesh estimation results. + + Args: + model (nn.Module): The loaded model. + result (list[dict]): 3D mesh estimation results. + """ + if hasattr(model, 'module'): + model = model.module + + img = model.show_result(result, img, show=show, out_file=out_file) + + return img diff --git a/mmpose/apis/inference_tracking.py b/mmpose/apis/inference_tracking.py new file mode 100644 index 0000000..9494fba --- /dev/null +++ b/mmpose/apis/inference_tracking.py @@ -0,0 +1,347 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np + +from mmpose.core import OneEuroFilter, oks_iou + + +def _compute_iou(bboxA, bboxB): + """Compute the Intersection over Union (IoU) between two boxes . + + Args: + bboxA (list): The first bbox info (left, top, right, bottom, score). 
+ bboxB (list): The second bbox info (left, top, right, bottom, score). + + Returns: + float: The IoU value. + """ + + x1 = max(bboxA[0], bboxB[0]) + y1 = max(bboxA[1], bboxB[1]) + x2 = min(bboxA[2], bboxB[2]) + y2 = min(bboxA[3], bboxB[3]) + + inter_area = max(0, x2 - x1) * max(0, y2 - y1) + + bboxA_area = (bboxA[2] - bboxA[0]) * (bboxA[3] - bboxA[1]) + bboxB_area = (bboxB[2] - bboxB[0]) * (bboxB[3] - bboxB[1]) + union_area = float(bboxA_area + bboxB_area - inter_area) + if union_area == 0: + union_area = 1e-5 + warnings.warn('union_area=0 is unexpected') + + iou = inter_area / union_area + + return iou + + +def _track_by_iou(res, results_last, thr): + """Get track id using IoU tracking greedily. + + Args: + res (dict): The bbox & pose results of the person instance. + results_last (list[dict]): The bbox & pose & track_id info of the + last frame (bbox_result, pose_result, track_id). + thr (float): The threshold for iou tracking. + + Returns: + int: The track id for the new person instance. + list[dict]: The bbox & pose & track_id info of the persons + that have not been matched on the last frame. + dict: The matched person instance on the last frame. + """ + + bbox = list(res['bbox']) + + max_iou_score = -1 + max_index = -1 + match_result = {} + for index, res_last in enumerate(results_last): + bbox_last = list(res_last['bbox']) + + iou_score = _compute_iou(bbox, bbox_last) + if iou_score > max_iou_score: + max_iou_score = iou_score + max_index = index + + if max_iou_score > thr: + track_id = results_last[max_index]['track_id'] + match_result = results_last[max_index] + del results_last[max_index] + else: + track_id = -1 + + return track_id, results_last, match_result + + +def _track_by_oks(res, results_last, thr): + """Get track id using OKS tracking greedily. + + Args: + res (dict): The pose results of the person instance. + results_last (list[dict]): The pose & track_id info of the + last frame (pose_result, track_id). + thr (float): The threshold for oks tracking. + + Returns: + int: The track id for the new person instance. + list[dict]: The pose & track_id info of the persons + that have not been matched on the last frame. + dict: The matched person instance on the last frame. + """ + pose = res['keypoints'].reshape((-1)) + area = res['area'] + max_index = -1 + match_result = {} + + if len(results_last) == 0: + return -1, results_last, match_result + + pose_last = np.array( + [res_last['keypoints'].reshape((-1)) for res_last in results_last]) + area_last = np.array([res_last['area'] for res_last in results_last]) + + oks_score = oks_iou(pose, pose_last, area, area_last) + + max_index = np.argmax(oks_score) + + if oks_score[max_index] > thr: + track_id = results_last[max_index]['track_id'] + match_result = results_last[max_index] + del results_last[max_index] + else: + track_id = -1 + + return track_id, results_last, match_result + + +def _get_area(results): + """Get bbox for each person instance on the current frame. + + Args: + results (list[dict]): The pose results of the current frame + (pose_result). + Returns: + list[dict]: The bbox & pose info of the current frame + (bbox_result, pose_result, area). 
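+
+    Example (illustrative sketch with toy values)::
+
+        results = [{'bbox': np.array([10., 20., 110., 220., 0.9]),
+                    'keypoints': np.zeros((17, 3))}]
+        results = _get_area(results)
+        # results[0]['area'] == (110 - 10) * (220 - 20) == 20000.0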
+    """
+    for result in results:
+        if 'bbox' in result:
+            result['area'] = ((result['bbox'][2] - result['bbox'][0]) *
+                              (result['bbox'][3] - result['bbox'][1]))
+        else:
+            xmin = np.min(
+                result['keypoints'][:, 0][result['keypoints'][:, 0] > 0],
+                initial=1e10)
+            xmax = np.max(result['keypoints'][:, 0])
+            ymin = np.min(
+                result['keypoints'][:, 1][result['keypoints'][:, 1] > 0],
+                initial=1e10)
+            ymax = np.max(result['keypoints'][:, 1])
+            result['area'] = (xmax - xmin) * (ymax - ymin)
+            result['bbox'] = np.array([xmin, ymin, xmax, ymax])
+    return results
+
+
+def _temporal_refine(result, match_result, fps=None):
+    """Refine keypoints using the tracked person instance on the last frame.
+
+    Args:
+        result (dict): The pose result of the current frame
+            (pose_result).
+        match_result (dict): The matched pose result of the last frame
+            (match_result).
+    Returns:
+        (array): The refined person keypoints.
+    """
+    if 'one_euro' in match_result:
+        result['keypoints'][:, :2] = match_result['one_euro'](
+            result['keypoints'][:, :2])
+        result['one_euro'] = match_result['one_euro']
+    else:
+        result['one_euro'] = OneEuroFilter(result['keypoints'][:, :2], fps=fps)
+    return result['keypoints']
+
+
+def get_track_id(results,
+                 results_last,
+                 next_id,
+                 min_keypoints=3,
+                 use_oks=False,
+                 tracking_thr=0.3,
+                 use_one_euro=False,
+                 fps=None):
+    """Get track id for each person instance on the current frame.
+
+    Args:
+        results (list[dict]): The bbox & pose results of the current frame
+            (bbox_result, pose_result).
+        results_last (list[dict]): The bbox & pose & track_id info of the
+            last frame (bbox_result, pose_result, track_id).
+        next_id (int): The track id for the new person instance.
+        min_keypoints (int): Minimum number of keypoints recognized as
+            a person. Default: 3.
+        use_oks (bool): Flag to use OKS tracking. Default: False.
+        tracking_thr (float): The threshold for tracking.
+        use_one_euro (bool): Option to use the one-euro filter.
+            Default: False.
+        fps (optional): Frame rate of the video input, used to set the
+            ``d_cutoff`` parameter of the one-euro filter.
+
+    Returns:
+        tuple:
+        - results (list[dict]): The bbox & pose & track_id info of the \
+            current frame (bbox_result, pose_result, track_id).
+        - next_id (int): The track id for the new person instance.
+    """
+    results = _get_area(results)
+
+    if use_oks:
+        _track = _track_by_oks
+    else:
+        _track = _track_by_iou
+
+    for result in results:
+        track_id, results_last, match_result = _track(result, results_last,
+                                                      tracking_thr)
+        if track_id == -1:
+            if np.count_nonzero(result['keypoints'][:, 1]) > min_keypoints:
+                result['track_id'] = next_id
+                next_id += 1
+            else:
+                # If the number of keypoints detected is small,
+                # delete that person instance.
+                result['keypoints'][:, 1] = -10
+                result['bbox'] *= 0
+                result['track_id'] = -1
+        else:
+            result['track_id'] = track_id
+        if use_one_euro:
+            result['keypoints'] = _temporal_refine(
+                result, match_result, fps=fps)
+        del match_result
+
+    return results, next_id
+
+
+def vis_pose_tracking_result(model,
+                             img,
+                             result,
+                             radius=4,
+                             thickness=1,
+                             kpt_score_thr=0.3,
+                             dataset='TopDownCocoDataset',
+                             dataset_info=None,
+                             show=False,
+                             out_file=None):
+    """Visualize the pose tracking results on the image.
+
+    Args:
+        model (nn.Module): The loaded detector.
+        img (str | np.ndarray): Image filename or loaded image.
+        result (list[dict]): The results to draw over `img`
+            (bbox_result, pose_result).
+        radius (int): Radius of circles.
+        thickness (int): Thickness of lines.
+        kpt_score_thr (float): The threshold to visualize the keypoints.
+ skeleton (list[tuple]): Default None. + show (bool): Whether to show the image. Default True. + out_file (str|None): The filename of the output visualization image. + """ + if hasattr(model, 'module'): + model = model.module + + palette = np.array([[255, 128, 0], [255, 153, 51], [255, 178, 102], + [230, 230, 0], [255, 153, 255], [153, 204, 255], + [255, 102, 255], [255, 51, 255], [102, 178, 255], + [51, 153, 255], [255, 153, 153], [255, 102, 102], + [255, 51, 51], [153, 255, 153], [102, 255, 102], + [51, 255, 51], [0, 255, 0], [0, 0, 255], [255, 0, 0], + [255, 255, 255]]) + + if dataset_info is None and dataset is not None: + warnings.warn( + 'dataset is deprecated.' + 'Please set `dataset_info` in the config.' + 'Check https://github.com/open-mmlab/mmpose/pull/663 for details.', + DeprecationWarning) + # TODO: These will be removed in the later versions. + if dataset in ('TopDownCocoDataset', 'BottomUpCocoDataset', + 'TopDownOCHumanDataset'): + kpt_num = 17 + skeleton = [[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], + [5, 11], [6, 12], [5, 6], [5, 7], [6, 8], [7, 9], + [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], + [3, 5], [4, 6]] + + elif dataset == 'TopDownCocoWholeBodyDataset': + kpt_num = 133 + skeleton = [[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], + [5, 11], [6, 12], [5, 6], [5, 7], [6, 8], [7, 9], + [8, 10], [1, 2], [0, 1], [0, 2], + [1, 3], [2, 4], [3, 5], [4, 6], [15, 17], [15, 18], + [15, 19], [16, 20], [16, 21], [16, 22], [91, 92], + [92, 93], [93, 94], [94, 95], [91, 96], [96, 97], + [97, 98], [98, 99], [91, 100], [100, 101], [101, 102], + [102, 103], [91, 104], [104, 105], [105, 106], + [106, 107], [91, 108], [108, 109], [109, 110], + [110, 111], [112, 113], [113, 114], [114, 115], + [115, 116], [112, 117], [117, 118], [118, 119], + [119, 120], [112, 121], [121, 122], [122, 123], + [123, 124], [112, 125], [125, 126], [126, 127], + [127, 128], [112, 129], [129, 130], [130, 131], + [131, 132]] + radius = 1 + + elif dataset == 'TopDownAicDataset': + kpt_num = 14 + skeleton = [[2, 1], [1, 0], [0, 13], [13, 3], [3, 4], [4, 5], + [8, 7], [7, 6], [6, 9], [9, 10], [10, 11], [12, 13], + [0, 6], [3, 9]] + + elif dataset == 'TopDownMpiiDataset': + kpt_num = 16 + skeleton = [[0, 1], [1, 2], [2, 6], [6, 3], [3, 4], [4, 5], [6, 7], + [7, 8], [8, 9], [8, 12], [12, 11], [11, 10], [8, 13], + [13, 14], [14, 15]] + + elif dataset in ('OneHand10KDataset', 'FreiHandDataset', + 'PanopticDataset'): + kpt_num = 21 + skeleton = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], + [7, 8], [0, 9], [9, 10], [10, 11], [11, 12], [0, 13], + [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], + [18, 19], [19, 20]] + + elif dataset == 'InterHand2DDataset': + kpt_num = 21 + skeleton = [[0, 1], [1, 2], [2, 3], [4, 5], [5, 6], [6, 7], [8, 9], + [9, 10], [10, 11], [12, 13], [13, 14], [14, 15], + [16, 17], [17, 18], [18, 19], [3, 20], [7, 20], + [11, 20], [15, 20], [19, 20]] + + else: + raise NotImplementedError() + + elif dataset_info is not None: + kpt_num = dataset_info.keypoint_num + skeleton = dataset_info.skeleton + + for res in result: + track_id = res['track_id'] + bbox_color = palette[track_id % len(palette)] + pose_kpt_color = palette[[track_id % len(palette)] * kpt_num] + pose_link_color = palette[[track_id % len(palette)] * len(skeleton)] + img = model.show_result( + img, [res], + skeleton, + radius=radius, + thickness=thickness, + pose_kpt_color=pose_kpt_color, + pose_link_color=pose_link_color, + bbox_color=tuple(bbox_color.tolist()), + kpt_score_thr=kpt_score_thr, + show=show, + 
out_file=out_file) + + return img diff --git a/mmpose/apis/test.py b/mmpose/apis/test.py new file mode 100644 index 0000000..3843b5a --- /dev/null +++ b/mmpose/apis/test.py @@ -0,0 +1,191 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import pickle +import shutil +import tempfile + +import mmcv +import torch +import torch.distributed as dist +from mmcv.runner import get_dist_info + + +def single_gpu_test(model, data_loader): + """Test model with a single gpu. + + This method tests model with a single gpu and displays test progress bar. + + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + + + Returns: + list: The prediction results. + """ + + model.eval() + results = [] + dataset = data_loader.dataset + prog_bar = mmcv.ProgressBar(len(dataset)) + for data in data_loader: + with torch.no_grad(): + result = model(return_loss=False, **data) + results.append(result) + + # use the first key as main key to calculate the batch size + batch_size = len(next(iter(data.values()))) + for _ in range(batch_size): + prog_bar.update() + return results + + +def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + + Returns: + list: The prediction results. + """ + model.eval() + results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + for data in data_loader: + with torch.no_grad(): + result = model(return_loss=False, **data) + results.append(result) + + if rank == 0: + # use the first key as main key to calculate the batch size + batch_size = len(next(iter(data.values()))) + for _ in range(batch_size * world_size): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + results = collect_results_gpu(results, len(dataset)) + else: + results = collect_results_cpu(results, len(dataset), tmpdir) + return results + + +def collect_results_cpu(result_part, size, tmpdir=None): + """Collect results in cpu mode. + + It saves the results on different gpus to 'tmpdir' and collects + them by the rank 0 worker. + + Args: + result_part (list): Results to be collected + size (int): Result size. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. Default: None + + Returns: + list: Ordered results. 
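+
+    Example (illustrative sketch; mirrors how ``multi_gpu_test`` above calls
+        this helper)::
+
+        # every rank passes its own partial results; only rank 0 receives
+        # the merged, ordered list, all other ranks get None
+        ordered_results = collect_results_cpu(
+            result_part, size=len(dataset), tmpdir=None)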
+ """ + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + mmcv.mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # synchronizes all processes to make sure tmpdir exist + dist.barrier() + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + # synchronizes all processes for loading pickle file + dist.barrier() + # collect all parts + if rank != 0: + return None + + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(mmcv.load(part_file)) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + """Collect results in gpu mode. + + It encodes results to gpu tensors and use gpu communication for results + collection. + + Args: + result_part (list): Results to be collected + size (int): Result size. + + Returns: + list: Ordered results. + """ + + rank, world_size = get_dist_info() + # dump result part to tensor with pickle + part_tensor = torch.tensor( + bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') + # gather all result part tensor shape + shape_tensor = torch.tensor(part_tensor.shape, device='cuda') + shape_list = [shape_tensor.clone() for _ in range(world_size)] + dist.all_gather(shape_list, shape_tensor) + # padding result part tensor to max length + shape_max = torch.tensor(shape_list).max() + part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') + part_send[:shape_tensor[0]] = part_tensor + part_recv_list = [ + part_tensor.new_zeros(shape_max) for _ in range(world_size) + ] + # gather all result part + dist.all_gather(part_recv_list, part_send) + + if rank == 0: + part_list = [] + for recv, shape in zip(part_recv_list, shape_list): + part_list.append( + pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + return ordered_results + return None diff --git a/mmpose/apis/train.py b/mmpose/apis/train.py new file mode 100644 index 0000000..7c31f8b --- /dev/null +++ b/mmpose/apis/train.py @@ -0,0 +1,200 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import warnings + +import mmcv +import numpy as np +import torch +import torch.distributed as dist +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import (DistSamplerSeedHook, EpochBasedRunner, OptimizerHook, + get_dist_info) +from mmcv.utils import digit_version + +from mmpose.core import DistEvalHook, EvalHook, build_optimizers +from mmpose.core.distributed_wrapper import DistributedDataParallelWrapper +from mmpose.datasets import build_dataloader, build_dataset +from mmpose.utils import get_root_logger + +try: + from mmcv.runner import Fp16OptimizerHook +except ImportError: + warnings.warn( + 'Fp16OptimizerHook from mmpose will be deprecated from ' + 'v0.15.0. Please install mmcv>=1.1.4', DeprecationWarning) + from mmpose.core import Fp16OptimizerHook + + +def init_random_seed(seed=None, device='cuda'): + """Initialize random seed. + + If the seed is not set, the seed will be automatically randomized, + and then broadcast to all processes to prevent some potential bugs. + + Args: + seed (int, Optional): The seed. Default to None. + device (str): The device where the seed will be put on. + Default to 'cuda'. + + Returns: + int: Seed to be used. + """ + if seed is not None: + return seed + + # Make sure all ranks share the same random seed to prevent + # some potential bugs. Please refer to + # https://github.com/open-mmlab/mmdetection/issues/6339 + rank, world_size = get_dist_info() + seed = np.random.randint(2**31) + if world_size == 1: + return seed + + if rank == 0: + random_num = torch.tensor(seed, dtype=torch.int32, device=device) + else: + random_num = torch.tensor(0, dtype=torch.int32, device=device) + dist.broadcast(random_num, src=0) + return random_num.item() + + +def train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + """Train model entry function. + + Args: + model (nn.Module): The model to be trained. + dataset (Dataset): Train dataset. + cfg (dict): The config dict for training. + distributed (bool): Whether to use distributed training. + Default: False. + validate (bool): Whether to do evaluation. Default: False. + timestamp (str | None): Local time for runner. Default: None. + meta (dict | None): Meta dict to record some important information. 
+ Default: None + """ + logger = get_root_logger(cfg.log_level) + + # prepare data loaders + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + # step 1: give default values and override (if exist) from cfg.data + loader_cfg = { + **dict( + seed=cfg.get('seed'), + drop_last=False, + dist=distributed, + num_gpus=len(cfg.gpu_ids)), + **({} if torch.__version__ != 'parrots' else dict( + prefetch_num=2, + pin_memory=False, + )), + **dict((k, cfg.data[k]) for k in [ + 'samples_per_gpu', + 'workers_per_gpu', + 'shuffle', + 'seed', + 'drop_last', + 'prefetch_num', + 'pin_memory', + 'persistent_workers', + ] if k in cfg.data) + } + + # step 2: cfg.data.train_dataloader has highest priority + train_loader_cfg = dict(loader_cfg, **cfg.data.get('train_dataloader', {})) + + data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset] + + # determine whether use adversarial training precess or not + use_adverserial_train = cfg.get('use_adversarial_train', False) + + # put model on gpus + if distributed: + find_unused_parameters = cfg.get('find_unused_parameters', False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + + if use_adverserial_train: + # Use DistributedDataParallelWrapper for adversarial training + model = DistributedDataParallelWrapper( + model, + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + if digit_version(mmcv.__version__) >= digit_version( + '1.4.4') or torch.cuda.is_available(): + model = MMDataParallel(model, device_ids=cfg.gpu_ids) + else: + warnings.warn( + 'We recommend to use MMCV >= 1.4.4 for CPU training. ' + 'See https://github.com/open-mmlab/mmpose/pull/1157 for ' + 'details.') + + # build runner + optimizer = build_optimizers(model, cfg.optimizer) + + runner = EpochBasedRunner( + model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta) + # an ugly workaround to make .log and .log.json filenames the same + runner.timestamp = timestamp + + if use_adverserial_train: + # The optimizer step process is included in the train_step function + # of the model, so the runner should NOT include optimizer hook. 
+ optimizer_config = None + else: + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + **cfg.optimizer_config, **fp16_cfg, distributed=distributed) + elif distributed and 'type' not in cfg.optimizer_config: + optimizer_config = OptimizerHook(**cfg.optimizer_config) + else: + optimizer_config = cfg.optimizer_config + + # register hooks + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + if distributed: + runner.register_hook(DistSamplerSeedHook()) + + # register eval hooks + if validate: + eval_cfg = cfg.get('evaluation', {}) + val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) + dataloader_setting = dict( + samples_per_gpu=1, + workers_per_gpu=cfg.data.get('workers_per_gpu', 1), + # cfg.gpus will be ignored if distributed + num_gpus=len(cfg.gpu_ids), + dist=distributed, + drop_last=False, + shuffle=False) + dataloader_setting = dict(dataloader_setting, + **cfg.data.get('val_dataloader', {})) + val_dataloader = build_dataloader(val_dataset, **dataloader_setting) + eval_hook = DistEvalHook if distributed else EvalHook + runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow, cfg.total_epochs) diff --git a/mmpose/core/__init__.py b/mmpose/core/__init__.py new file mode 100644 index 0000000..66185b7 --- /dev/null +++ b/mmpose/core/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .camera import * # noqa: F401, F403 +from .evaluation import * # noqa: F401, F403 +from .fp16 import * # noqa: F401, F403 +from .optimizer import * # noqa: F401, F403 +from .post_processing import * # noqa: F401, F403 +from .utils import * # noqa: F401, F403 +from .visualization import * # noqa: F401, F403 diff --git a/mmpose/core/camera/__init__.py b/mmpose/core/camera/__init__.py new file mode 100644 index 0000000..a4a3c55 --- /dev/null +++ b/mmpose/core/camera/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .camera_base import CAMERAS +from .single_camera import SimpleCamera +from .single_camera_torch import SimpleCameraTorch + +__all__ = ['CAMERAS', 'SimpleCamera', 'SimpleCameraTorch'] diff --git a/mmpose/core/camera/camera_base.py b/mmpose/core/camera/camera_base.py new file mode 100644 index 0000000..28b23e7 --- /dev/null +++ b/mmpose/core/camera/camera_base.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + +from mmcv.utils import Registry + +CAMERAS = Registry('camera') + + +class SingleCameraBase(metaclass=ABCMeta): + """Base class for single camera model. 
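For orientation, the `train_model` entry point defined in train.py above is normally driven by a thin training script that loads a config, builds the model and datasets, and hands everything over. A condensed sketch under assumed config fields; `build_posenet` is assumed to be the model builder exposed by `mmpose.models` in this code base:

import mmcv
from mmpose.datasets import build_dataset
from mmpose.models import build_posenet  # assumed model builder

cfg = mmcv.Config.fromfile('configs/pose/ViTPose_base_coco_256x192.py')
cfg.work_dir = 'work_dirs/vitpose_base'
cfg.gpu_ids = range(1)
cfg.seed = init_random_seed(cfg.get('seed'))

model = build_posenet(cfg.model)
datasets = [build_dataset(cfg.data.train)]
train_model(model, datasets, cfg, distributed=False, validate=True,
            meta=dict(seed=cfg.seed))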
+ + Args: + param (dict): Camera parameters + + Methods: + world_to_camera: Project points from world coordinates to camera + coordinates + camera_to_world: Project points from camera coordinates to world + coordinates + camera_to_pixel: Project points from camera coordinates to pixel + coordinates + world_to_pixel: Project points from world coordinates to pixel + coordinates + """ + + @abstractmethod + def __init__(self, param): + """Load camera parameters and check validity.""" + + def world_to_camera(self, X): + """Project points from world coordinates to camera coordinates.""" + raise NotImplementedError + + def camera_to_world(self, X): + """Project points from camera coordinates to world coordinates.""" + raise NotImplementedError + + def camera_to_pixel(self, X): + """Project points from camera coordinates to pixel coordinates.""" + raise NotImplementedError + + def world_to_pixel(self, X): + """Project points from world coordinates to pixel coordinates.""" + _X = self.world_to_camera(X) + return self.camera_to_pixel(_X) diff --git a/mmpose/core/camera/single_camera.py b/mmpose/core/camera/single_camera.py new file mode 100644 index 0000000..cabd799 --- /dev/null +++ b/mmpose/core/camera/single_camera.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + +from .camera_base import CAMERAS, SingleCameraBase + + +@CAMERAS.register_module() +class SimpleCamera(SingleCameraBase): + """Camera model to calculate coordinate transformation with given + intrinsic/extrinsic camera parameters. + + Note: + The keypoint coordinate should be an np.ndarray with a shape of + [...,J, C] where J is the keypoint number of an instance, and C is + the coordinate dimension. For example: + + [J, C]: shape of joint coordinates of a person with J joints. + [N, J, C]: shape of a batch of person joint coordinates. + [N, T, J, C]: shape of a batch of pose sequences. + + Args: + param (dict): camera parameters including: + - R: 3x3, camera rotation matrix (camera-to-world) + - T: 3x1, camera translation (camera-to-world) + - K: (optional) 2x3, camera intrinsic matrix + - k: (optional) nx1, camera radial distortion coefficients + - p: (optional) mx1, camera tangential distortion coefficients + - f: (optional) 2x1, camera focal length + - c: (optional) 2x1, camera center + if K is not provided, it will be calculated from f and c. + + Methods: + world_to_camera: Project points from world coordinates to camera + coordinates + camera_to_pixel: Project points from camera coordinates to pixel + coordinates + world_to_pixel: Project points from world coordinates to pixel + coordinates + """ + + def __init__(self, param): + + self.param = {} + # extrinsic param + R = np.array(param['R'], dtype=np.float32) + T = np.array(param['T'], dtype=np.float32) + assert R.shape == (3, 3) + assert T.shape == (3, 1) + # The camera matrices are transposed in advance because the joint + # coordinates are stored as row vectors. 
+ self.param['R_c2w'] = R.T + self.param['T_c2w'] = T.T + self.param['R_w2c'] = R + self.param['T_w2c'] = -self.param['T_c2w'] @ self.param['R_w2c'] + + # intrinsic param + if 'K' in param: + K = np.array(param['K'], dtype=np.float32) + assert K.shape == (2, 3) + self.param['K'] = K.T + self.param['f'] = np.array([K[0, 0], K[1, 1]])[:, np.newaxis] + self.param['c'] = np.array([K[0, 2], K[1, 2]])[:, np.newaxis] + elif 'f' in param and 'c' in param: + f = np.array(param['f'], dtype=np.float32) + c = np.array(param['c'], dtype=np.float32) + assert f.shape == (2, 1) + assert c.shape == (2, 1) + self.param['K'] = np.concatenate((np.diagflat(f), c), axis=-1).T + self.param['f'] = f + self.param['c'] = c + else: + raise ValueError('Camera intrinsic parameters are missing. ' + 'Either "K" or "f"&"c" should be provided.') + + # distortion param + if 'k' in param and 'p' in param: + self.undistortion = True + self.param['k'] = np.array(param['k'], dtype=np.float32).flatten() + self.param['p'] = np.array(param['p'], dtype=np.float32).flatten() + assert self.param['k'].size in {3, 6} + assert self.param['p'].size == 2 + else: + self.undistortion = False + + def world_to_camera(self, X): + assert isinstance(X, np.ndarray) + assert X.ndim >= 2 and X.shape[-1] == 3 + return X @ self.param['R_w2c'] + self.param['T_w2c'] + + def camera_to_world(self, X): + assert isinstance(X, np.ndarray) + assert X.ndim >= 2 and X.shape[-1] == 3 + return X @ self.param['R_c2w'] + self.param['T_c2w'] + + def camera_to_pixel(self, X): + assert isinstance(X, np.ndarray) + assert X.ndim >= 2 and X.shape[-1] == 3 + + _X = X / X[..., 2:] + + if self.undistortion: + k = self.param['k'] + p = self.param['p'] + _X_2d = _X[..., :2] + r2 = (_X_2d**2).sum(-1) + radial = 1 + sum(ki * r2**(i + 1) for i, ki in enumerate(k[:3])) + if k.size == 6: + radial /= 1 + sum( + (ki * r2**(i + 1) for i, ki in enumerate(k[3:]))) + + tangential = 2 * (p[1] * _X[..., 0] + p[0] * _X[..., 1]) + + _X[..., :2] = _X_2d * (radial + tangential)[..., None] + np.outer( + r2, p[::-1]).reshape(_X_2d.shape) + return _X @ self.param['K'] + + def pixel_to_camera(self, X): + assert isinstance(X, np.ndarray) + assert X.ndim >= 2 and X.shape[-1] == 3 + _X = X.copy() + _X[:, :2] = (X[:, :2] - self.param['c'].T) / self.param['f'].T * X[:, + [2]] + return _X diff --git a/mmpose/core/camera/single_camera_torch.py b/mmpose/core/camera/single_camera_torch.py new file mode 100644 index 0000000..22eb72f --- /dev/null +++ b/mmpose/core/camera/single_camera_torch.py @@ -0,0 +1,118 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from .camera_base import CAMERAS, SingleCameraBase + + +@CAMERAS.register_module() +class SimpleCameraTorch(SingleCameraBase): + """Camera model to calculate coordinate transformation with given + intrinsic/extrinsic camera parameters. + + Notes: + The keypoint coordinate should be an np.ndarray with a shape of + [...,J, C] where J is the keypoint number of an instance, and C is + the coordinate dimension. For example: + + [J, C]: shape of joint coordinates of a person with J joints. + [N, J, C]: shape of a batch of person joint coordinates. + [N, T, J, C]: shape of a batch of pose sequences. 
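A self-contained sketch of the projection chain implemented by SimpleCamera above, using made-up parameters (identity rotation, zero translation, 1000 px focal length, principal point at (512, 512)):

import numpy as np

param = dict(
    R=np.eye(3),                      # camera-to-world rotation
    T=np.zeros((3, 1)),               # camera-to-world translation
    f=np.array([[1000.], [1000.]]),   # focal lengths, shape (2, 1)
    c=np.array([[512.], [512.]]))     # principal point, shape (2, 1)
cam = SimpleCamera(param)

joints_world = np.random.rand(17, 3) + [0., 0., 3.]   # 17 joints in front of the camera
joints_cam = cam.world_to_camera(joints_world)        # [17, 3] camera coordinates
joints_uv = cam.camera_to_pixel(joints_cam)           # [17, 2] pixel coordinates
assert np.allclose(cam.world_to_pixel(joints_world), joints_uv)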
+ + Args: + param (dict): camera parameters including: + - R: 3x3, camera rotation matrix (camera-to-world) + - T: 3x1, camera translation (camera-to-world) + - K: (optional) 2x3, camera intrinsic matrix + - k: (optional) nx1, camera radial distortion coefficients + - p: (optional) mx1, camera tangential distortion coefficients + - f: (optional) 2x1, camera focal length + - c: (optional) 2x1, camera center + if K is not provided, it will be calculated from f and c. + + Methods: + world_to_camera: Project points from world coordinates to camera + coordinates + camera_to_pixel: Project points from camera coordinates to pixel + coordinates + world_to_pixel: Project points from world coordinates to pixel + coordinates + """ + + def __init__(self, param, device): + + self.param = {} + # extrinsic param + R = torch.tensor(param['R'], device=device) + T = torch.tensor(param['T'], device=device) + + assert R.shape == (3, 3) + assert T.shape == (3, 1) + # The camera matrices are transposed in advance because the joint + # coordinates are stored as row vectors. + self.param['R_c2w'] = R.T + self.param['T_c2w'] = T.T + self.param['R_w2c'] = R + self.param['T_w2c'] = -self.param['T_c2w'] @ self.param['R_w2c'] + + # intrinsic param + if 'K' in param: + K = torch.tensor(param['K'], device=device) + assert K.shape == (2, 3) + self.param['K'] = K.T + self.param['f'] = torch.tensor([[K[0, 0]], [K[1, 1]]], + device=device) + self.param['c'] = torch.tensor([[K[0, 2]], [K[1, 2]]], + device=device) + elif 'f' in param and 'c' in param: + f = torch.tensor(param['f'], device=device) + c = torch.tensor(param['c'], device=device) + assert f.shape == (2, 1) + assert c.shape == (2, 1) + self.param['K'] = torch.cat([torch.diagflat(f), c], dim=-1).T + self.param['f'] = f + self.param['c'] = c + else: + raise ValueError('Camera intrinsic parameters are missing. ' + 'Either "K" or "f"&"c" should be provided.') + + # distortion param + if 'k' in param and 'p' in param: + self.undistortion = True + self.param['k'] = torch.tensor(param['k'], device=device).view(-1) + self.param['p'] = torch.tensor(param['p'], device=device).view(-1) + assert len(self.param['k']) in {3, 6} + assert len(self.param['p']) == 2 + else: + self.undistortion = False + + def world_to_camera(self, X): + assert isinstance(X, torch.Tensor) + assert X.ndim >= 2 and X.shape[-1] == 3 + return X @ self.param['R_w2c'] + self.param['T_w2c'] + + def camera_to_world(self, X): + assert isinstance(X, torch.Tensor) + assert X.ndim >= 2 and X.shape[-1] == 3 + return X @ self.param['R_c2w'] + self.param['T_c2w'] + + def camera_to_pixel(self, X): + assert isinstance(X, torch.Tensor) + assert X.ndim >= 2 and X.shape[-1] == 3 + + _X = X / X[..., 2:] + + if self.undistortion: + k = self.param['k'] + p = self.param['p'] + _X_2d = _X[..., :2] + r2 = (_X_2d**2).sum(-1) + radial = 1 + sum(ki * r2**(i + 1) for i, ki in enumerate(k[:3])) + if k.size == 6: + radial /= 1 + sum( + (ki * r2**(i + 1) for i, ki in enumerate(k[3:]))) + + tangential = 2 * (p[1] * _X[..., 0] + p[0] * _X[..., 1]) + + _X[..., :2] = _X_2d * (radial + tangential)[..., None] + torch.ger( + r2, p.flip([0])).reshape(_X_2d.shape) + return _X @ self.param['K'] diff --git a/mmpose/core/distributed_wrapper.py b/mmpose/core/distributed_wrapper.py new file mode 100644 index 0000000..c67acee --- /dev/null +++ b/mmpose/core/distributed_wrapper.py @@ -0,0 +1,143 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +import torch.nn as nn +from mmcv.parallel import MODULE_WRAPPERS as MMCV_MODULE_WRAPPERS +from mmcv.parallel import MMDistributedDataParallel +from mmcv.parallel.scatter_gather import scatter_kwargs +from mmcv.utils import Registry +from torch.cuda._utils import _get_device_index + +MODULE_WRAPPERS = Registry('module wrapper', parent=MMCV_MODULE_WRAPPERS) + + +@MODULE_WRAPPERS.register_module() +class DistributedDataParallelWrapper(nn.Module): + """A DistributedDataParallel wrapper for models in 3D mesh estimation task. + + In 3D mesh estimation task, there is a need to wrap different modules in + the models with separate DistributedDataParallel. Otherwise, it will cause + errors for GAN training. + More specific, the GAN model, usually has two sub-modules: + generator and discriminator. If we wrap both of them in one + standard DistributedDataParallel, it will cause errors during training, + because when we update the parameters of the generator (or discriminator), + the parameters of the discriminator (or generator) is not updated, which is + not allowed for DistributedDataParallel. + So we design this wrapper to separately wrap DistributedDataParallel + for generator and discriminator. + + In this wrapper, we perform two operations: + 1. Wrap the modules in the models with separate MMDistributedDataParallel. + Note that only modules with parameters will be wrapped. + 2. Do scatter operation for 'forward', 'train_step' and 'val_step'. + + Note that the arguments of this wrapper is the same as those in + `torch.nn.parallel.distributed.DistributedDataParallel`. + + Args: + module (nn.Module): Module that needs to be wrapped. + device_ids (list[int | `torch.device`]): Same as that in + `torch.nn.parallel.distributed.DistributedDataParallel`. + dim (int, optional): Same as that in the official scatter function in + pytorch. Defaults to 0. + broadcast_buffers (bool): Same as that in + `torch.nn.parallel.distributed.DistributedDataParallel`. + Defaults to False. + find_unused_parameters (bool, optional): Same as that in + `torch.nn.parallel.distributed.DistributedDataParallel`. + Traverse the autograd graph of all tensors contained in returned + value of the wrapped module’s forward function. Defaults to False. + kwargs (dict): Other arguments used in + `torch.nn.parallel.distributed.DistributedDataParallel`. + """ + + def __init__(self, + module, + device_ids, + dim=0, + broadcast_buffers=False, + find_unused_parameters=False, + **kwargs): + super().__init__() + assert len(device_ids) == 1, ( + 'Currently, DistributedDataParallelWrapper only supports one' + 'single CUDA device for each process.' + f'The length of device_ids must be 1, but got {len(device_ids)}.') + self.module = module + self.dim = dim + self.to_ddp( + device_ids=device_ids, + dim=dim, + broadcast_buffers=broadcast_buffers, + find_unused_parameters=find_unused_parameters, + **kwargs) + self.output_device = _get_device_index(device_ids[0], True) + + def to_ddp(self, device_ids, dim, broadcast_buffers, + find_unused_parameters, **kwargs): + """Wrap models with separate MMDistributedDataParallel. + + It only wraps the modules with parameters. 
+ """ + for name, module in self.module._modules.items(): + if next(module.parameters(), None) is None: + module = module.cuda() + elif all(not p.requires_grad for p in module.parameters()): + module = module.cuda() + else: + module = MMDistributedDataParallel( + module.cuda(), + device_ids=device_ids, + dim=dim, + broadcast_buffers=broadcast_buffers, + find_unused_parameters=find_unused_parameters, + **kwargs) + self.module._modules[name] = module + + def scatter(self, inputs, kwargs, device_ids): + """Scatter function. + + Args: + inputs (Tensor): Input Tensor. + kwargs (dict): Args for + ``mmcv.parallel.scatter_gather.scatter_kwargs``. + device_ids (int): Device id. + """ + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) + + def forward(self, *inputs, **kwargs): + """Forward function. + + Args: + inputs (tuple): Input data. + kwargs (dict): Args for + ``mmcv.parallel.scatter_gather.scatter_kwargs``. + """ + inputs, kwargs = self.scatter(inputs, kwargs, + [torch.cuda.current_device()]) + return self.module(*inputs[0], **kwargs[0]) + + def train_step(self, *inputs, **kwargs): + """Train step function. + + Args: + inputs (Tensor): Input Tensor. + kwargs (dict): Args for + ``mmcv.parallel.scatter_gather.scatter_kwargs``. + """ + inputs, kwargs = self.scatter(inputs, kwargs, + [torch.cuda.current_device()]) + output = self.module.train_step(*inputs[0], **kwargs[0]) + return output + + def val_step(self, *inputs, **kwargs): + """Validation step function. + + Args: + inputs (tuple): Input data. + kwargs (dict): Args for ``scatter_kwargs``. + """ + inputs, kwargs = self.scatter(inputs, kwargs, + [torch.cuda.current_device()]) + output = self.module.val_step(*inputs[0], **kwargs[0]) + return output diff --git a/mmpose/core/evaluation/__init__.py b/mmpose/core/evaluation/__init__.py new file mode 100644 index 0000000..5f93784 --- /dev/null +++ b/mmpose/core/evaluation/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bottom_up_eval import (aggregate_scale, aggregate_stage_flip, + flip_feature_maps, get_group_preds, + split_ae_outputs) +from .eval_hooks import DistEvalHook, EvalHook +from .mesh_eval import compute_similarity_transform +from .pose3d_eval import keypoint_3d_auc, keypoint_3d_pck, keypoint_mpjpe +from .top_down_eval import (keypoint_auc, keypoint_epe, keypoint_pck_accuracy, + keypoints_from_heatmaps, keypoints_from_heatmaps3d, + keypoints_from_regression, + multilabel_classification_accuracy, + pose_pck_accuracy, post_dark_udp) + +__all__ = [ + 'EvalHook', 'DistEvalHook', 'pose_pck_accuracy', 'keypoints_from_heatmaps', + 'keypoints_from_regression', 'keypoint_pck_accuracy', 'keypoint_3d_pck', + 'keypoint_3d_auc', 'keypoint_auc', 'keypoint_epe', 'get_group_preds', + 'split_ae_outputs', 'flip_feature_maps', 'aggregate_stage_flip', + 'aggregate_scale', 'compute_similarity_transform', 'post_dark_udp', + 'keypoint_mpjpe', 'keypoints_from_heatmaps3d', + 'multilabel_classification_accuracy' +] diff --git a/mmpose/core/evaluation/bottom_up_eval.py b/mmpose/core/evaluation/bottom_up_eval.py new file mode 100644 index 0000000..7b37d7c --- /dev/null +++ b/mmpose/core/evaluation/bottom_up_eval.py @@ -0,0 +1,333 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmpose.core.post_processing import (get_warp_matrix, transform_preds, + warp_affine_joints) + + +def split_ae_outputs(outputs, num_joints, with_heatmaps, with_ae, + select_output_index): + """Split multi-stage outputs into heatmaps & tags. 
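A rough sketch of how the wrapper above is meant to be used for a two-branch (generator/discriminator style) model. It assumes torch.distributed has already been initialised for the current process (e.g. via mmcv's init_dist); the module and tensor names are made up for illustration:

import torch
import torch.nn as nn

class TwoBranchModel(nn.Module):
    """Toy model whose two sub-modules get wrapped independently."""

    def __init__(self):
        super().__init__()
        self.generator = nn.Linear(64, 64)
        self.discriminator = nn.Linear(64, 1)

    def train_step(self, data, optimizer=None):
        return {'loss': self.discriminator(self.generator(data)).mean()}

model = DistributedDataParallelWrapper(
    TwoBranchModel(),                          # each sub-module gets its own DDP
    device_ids=[torch.cuda.current_device()],
    broadcast_buffers=False,
    find_unused_parameters=False)
outputs = model.train_step(torch.randn(8, 64).cuda())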
+ + Args: + outputs (list(Tensor)): Outputs of network + num_joints (int): Number of joints + with_heatmaps (list[bool]): Option to output + heatmaps for different stages. + with_ae (list[bool]): Option to output + ae tags for different stages. + select_output_index (list[int]): Output keep the selected index + + Returns: + tuple: A tuple containing multi-stage outputs. + + - list[Tensor]: multi-stage heatmaps. + - list[Tensor]: multi-stage tags. + """ + + heatmaps = [] + tags = [] + + # aggregate heatmaps from different stages + for i, output in enumerate(outputs): + if i not in select_output_index: + continue + # staring index of the associative embeddings + offset_feat = num_joints if with_heatmaps[i] else 0 + if with_heatmaps[i]: + heatmaps.append(output[:, :num_joints]) + if with_ae[i]: + tags.append(output[:, offset_feat:]) + + return heatmaps, tags + + +def flip_feature_maps(feature_maps, flip_index=None): + """Flip the feature maps and swap the channels. + + Args: + feature_maps (list[Tensor]): Feature maps. + flip_index (list[int] | None): Channel-flip indexes. + If None, do not flip channels. + + Returns: + list[Tensor]: Flipped feature_maps. + """ + flipped_feature_maps = [] + for feature_map in feature_maps: + feature_map = torch.flip(feature_map, [3]) + if flip_index is not None: + flipped_feature_maps.append(feature_map[:, flip_index, :, :]) + else: + flipped_feature_maps.append(feature_map) + + return flipped_feature_maps + + +def _resize_average(feature_maps, align_corners, index=-1, resize_size=None): + """Resize the feature maps and compute the average. + + Args: + feature_maps (list[Tensor]): Feature maps. + align_corners (bool): Align corners when performing interpolation. + index (int): Only used when `resize_size' is None. + If `resize_size' is None, the target size is the size + of the indexed feature maps. + resize_size (list[int, int]): The target size [w, h]. + + Returns: + list[Tensor]: Averaged feature_maps. + """ + + if feature_maps is None: + return None + feature_maps_avg = 0 + + feature_map_list = _resize_concate( + feature_maps, align_corners, index=index, resize_size=resize_size) + for feature_map in feature_map_list: + feature_maps_avg += feature_map + + feature_maps_avg /= len(feature_map_list) + return [feature_maps_avg] + + +def _resize_unsqueeze_concat(feature_maps, + align_corners, + index=-1, + resize_size=None): + """Resize, unsqueeze and concatenate the feature_maps. + + Args: + feature_maps (list[Tensor]): Feature maps. + align_corners (bool): Align corners when performing interpolation. + index (int): Only used when `resize_size' is None. + If `resize_size' is None, the target size is the size + of the indexed feature maps. + resize_size (list[int, int]): The target size [w, h]. + + Returns: + list[Tensor]: Averaged feature_maps. + """ + if feature_maps is None: + return None + feature_map_list = _resize_concate( + feature_maps, align_corners, index=index, resize_size=resize_size) + + feat_dim = len(feature_map_list[0].shape) - 1 + output_feature_maps = torch.cat( + [torch.unsqueeze(fmap, dim=feat_dim + 1) for fmap in feature_map_list], + dim=feat_dim + 1) + return [output_feature_maps] + + +def _resize_concate(feature_maps, align_corners, index=-1, resize_size=None): + """Resize and concatenate the feature_maps. + + Args: + feature_maps (list[Tensor]): Feature maps. + align_corners (bool): Align corners when performing interpolation. + index (int): Only used when `resize_size' is None. 
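A small sketch of the splitting and flipping helpers above, applied to fake two-stage network outputs for 17 COCO keypoints (the flip index below is the usual COCO left/right swap and is an assumption of this example):

import torch

# Stage 0 predicts 17 heatmap channels followed by 17 tag channels;
# stage 1 predicts heatmaps only, at a higher resolution.
outputs = [torch.randn(1, 34, 128, 128), torch.randn(1, 17, 256, 256)]
heatmaps, tags = split_ae_outputs(
    outputs, num_joints=17,
    with_heatmaps=[True, True], with_ae=[True, False],
    select_output_index=[0, 1])
# heatmaps -> two tensors with 17 channels each; tags -> one tensor with 17 channels

# Swap left/right keypoint channels when horizontally flipping the maps.
flip_index = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
heatmaps_flipped = flip_feature_maps(heatmaps, flip_index=flip_index)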
+ If `resize_size' is None, the target size is the size + of the indexed feature maps. + resize_size (list[int, int]): The target size [w, h]. + + Returns: + list[Tensor]: Averaged feature_maps. + """ + if feature_maps is None: + return None + + feature_map_list = [] + + if index < 0: + index += len(feature_maps) + + if resize_size is None: + resize_size = (feature_maps[index].size(2), + feature_maps[index].size(3)) + + for feature_map in feature_maps: + ori_size = (feature_map.size(2), feature_map.size(3)) + if ori_size != resize_size: + feature_map = torch.nn.functional.interpolate( + feature_map, + size=resize_size, + mode='bilinear', + align_corners=align_corners) + + feature_map_list.append(feature_map) + + return feature_map_list + + +def aggregate_stage_flip(feature_maps, + feature_maps_flip, + index=-1, + project2image=True, + size_projected=None, + align_corners=False, + aggregate_stage='concat', + aggregate_flip='average'): + """Inference the model to get multi-stage outputs (heatmaps & tags), and + resize them to base sizes. + + Args: + feature_maps (list[Tensor]): feature_maps can be heatmaps, + tags, and pafs. + feature_maps_flip (list[Tensor] | None): flipped feature_maps. + feature maps can be heatmaps, tags, and pafs. + project2image (bool): Option to resize to base scale. + size_projected (list[int, int]): Base size of heatmaps [w, h]. + align_corners (bool): Align corners when performing interpolation. + aggregate_stage (str): Methods to aggregate multi-stage feature maps. + Options: 'concat', 'average'. Default: 'concat. + + - 'concat': Concatenate the original and the flipped feature maps. + - 'average': Get the average of the original and the flipped + feature maps. + aggregate_flip (str): Methods to aggregate the original and + the flipped feature maps. Options: 'concat', 'average', 'none'. + Default: 'average. + + - 'concat': Concatenate the original and the flipped feature maps. + - 'average': Get the average of the original and the flipped + feature maps.. + - 'none': no flipped feature maps. + + Returns: + list[Tensor]: Aggregated feature maps with shape [NxKxWxH]. 
+ """ + + if feature_maps_flip is None: + aggregate_flip = 'none' + + output_feature_maps = [] + + if aggregate_stage == 'average': + _aggregate_stage_func = _resize_average + elif aggregate_stage == 'concat': + _aggregate_stage_func = _resize_concate + else: + NotImplementedError() + + if project2image and size_projected: + _origin = _aggregate_stage_func( + feature_maps, + align_corners, + index=index, + resize_size=(size_projected[1], size_projected[0])) + + _flipped = _aggregate_stage_func( + feature_maps_flip, + align_corners, + index=index, + resize_size=(size_projected[1], size_projected[0])) + else: + _origin = _aggregate_stage_func( + feature_maps, align_corners, index=index, resize_size=None) + _flipped = _aggregate_stage_func( + feature_maps_flip, align_corners, index=index, resize_size=None) + + if aggregate_flip == 'average': + assert feature_maps_flip is not None + for _ori, _fli in zip(_origin, _flipped): + output_feature_maps.append((_ori + _fli) / 2.0) + + elif aggregate_flip == 'concat': + assert feature_maps_flip is not None + output_feature_maps.append(*_origin) + output_feature_maps.append(*_flipped) + + elif aggregate_flip == 'none': + if isinstance(_origin, list): + output_feature_maps.append(*_origin) + else: + output_feature_maps.append(_origin) + else: + NotImplementedError() + + return output_feature_maps + + +def aggregate_scale(feature_maps_list, + align_corners=False, + aggregate_scale='average'): + """Aggregate multi-scale outputs. + + Note: + batch size: N + keypoints num : K + heatmap width: W + heatmap height: H + + Args: + feature_maps_list (list[Tensor]): Aggregated feature maps. + project2image (bool): Option to resize to base scale. + align_corners (bool): Align corners when performing interpolation. + aggregate_scale (str): Methods to aggregate multi-scale feature maps. + Options: 'average', 'unsqueeze_concat'. + + - 'average': Get the average of the feature maps. + - 'unsqueeze_concat': Concatenate the feature maps along new axis. + Default: 'average. + + Returns: + Tensor: Aggregated feature maps. + """ + + if aggregate_scale == 'average': + output_feature_maps = _resize_average( + feature_maps_list, align_corners, index=0, resize_size=None) + + elif aggregate_scale == 'unsqueeze_concat': + output_feature_maps = _resize_unsqueeze_concat( + feature_maps_list, align_corners, index=0, resize_size=None) + else: + NotImplementedError() + + return output_feature_maps[0] + + +def get_group_preds(grouped_joints, + center, + scale, + heatmap_size, + use_udp=False): + """Transform the grouped joints back to the image. + + Args: + grouped_joints (list): Grouped person joints. + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + heatmap_size (np.ndarray[2, ]): Size of the destination heatmaps. + use_udp (bool): Unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR'2020). + + Returns: + list: List of the pose result for each person. 
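Continuing the flip-test idea, a hedged sketch of how the two aggregation helpers above are typically chained; the tensors, sizes and flip index are made up:

import torch

flip_index = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
heatmaps = [torch.randn(1, 17, 128, 128)]
heatmaps_flipped = flip_feature_maps(heatmaps, flip_index=flip_index)

# Average original and flipped predictions and project them to the
# network input resolution (w=512, h=512).
aggregated = aggregate_stage_flip(
    heatmaps, heatmaps_flipped,
    index=-1, project2image=True, size_projected=(512, 512),
    align_corners=False, aggregate_stage='average', aggregate_flip='average')

# If inference ran at several image scales, average the per-scale maps.
final_heatmaps = aggregate_scale(aggregated, align_corners=False,
                                 aggregate_scale='average')   # Tensor [1, 17, 512, 512]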
+ """ + if len(grouped_joints) == 0: + return [] + + if use_udp: + if grouped_joints[0].shape[0] > 0: + heatmap_size_t = np.array(heatmap_size, dtype=np.float32) - 1.0 + trans = get_warp_matrix( + theta=0, + size_input=heatmap_size_t, + size_dst=scale, + size_target=heatmap_size_t) + grouped_joints[0][..., :2] = \ + warp_affine_joints(grouped_joints[0][..., :2], trans) + results = [person for person in grouped_joints[0]] + else: + results = [] + for person in grouped_joints[0]: + joints = transform_preds(person, center, scale, heatmap_size) + results.append(joints) + + return results diff --git a/mmpose/core/evaluation/eval_hooks.py b/mmpose/core/evaluation/eval_hooks.py new file mode 100644 index 0000000..cf36a03 --- /dev/null +++ b/mmpose/core/evaluation/eval_hooks.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv.runner import DistEvalHook as _DistEvalHook +from mmcv.runner import EvalHook as _EvalHook + +MMPOSE_GREATER_KEYS = [ + 'acc', 'ap', 'ar', 'pck', 'auc', '3dpck', 'p-3dpck', '3dauc', 'p-3dauc' +] +MMPOSE_LESS_KEYS = ['loss', 'epe', 'nme', 'mpjpe', 'p-mpjpe', 'n-mpjpe'] + + +class EvalHook(_EvalHook): + + def __init__(self, + dataloader, + start=None, + interval=1, + by_epoch=True, + save_best=None, + rule=None, + test_fn=None, + greater_keys=MMPOSE_GREATER_KEYS, + less_keys=MMPOSE_LESS_KEYS, + **eval_kwargs): + + if test_fn is None: + from mmpose.apis import single_gpu_test + test_fn = single_gpu_test + + # to be compatible with the config before v0.16.0 + + # remove "gpu_collect" from eval_kwargs + if 'gpu_collect' in eval_kwargs: + warnings.warn( + '"gpu_collect" will be deprecated in EvalHook.' + 'Please remove it from the config.', DeprecationWarning) + _ = eval_kwargs.pop('gpu_collect') + + # update "save_best" according to "key_indicator" and remove the + # latter from eval_kwargs + if 'key_indicator' in eval_kwargs or isinstance(save_best, bool): + warnings.warn( + '"key_indicator" will be deprecated in EvalHook.' + 'Please use "save_best" to specify the metric key,' + 'e.g., save_best="AP".', DeprecationWarning) + + key_indicator = eval_kwargs.pop('key_indicator', 'AP') + if save_best is True and key_indicator is None: + raise ValueError('key_indicator should not be None, when ' + 'save_best is set to True.') + save_best = key_indicator + + super().__init__(dataloader, start, interval, by_epoch, save_best, + rule, test_fn, greater_keys, less_keys, **eval_kwargs) + + +class DistEvalHook(_DistEvalHook): + + def __init__(self, + dataloader, + start=None, + interval=1, + by_epoch=True, + save_best=None, + rule=None, + test_fn=None, + greater_keys=MMPOSE_GREATER_KEYS, + less_keys=MMPOSE_LESS_KEYS, + broadcast_bn_buffer=True, + tmpdir=None, + gpu_collect=False, + **eval_kwargs): + + if test_fn is None: + from mmpose.apis import multi_gpu_test + test_fn = multi_gpu_test + + # to be compatible with the config before v0.16.0 + + # update "save_best" according to "key_indicator" and remove the + # latter from eval_kwargs + if 'key_indicator' in eval_kwargs or isinstance(save_best, bool): + warnings.warn( + '"key_indicator" will be deprecated in EvalHook.' 
+ 'Please use "save_best" to specify the metric key,' + 'e.g., save_best="AP".', DeprecationWarning) + + key_indicator = eval_kwargs.pop('key_indicator', 'AP') + if save_best is True and key_indicator is None: + raise ValueError('key_indicator should not be None, when ' + 'save_best is set to True.') + save_best = key_indicator + + super().__init__(dataloader, start, interval, by_epoch, save_best, + rule, test_fn, greater_keys, less_keys, + broadcast_bn_buffer, tmpdir, gpu_collect, + **eval_kwargs) diff --git a/mmpose/core/evaluation/mesh_eval.py b/mmpose/core/evaluation/mesh_eval.py new file mode 100644 index 0000000..683b453 --- /dev/null +++ b/mmpose/core/evaluation/mesh_eval.py @@ -0,0 +1,66 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/akanazawa/hmr +# Original licence: Copyright (c) 2018 akanazawa, under the MIT License. +# ------------------------------------------------------------------------------ + +import numpy as np + + +def compute_similarity_transform(source_points, target_points): + """Computes a similarity transform (sR, t) that takes a set of 3D points + source_points (N x 3) closest to a set of 3D points target_points, where R + is an 3x3 rotation matrix, t 3x1 translation, s scale. And return the + transformed 3D points source_points_hat (N x 3). i.e. solves the orthogonal + Procrutes problem. + + Note: + Points number: N + + Args: + source_points (np.ndarray): Source point set with shape [N, 3]. + target_points (np.ndarray): Target point set with shape [N, 3]. + + Returns: + np.ndarray: Transformed source point set with shape [N, 3]. + """ + + assert target_points.shape[0] == source_points.shape[0] + assert target_points.shape[1] == 3 and source_points.shape[1] == 3 + + source_points = source_points.T + target_points = target_points.T + + # 1. Remove mean. + mu1 = source_points.mean(axis=1, keepdims=True) + mu2 = target_points.mean(axis=1, keepdims=True) + X1 = source_points - mu1 + X2 = target_points - mu2 + + # 2. Compute variance of X1 used for scale. + var1 = np.sum(X1**2) + + # 3. The outer product of X1 and X2. + K = X1.dot(X2.T) + + # 4. Solution that Maximizes trace(R'K) is R=U*V', where U, V are + # singular vectors of K. + U, _, Vh = np.linalg.svd(K) + V = Vh.T + # Construct Z that fixes the orientation of R to get det(R)=1. + Z = np.eye(U.shape[0]) + Z[-1, -1] *= np.sign(np.linalg.det(U.dot(V.T))) + # Construct R. + R = V.dot(Z.dot(U.T)) + + # 5. Recover scale. + scale = np.trace(R.dot(K)) / var1 + + # 6. Recover translation. + t = mu2 - scale * (R.dot(mu1)) + + # 7. Transform the source points: + source_points_hat = scale * R.dot(source_points) + t + + source_points_hat = source_points_hat.T + + return source_points_hat diff --git a/mmpose/core/evaluation/pose3d_eval.py b/mmpose/core/evaluation/pose3d_eval.py new file mode 100644 index 0000000..545778c --- /dev/null +++ b/mmpose/core/evaluation/pose3d_eval.py @@ -0,0 +1,171 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + +from .mesh_eval import compute_similarity_transform + + +def keypoint_mpjpe(pred, gt, mask, alignment='none'): + """Calculate the mean per-joint position error (MPJPE) and the error after + rigid alignment with the ground truth (P-MPJPE). + + Note: + - batch_size: N + - num_keypoints: K + - keypoint_dims: C + + Args: + pred (np.ndarray): Predicted keypoint location with shape [N, K, C]. + gt (np.ndarray): Groundtruth keypoint location with shape [N, K, C]. 
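A quick numerical check of the Procrustes alignment above: build a scaled, rotated, translated copy of a point set and verify that the alignment maps it back onto the original (all names are illustrative only):

import numpy as np

target = np.random.rand(17, 3)                     # reference 3D joints
Q, _ = np.linalg.qr(np.random.randn(3, 3))
if np.linalg.det(Q) < 0:
    Q[:, 0] *= -1                                  # force a proper rotation
source = 0.5 * target @ Q.T + np.array([1.0, -2.0, 0.3])

aligned = compute_similarity_transform(source, target)
assert np.allclose(aligned, target)                # scale/rotation/translation recovered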
+ mask (np.ndarray): Visibility of the target with shape [N, K]. + False for invisible joints, and True for visible. + Invisible joints will be ignored for accuracy calculation. + alignment (str, optional): method to align the prediction with the + groundtruth. Supported options are: + + - ``'none'``: no alignment will be applied + - ``'scale'``: align in the least-square sense in scale + - ``'procrustes'``: align in the least-square sense in + scale, rotation and translation. + Returns: + tuple: A tuple containing joint position errors + + - (float | np.ndarray): mean per-joint position error (mpjpe). + - (float | np.ndarray): mpjpe after rigid alignment with the + ground truth (p-mpjpe). + """ + assert mask.any() + + if alignment == 'none': + pass + elif alignment == 'procrustes': + pred = np.stack([ + compute_similarity_transform(pred_i, gt_i) + for pred_i, gt_i in zip(pred, gt) + ]) + elif alignment == 'scale': + pred_dot_pred = np.einsum('nkc,nkc->n', pred, pred) + pred_dot_gt = np.einsum('nkc,nkc->n', pred, gt) + scale_factor = pred_dot_gt / pred_dot_pred + pred = pred * scale_factor[:, None, None] + else: + raise ValueError(f'Invalid value for alignment: {alignment}') + + error = np.linalg.norm(pred - gt, ord=2, axis=-1)[mask].mean() + + return error + + +def keypoint_3d_pck(pred, gt, mask, alignment='none', threshold=0.15): + """Calculate the Percentage of Correct Keypoints (3DPCK) w. or w/o rigid + alignment. + + Paper ref: `Monocular 3D Human Pose Estimation In The Wild Using Improved + CNN Supervision' 3DV'2017. `__ . + + Note: + - batch_size: N + - num_keypoints: K + - keypoint_dims: C + + Args: + pred (np.ndarray[N, K, C]): Predicted keypoint location. + gt (np.ndarray[N, K, C]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + alignment (str, optional): method to align the prediction with the + groundtruth. Supported options are: + + - ``'none'``: no alignment will be applied + - ``'scale'``: align in the least-square sense in scale + - ``'procrustes'``: align in the least-square sense in scale, + rotation and translation. + + threshold: If L2 distance between the prediction and the groundtruth + is less then threshold, the predicted result is considered as + correct. Default: 0.15 (m). + + Returns: + pck: percentage of correct keypoints. + """ + assert mask.any() + + if alignment == 'none': + pass + elif alignment == 'procrustes': + pred = np.stack([ + compute_similarity_transform(pred_i, gt_i) + for pred_i, gt_i in zip(pred, gt) + ]) + elif alignment == 'scale': + pred_dot_pred = np.einsum('nkc,nkc->n', pred, pred) + pred_dot_gt = np.einsum('nkc,nkc->n', pred, gt) + scale_factor = pred_dot_gt / pred_dot_pred + pred = pred * scale_factor[:, None, None] + else: + raise ValueError(f'Invalid value for alignment: {alignment}') + + error = np.linalg.norm(pred - gt, ord=2, axis=-1) + pck = (error < threshold).astype(np.float32)[mask].mean() * 100 + + return pck + + +def keypoint_3d_auc(pred, gt, mask, alignment='none'): + """Calculate the Area Under the Curve (3DAUC) computed for a range of 3DPCK + thresholds. + + Paper ref: `Monocular 3D Human Pose Estimation In The Wild Using Improved + CNN Supervision' 3DV'2017. `__ . + This implementation is derived from mpii_compute_3d_pck.m, which is + provided as part of the MPI-INF-3DHP test data release. 
+ + Note: + batch_size: N + num_keypoints: K + keypoint_dims: C + + Args: + pred (np.ndarray[N, K, C]): Predicted keypoint location. + gt (np.ndarray[N, K, C]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + alignment (str, optional): method to align the prediction with the + groundtruth. Supported options are: + + - ``'none'``: no alignment will be applied + - ``'scale'``: align in the least-square sense in scale + - ``'procrustes'``: align in the least-square sense in scale, + rotation and translation. + + Returns: + auc: AUC computed for a range of 3DPCK thresholds. + """ + assert mask.any() + + if alignment == 'none': + pass + elif alignment == 'procrustes': + pred = np.stack([ + compute_similarity_transform(pred_i, gt_i) + for pred_i, gt_i in zip(pred, gt) + ]) + elif alignment == 'scale': + pred_dot_pred = np.einsum('nkc,nkc->n', pred, pred) + pred_dot_gt = np.einsum('nkc,nkc->n', pred, gt) + scale_factor = pred_dot_gt / pred_dot_pred + pred = pred * scale_factor[:, None, None] + else: + raise ValueError(f'Invalid value for alignment: {alignment}') + + error = np.linalg.norm(pred - gt, ord=2, axis=-1) + + thresholds = np.linspace(0., 0.15, 31) + pck_values = np.zeros(len(thresholds)) + for i in range(len(thresholds)): + pck_values[i] = (error < thresholds[i]).astype(np.float32)[mask].mean() + + auc = pck_values.mean() * 100 + + return auc diff --git a/mmpose/core/evaluation/top_down_eval.py b/mmpose/core/evaluation/top_down_eval.py new file mode 100644 index 0000000..ee6a250 --- /dev/null +++ b/mmpose/core/evaluation/top_down_eval.py @@ -0,0 +1,684 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import cv2 +import numpy as np + +from mmpose.core.post_processing import transform_preds + + +def _calc_distances(preds, targets, mask, normalize): + """Calculate the normalized distances between preds and target. + + Note: + batch_size: N + num_keypoints: K + dimension of keypoints: D (normally, D=2 or D=3) + + Args: + preds (np.ndarray[N, K, D]): Predicted keypoint location. + targets (np.ndarray[N, K, D]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + normalize (np.ndarray[N, D]): Typical value is heatmap_size + + Returns: + np.ndarray[K, N]: The normalized distances. \ + If target keypoints are missing, the distance is -1. + """ + N, K, _ = preds.shape + # set mask=0 when normalize==0 + _mask = mask.copy() + _mask[np.where((normalize == 0).sum(1))[0], :] = False + distances = np.full((N, K), -1, dtype=np.float32) + # handle invalid values + normalize[np.where(normalize <= 0)] = 1e6 + distances[_mask] = np.linalg.norm( + ((preds - targets) / normalize[:, None, :])[_mask], axis=-1) + return distances.T + + +def _distance_acc(distances, thr=0.5): + """Return the percentage below the distance threshold, while ignoring + distances values with -1. + + Note: + batch_size: N + Args: + distances (np.ndarray[N, ]): The normalized distances. + thr (float): Threshold of the distances. + + Returns: + float: Percentage of distances below the threshold. \ + If all target keypoints are missing, return -1. 
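The three 3D pose metrics above share the same input layout; a minimal sketch on synthetic data (treating the coordinates as metres is an assumption of this example):

import numpy as np

gt = np.random.rand(32, 17, 3)                      # 32 poses, 17 joints
pred = gt + 0.01 * np.random.randn(32, 17, 3)       # roughly 1 cm of noise
mask = np.ones((32, 17), dtype=bool)

mpjpe = keypoint_mpjpe(pred, gt, mask, alignment='none')
p_mpjpe = keypoint_mpjpe(pred, gt, mask, alignment='procrustes')
pck = keypoint_3d_pck(pred, gt, mask, threshold=0.15)    # percentage in [0, 100]
auc = keypoint_3d_auc(pred, gt, mask)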
+ """ + distance_valid = distances != -1 + num_distance_valid = distance_valid.sum() + if num_distance_valid > 0: + return (distances[distance_valid] < thr).sum() / num_distance_valid + return -1 + + +def _get_max_preds(heatmaps): + """Get keypoint predictions from score maps. + + Note: + batch_size: N + num_keypoints: K + heatmap height: H + heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. + + Returns: + tuple: A tuple containing aggregated results. + + - preds (np.ndarray[N, K, 2]): Predicted keypoint location. + - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. + """ + assert isinstance(heatmaps, + np.ndarray), ('heatmaps should be numpy.ndarray') + assert heatmaps.ndim == 4, 'batch_images should be 4-ndim' + + N, K, _, W = heatmaps.shape + heatmaps_reshaped = heatmaps.reshape((N, K, -1)) + idx = np.argmax(heatmaps_reshaped, 2).reshape((N, K, 1)) + maxvals = np.amax(heatmaps_reshaped, 2).reshape((N, K, 1)) + + preds = np.tile(idx, (1, 1, 2)).astype(np.float32) + preds[:, :, 0] = preds[:, :, 0] % W + preds[:, :, 1] = preds[:, :, 1] // W + + preds = np.where(np.tile(maxvals, (1, 1, 2)) > 0.0, preds, -1) + return preds, maxvals + + +def _get_max_preds_3d(heatmaps): + """Get keypoint predictions from 3D score maps. + + Note: + batch size: N + num keypoints: K + heatmap depth size: D + heatmap height: H + heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, D, H, W]): model predicted heatmaps. + + Returns: + tuple: A tuple containing aggregated results. + + - preds (np.ndarray[N, K, 3]): Predicted keypoint location. + - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. + """ + assert isinstance(heatmaps, np.ndarray), \ + ('heatmaps should be numpy.ndarray') + assert heatmaps.ndim == 5, 'heatmaps should be 5-ndim' + + N, K, D, H, W = heatmaps.shape + heatmaps_reshaped = heatmaps.reshape((N, K, -1)) + idx = np.argmax(heatmaps_reshaped, 2).reshape((N, K, 1)) + maxvals = np.amax(heatmaps_reshaped, 2).reshape((N, K, 1)) + + preds = np.zeros((N, K, 3), dtype=np.float32) + _idx = idx[..., 0] + preds[..., 2] = _idx // (H * W) + preds[..., 1] = (_idx // W) % H + preds[..., 0] = _idx % W + + preds = np.where(maxvals > 0.0, preds, -1) + return preds, maxvals + + +def pose_pck_accuracy(output, target, mask, thr=0.05, normalize=None): + """Calculate the pose accuracy of PCK for each individual keypoint and the + averaged accuracy across all keypoints from heatmaps. + + Note: + PCK metric measures accuracy of the localization of the body joints. + The distances between predicted positions and the ground-truth ones + are typically normalized by the bounding box size. + The threshold (thr) of the normalized distance is commonly set + as 0.05, 0.1 or 0.2 etc. + + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + output (np.ndarray[N, K, H, W]): Model output heatmaps. + target (np.ndarray[N, K, H, W]): Groundtruth heatmaps. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + thr (float): Threshold of PCK calculation. Default 0.05. + normalize (np.ndarray[N, 2]): Normalization factor for H&W. + + Returns: + tuple: A tuple containing keypoint accuracy. + + - np.ndarray[K]: Accuracy of each keypoint. + - float: Averaged accuracy across all keypoints. + - int: Number of valid keypoints. 
+ """ + N, K, H, W = output.shape + if K == 0: + return None, 0, 0 + if normalize is None: + normalize = np.tile(np.array([[H, W]]), (N, 1)) + + pred, _ = _get_max_preds(output) + gt, _ = _get_max_preds(target) + return keypoint_pck_accuracy(pred, gt, mask, thr, normalize) + + +def keypoint_pck_accuracy(pred, gt, mask, thr, normalize): + """Calculate the pose accuracy of PCK for each individual keypoint and the + averaged accuracy across all keypoints for coordinates. + + Note: + PCK metric measures accuracy of the localization of the body joints. + The distances between predicted positions and the ground-truth ones + are typically normalized by the bounding box size. + The threshold (thr) of the normalized distance is commonly set + as 0.05, 0.1 or 0.2 etc. + + - batch_size: N + - num_keypoints: K + + Args: + pred (np.ndarray[N, K, 2]): Predicted keypoint location. + gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + thr (float): Threshold of PCK calculation. + normalize (np.ndarray[N, 2]): Normalization factor for H&W. + + Returns: + tuple: A tuple containing keypoint accuracy. + + - acc (np.ndarray[K]): Accuracy of each keypoint. + - avg_acc (float): Averaged accuracy across all keypoints. + - cnt (int): Number of valid keypoints. + """ + distances = _calc_distances(pred, gt, mask, normalize) + + acc = np.array([_distance_acc(d, thr) for d in distances]) + valid_acc = acc[acc >= 0] + cnt = len(valid_acc) + avg_acc = valid_acc.mean() if cnt > 0 else 0 + return acc, avg_acc, cnt + + +def keypoint_auc(pred, gt, mask, normalize, num_step=20): + """Calculate the pose accuracy of PCK for each individual keypoint and the + averaged accuracy across all keypoints for coordinates. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + pred (np.ndarray[N, K, 2]): Predicted keypoint location. + gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + normalize (float): Normalization factor. + + Returns: + float: Area under curve. + """ + nor = np.tile(np.array([[normalize, normalize]]), (pred.shape[0], 1)) + x = [1.0 * i / num_step for i in range(num_step)] + y = [] + for thr in x: + _, avg_acc, _ = keypoint_pck_accuracy(pred, gt, mask, thr, nor) + y.append(avg_acc) + + auc = 0 + for i in range(num_step): + auc += 1.0 / num_step * y[i] + return auc + + +def keypoint_nme(pred, gt, mask, normalize_factor): + """Calculate the normalized mean error (NME). + + Note: + - batch_size: N + - num_keypoints: K + + Args: + pred (np.ndarray[N, K, 2]): Predicted keypoint location. + gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + normalize_factor (np.ndarray[N, 2]): Normalization factor. + + Returns: + float: normalized mean error + """ + distances = _calc_distances(pred, gt, mask, normalize_factor) + distance_valid = distances[distances != -1] + return distance_valid.sum() / max(1, len(distance_valid)) + + +def keypoint_epe(pred, gt, mask): + """Calculate the end-point error. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + pred (np.ndarray[N, K, 2]): Predicted keypoint location. 
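The 2D keypoint metrics above follow the same pattern; a short sketch with synthetic coordinates, normalising distances by a 64-pixel reference size:

import numpy as np

pred = np.random.rand(4, 17, 2) * 64                # predicted 2D joints
gt = pred + np.random.randn(4, 17, 2)               # ground truth close to predictions
mask = np.ones((4, 17), dtype=bool)
normalize = np.full((4, 2), 64.0)                   # per-sample [w, h] normaliser

acc, avg_acc, cnt = keypoint_pck_accuracy(pred, gt, mask, thr=0.05, normalize=normalize)
auc = keypoint_auc(pred, gt, mask, normalize=64.0, num_step=20)
epe = keypoint_epe(pred, gt, mask)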
+ gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + + Returns: + float: Average end-point error. + """ + + distances = _calc_distances( + pred, gt, mask, + np.ones((pred.shape[0], pred.shape[2]), dtype=np.float32)) + distance_valid = distances[distances != -1] + return distance_valid.sum() / max(1, len(distance_valid)) + + +def _taylor(heatmap, coord): + """Distribution aware coordinate decoding method. + + Note: + - heatmap height: H + - heatmap width: W + + Args: + heatmap (np.ndarray[H, W]): Heatmap of a particular joint type. + coord (np.ndarray[2,]): Coordinates of the predicted keypoints. + + Returns: + np.ndarray[2,]: Updated coordinates. + """ + H, W = heatmap.shape[:2] + px, py = int(coord[0]), int(coord[1]) + if 1 < px < W - 2 and 1 < py < H - 2: + dx = 0.5 * (heatmap[py][px + 1] - heatmap[py][px - 1]) + dy = 0.5 * (heatmap[py + 1][px] - heatmap[py - 1][px]) + dxx = 0.25 * ( + heatmap[py][px + 2] - 2 * heatmap[py][px] + heatmap[py][px - 2]) + dxy = 0.25 * ( + heatmap[py + 1][px + 1] - heatmap[py - 1][px + 1] - + heatmap[py + 1][px - 1] + heatmap[py - 1][px - 1]) + dyy = 0.25 * ( + heatmap[py + 2 * 1][px] - 2 * heatmap[py][px] + + heatmap[py - 2 * 1][px]) + derivative = np.array([[dx], [dy]]) + hessian = np.array([[dxx, dxy], [dxy, dyy]]) + if dxx * dyy - dxy**2 != 0: + hessianinv = np.linalg.inv(hessian) + offset = -hessianinv @ derivative + offset = np.squeeze(np.array(offset.T), axis=0) + coord += offset + return coord + + +def post_dark_udp(coords, batch_heatmaps, kernel=3): + """DARK post-pocessing. Implemented by udp. Paper ref: Huang et al. The + Devil is in the Details: Delving into Unbiased Data Processing for Human + Pose Estimation (CVPR 2020). Zhang et al. Distribution-Aware Coordinate + Representation for Human Pose Estimation (CVPR 2020). + + Note: + - batch size: B + - num keypoints: K + - num persons: N + - height of heatmaps: H + - width of heatmaps: W + + B=1 for bottom_up paradigm where all persons share the same heatmap. + B=N for top_down paradigm where each person has its own heatmaps. + + Args: + coords (np.ndarray[N, K, 2]): Initial coordinates of human pose. + batch_heatmaps (np.ndarray[B, K, H, W]): batch_heatmaps + kernel (int): Gaussian kernel size (K) for modulation. + + Returns: + np.ndarray([N, K, 2]): Refined coordinates. 
+ """ + if not isinstance(batch_heatmaps, np.ndarray): + batch_heatmaps = batch_heatmaps.cpu().numpy() + B, K, H, W = batch_heatmaps.shape + N = coords.shape[0] + assert (B == 1 or B == N) + for heatmaps in batch_heatmaps: + for heatmap in heatmaps: + cv2.GaussianBlur(heatmap, (kernel, kernel), 0, heatmap) + np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps) + np.log(batch_heatmaps, batch_heatmaps) + + batch_heatmaps_pad = np.pad( + batch_heatmaps, ((0, 0), (0, 0), (1, 1), (1, 1)), + mode='edge').flatten() + + index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (W + 2) + index += (W + 2) * (H + 2) * np.arange(0, B * K).reshape(-1, K) + index = index.astype(int).reshape(-1, 1) + i_ = batch_heatmaps_pad[index] + ix1 = batch_heatmaps_pad[index + 1] + iy1 = batch_heatmaps_pad[index + W + 2] + ix1y1 = batch_heatmaps_pad[index + W + 3] + ix1_y1_ = batch_heatmaps_pad[index - W - 3] + ix1_ = batch_heatmaps_pad[index - 1] + iy1_ = batch_heatmaps_pad[index - 2 - W] + + dx = 0.5 * (ix1 - ix1_) + dy = 0.5 * (iy1 - iy1_) + derivative = np.concatenate([dx, dy], axis=1) + derivative = derivative.reshape(N, K, 2, 1) + dxx = ix1 - 2 * i_ + ix1_ + dyy = iy1 - 2 * i_ + iy1_ + dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_) + hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1) + hessian = hessian.reshape(N, K, 2, 2) + hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2)) + coords -= np.einsum('ijmn,ijnk->ijmk', hessian, derivative).squeeze() + return coords + + +def _gaussian_blur(heatmaps, kernel=11): + """Modulate heatmap distribution with Gaussian. + sigma = 0.3*((kernel_size-1)*0.5-1)+0.8 + sigma~=3 if k=17 + sigma=2 if k=11; + sigma~=1.5 if k=7; + sigma~=1 if k=3; + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. + kernel (int): Gaussian kernel size (K) for modulation, which should + match the heatmap gaussian sigma when training. + K=17 for sigma=3 and k=11 for sigma=2. + + Returns: + np.ndarray ([N, K, H, W]): Modulated heatmap distribution. + """ + assert kernel % 2 == 1 + + border = (kernel - 1) // 2 + batch_size = heatmaps.shape[0] + num_joints = heatmaps.shape[1] + height = heatmaps.shape[2] + width = heatmaps.shape[3] + for i in range(batch_size): + for j in range(num_joints): + origin_max = np.max(heatmaps[i, j]) + dr = np.zeros((height + 2 * border, width + 2 * border), + dtype=np.float32) + dr[border:-border, border:-border] = heatmaps[i, j].copy() + dr = cv2.GaussianBlur(dr, (kernel, kernel), 0) + heatmaps[i, j] = dr[border:-border, border:-border].copy() + heatmaps[i, j] *= origin_max / np.max(heatmaps[i, j]) + return heatmaps + + +def keypoints_from_regression(regression_preds, center, scale, img_size): + """Get final keypoint predictions from regression vectors and transform + them back to the image. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + regression_preds (np.ndarray[N, K, 2]): model prediction. + center (np.ndarray[N, 2]): Center of the bounding box (x, y). + scale (np.ndarray[N, 2]): Scale of the bounding box + wrt height/width. + img_size (list(img_width, img_height)): model input image size. + + Returns: + tuple: + + - preds (np.ndarray[N, K, 2]): Predicted keypoint location in images. + - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. 
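A brief sketch of the DARK/UDP refinement above: take the coarse argmax locations from _get_max_preds and refine them to sub-pixel accuracy. Since post_dark_udp modulates the heatmaps (and the coordinates) in place, copies are passed here to keep the raw arrays intact:

import numpy as np

heatmaps = np.random.rand(2, 17, 64, 48).astype(np.float32)   # [N, K, H, W]
coords, maxvals = _get_max_preds(heatmaps)                     # coarse argmax, [N, K, 2]
refined = post_dark_udp(coords.copy(), heatmaps.copy(), kernel=3)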
+ """ + N, K, _ = regression_preds.shape + preds, maxvals = regression_preds, np.ones((N, K, 1), dtype=np.float32) + + preds = preds * img_size + + # Transform back to the image + for i in range(N): + preds[i] = transform_preds(preds[i], center[i], scale[i], img_size) + + return preds, maxvals + + +def keypoints_from_heatmaps(heatmaps, + center, + scale, + unbiased=False, + post_process='default', + kernel=11, + valid_radius_factor=0.0546875, + use_udp=False, + target_type='GaussianHeatmap'): + """Get final keypoint predictions from heatmaps and transform them back to + the image. + + Note: + - batch size: N + - num keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. + center (np.ndarray[N, 2]): Center of the bounding box (x, y). + scale (np.ndarray[N, 2]): Scale of the bounding box + wrt height/width. + post_process (str/None): Choice of methods to post-process + heatmaps. Currently supported: None, 'default', 'unbiased', + 'megvii'. + unbiased (bool): Option to use unbiased decoding. Mutually + exclusive with megvii. + Note: this arg is deprecated and unbiased=True can be replaced + by post_process='unbiased' + Paper ref: Zhang et al. Distribution-Aware Coordinate + Representation for Human Pose Estimation (CVPR 2020). + kernel (int): Gaussian kernel size (K) for modulation, which should + match the heatmap gaussian sigma when training. + K=17 for sigma=3 and k=11 for sigma=2. + valid_radius_factor (float): The radius factor of the positive area + in classification heatmap for UDP. + use_udp (bool): Use unbiased data processing. + target_type (str): 'GaussianHeatmap' or 'CombinedTarget'. + GaussianHeatmap: Classification target with gaussian distribution. + CombinedTarget: The combination of classification target + (response map) and regression target (offset map). + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + + Returns: + tuple: A tuple containing keypoint predictions and scores. + + - preds (np.ndarray[N, K, 2]): Predicted keypoint location in images. + - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. 
+ """ + # Avoid being affected + heatmaps = heatmaps.copy() + + # detect conflicts + if unbiased: + assert post_process not in [False, None, 'megvii'] + if post_process in ['megvii', 'unbiased']: + assert kernel > 0 + if use_udp: + assert not post_process == 'megvii' + + # normalize configs + if post_process is False: + warnings.warn( + 'post_process=False is deprecated, ' + 'please use post_process=None instead', DeprecationWarning) + post_process = None + elif post_process is True: + if unbiased is True: + warnings.warn( + 'post_process=True, unbiased=True is deprecated,' + " please use post_process='unbiased' instead", + DeprecationWarning) + post_process = 'unbiased' + else: + warnings.warn( + 'post_process=True, unbiased=False is deprecated, ' + "please use post_process='default' instead", + DeprecationWarning) + post_process = 'default' + elif post_process == 'default': + if unbiased is True: + warnings.warn( + 'unbiased=True is deprecated, please use ' + "post_process='unbiased' instead", DeprecationWarning) + post_process = 'unbiased' + + # start processing + if post_process == 'megvii': + heatmaps = _gaussian_blur(heatmaps, kernel=kernel) + + N, K, H, W = heatmaps.shape + if use_udp: + if target_type.lower() == 'GaussianHeatMap'.lower(): + preds, maxvals = _get_max_preds(heatmaps) + preds = post_dark_udp(preds, heatmaps, kernel=kernel) + elif target_type.lower() == 'CombinedTarget'.lower(): + for person_heatmaps in heatmaps: + for i, heatmap in enumerate(person_heatmaps): + kt = 2 * kernel + 1 if i % 3 == 0 else kernel + cv2.GaussianBlur(heatmap, (kt, kt), 0, heatmap) + # valid radius is in direct proportion to the height of heatmap. + valid_radius = valid_radius_factor * H + offset_x = heatmaps[:, 1::3, :].flatten() * valid_radius + offset_y = heatmaps[:, 2::3, :].flatten() * valid_radius + heatmaps = heatmaps[:, ::3, :] + preds, maxvals = _get_max_preds(heatmaps) + index = preds[..., 0] + preds[..., 1] * W + index += W * H * np.arange(0, N * K / 3) + index = index.astype(int).reshape(N, K // 3, 1) + preds += np.concatenate((offset_x[index], offset_y[index]), axis=2) + else: + raise ValueError('target_type should be either ' + "'GaussianHeatmap' or 'CombinedTarget'") + else: + preds, maxvals = _get_max_preds(heatmaps) + if post_process == 'unbiased': # alleviate biased coordinate + # apply Gaussian distribution modulation. + heatmaps = np.log( + np.maximum(_gaussian_blur(heatmaps, kernel), 1e-10)) + for n in range(N): + for k in range(K): + preds[n][k] = _taylor(heatmaps[n][k], preds[n][k]) + elif post_process is not None: + # add +/-0.25 shift to the predicted locations for higher acc. + for n in range(N): + for k in range(K): + heatmap = heatmaps[n][k] + px = int(preds[n][k][0]) + py = int(preds[n][k][1]) + if 1 < px < W - 1 and 1 < py < H - 1: + diff = np.array([ + heatmap[py][px + 1] - heatmap[py][px - 1], + heatmap[py + 1][px] - heatmap[py - 1][px] + ]) + preds[n][k] += np.sign(diff) * .25 + if post_process == 'megvii': + preds[n][k] += 0.5 + + # Transform back to the image + for i in range(N): + preds[i] = transform_preds( + preds[i], center[i], scale[i], [W, H], use_udp=use_udp) + + if post_process == 'megvii': + maxvals = maxvals / 255.0 + 0.5 + + return preds, maxvals + + +def keypoints_from_heatmaps3d(heatmaps, center, scale): + """Get final keypoint predictions from 3d heatmaps and transform them back + to the image. 
+ + Note: + - batch size: N + - num keypoints: K + - heatmap depth size: D + - heatmap height: H + - heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, D, H, W]): model predicted heatmaps. + center (np.ndarray[N, 2]): Center of the bounding box (x, y). + scale (np.ndarray[N, 2]): Scale of the bounding box + wrt height/width. + + Returns: + tuple: A tuple containing keypoint predictions and scores. + + - preds (np.ndarray[N, K, 3]): Predicted 3d keypoint location \ + in images. + - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. + """ + N, K, D, H, W = heatmaps.shape + preds, maxvals = _get_max_preds_3d(heatmaps) + # Transform back to the image + for i in range(N): + preds[i, :, :2] = transform_preds(preds[i, :, :2], center[i], scale[i], + [W, H]) + return preds, maxvals + + +def multilabel_classification_accuracy(pred, gt, mask, thr=0.5): + """Get multi-label classification accuracy. + + Note: + - batch size: N + - label number: L + + Args: + pred (np.ndarray[N, L, 2]): model predicted labels. + gt (np.ndarray[N, L, 2]): ground-truth labels. + mask (np.ndarray[N, 1] or np.ndarray[N, L] ): reliability of + ground-truth labels. + + Returns: + float: multi-label classification accuracy. + """ + # we only compute accuracy on the samples with ground-truth of all labels. + valid = (mask > 0).min(axis=1) if mask.ndim == 2 else (mask > 0) + pred, gt = pred[valid], gt[valid] + + if pred.shape[0] == 0: + acc = 0.0 # when no sample is with gt labels, set acc to 0. + else: + # The classification of a sample is regarded as correct + # only if it's correct for all labels. + acc = (((pred - thr) * (gt - thr)) > 0).all(axis=1).mean() + return acc diff --git a/mmpose/core/fp16/__init__.py b/mmpose/core/fp16/__init__.py new file mode 100644 index 0000000..5cb0548 --- /dev/null +++ b/mmpose/core/fp16/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .decorators import auto_fp16, force_fp32 +from .hooks import Fp16OptimizerHook, wrap_fp16_model +from .utils import cast_tensor_type + +__all__ = [ + 'auto_fp16', 'force_fp32', 'Fp16OptimizerHook', 'wrap_fp16_model', + 'cast_tensor_type' +] diff --git a/mmpose/core/fp16/decorators.py b/mmpose/core/fp16/decorators.py new file mode 100644 index 0000000..2d70ddf --- /dev/null +++ b/mmpose/core/fp16/decorators.py @@ -0,0 +1,175 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import functools +import warnings +from inspect import getfullargspec + +import torch + +from .utils import cast_tensor_type + + +def auto_fp16(apply_to=None, out_fp32=False): + """Decorator to enable fp16 training automatically. + + This decorator is useful when you write custom modules and want to support + mixed precision training. If inputs arguments are fp32 tensors, they will + be converted to fp16 automatically. Arguments other than fp32 tensors are + ignored. + + Args: + apply_to (Iterable, optional): The argument names to be converted. + `None` indicates all arguments. + out_fp32 (bool): Whether to convert the output back to fp32. + + Example: + + >>> import torch.nn as nn + >>> class MyModule1(nn.Module): + >>> + >>> # Convert x and y to fp16 + >>> @auto_fp16() + >>> def forward(self, x, y): + >>> pass + + >>> import torch.nn as nn + >>> class MyModule2(nn.Module): + >>> + >>> # convert pred to fp16 + >>> @auto_fp16(apply_to=('pred', )) + >>> def do_something(self, pred, others): + >>> pass + """ + + warnings.warn( + 'auto_fp16 in mmpose will be deprecated in the next release.' 
+ 'Please use mmcv.runner.auto_fp16 instead (mmcv>=1.3.1).', + DeprecationWarning) + + def auto_fp16_wrapper(old_func): + + @functools.wraps(old_func) + def new_func(*args, **kwargs): + # check if the module has set the attribute `fp16_enabled`, if not, + # just fallback to the original method. + if not isinstance(args[0], torch.nn.Module): + raise TypeError('@auto_fp16 can only be used to decorate the ' + 'method of nn.Module') + if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): + return old_func(*args, **kwargs) + # get the arg spec of the decorated method + args_info = getfullargspec(old_func) + # get the argument names to be casted + args_to_cast = args_info.args if apply_to is None else apply_to + # convert the args that need to be processed + new_args = [] + # NOTE: default args are not taken into consideration + if args: + arg_names = args_info.args[:len(args)] + for i, arg_name in enumerate(arg_names): + if arg_name in args_to_cast: + new_args.append( + cast_tensor_type(args[i], torch.float, torch.half)) + else: + new_args.append(args[i]) + # convert the kwargs that need to be processed + new_kwargs = {} + if kwargs: + for arg_name, arg_value in kwargs.items(): + if arg_name in args_to_cast: + new_kwargs[arg_name] = cast_tensor_type( + arg_value, torch.float, torch.half) + else: + new_kwargs[arg_name] = arg_value + # apply converted arguments to the decorated method + output = old_func(*new_args, **new_kwargs) + # cast the results back to fp32 if necessary + if out_fp32: + output = cast_tensor_type(output, torch.half, torch.float) + return output + + return new_func + + return auto_fp16_wrapper + + +def force_fp32(apply_to=None, out_fp16=False): + """Decorator to convert input arguments to fp32 in force. + + This decorator is useful when you write custom modules and want to support + mixed precision training. If there are some inputs that must be processed + in fp32 mode, then this decorator can handle it. If inputs arguments are + fp16 tensors, they will be converted to fp32 automatically. Arguments other + than fp16 tensors are ignored. + + Args: + apply_to (Iterable, optional): The argument names to be converted. + `None` indicates all arguments. + out_fp16 (bool): Whether to convert the output back to fp16. + + Example: + + >>> import torch.nn as nn + >>> class MyModule1(nn.Module): + >>> + >>> # Convert x and y to fp32 + >>> @force_fp32() + >>> def loss(self, x, y): + >>> pass + + >>> import torch.nn as nn + >>> class MyModule2(nn.Module): + >>> + >>> # convert pred to fp32 + >>> @force_fp32(apply_to=('pred', )) + >>> def post_process(self, pred, others): + >>> pass + """ + warnings.warn( + 'force_fp32 in mmpose will be deprecated in the next release.' + 'Please use mmcv.runner.force_fp32 instead (mmcv>=1.3.1).', + DeprecationWarning) + + def force_fp32_wrapper(old_func): + + @functools.wraps(old_func) + def new_func(*args, **kwargs): + # check if the module has set the attribute `fp16_enabled`, if not, + # just fallback to the original method. 
+ if not isinstance(args[0], torch.nn.Module): + raise TypeError('@force_fp32 can only be used to decorate the ' + 'method of nn.Module') + if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): + return old_func(*args, **kwargs) + # get the arg spec of the decorated method + args_info = getfullargspec(old_func) + # get the argument names to be casted + args_to_cast = args_info.args if apply_to is None else apply_to + # convert the args that need to be processed + new_args = [] + if args: + arg_names = args_info.args[:len(args)] + for i, arg_name in enumerate(arg_names): + if arg_name in args_to_cast: + new_args.append( + cast_tensor_type(args[i], torch.half, torch.float)) + else: + new_args.append(args[i]) + # convert the kwargs that need to be processed + new_kwargs = dict() + if kwargs: + for arg_name, arg_value in kwargs.items(): + if arg_name in args_to_cast: + new_kwargs[arg_name] = cast_tensor_type( + arg_value, torch.half, torch.float) + else: + new_kwargs[arg_name] = arg_value + # apply converted arguments to the decorated method + output = old_func(*new_args, **new_kwargs) + # cast the results back to fp32 if necessary + if out_fp16: + output = cast_tensor_type(output, torch.float, torch.half) + return output + + return new_func + + return force_fp32_wrapper diff --git a/mmpose/core/fp16/hooks.py b/mmpose/core/fp16/hooks.py new file mode 100644 index 0000000..74081a9 --- /dev/null +++ b/mmpose/core/fp16/hooks.py @@ -0,0 +1,167 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch +import torch.nn as nn +from mmcv.runner import OptimizerHook +from mmcv.utils import _BatchNorm + +from ..utils.dist_utils import allreduce_grads +from .utils import cast_tensor_type + + +class Fp16OptimizerHook(OptimizerHook): + """FP16 optimizer hook. + + The steps of fp16 optimizer is as follows. + 1. Scale the loss value. + 2. BP in the fp16 model. + 2. Copy gradients from fp16 model to fp32 weights. + 3. Update fp32 weights. + 4. Copy updated parameters from fp32 weights to fp16 model. + + Refer to https://arxiv.org/abs/1710.03740 for more details. + + Args: + loss_scale (float): Scale factor multiplied with loss. + """ + + def __init__(self, + grad_clip=None, + coalesce=True, + bucket_size_mb=-1, + loss_scale=512., + distributed=True): + self.grad_clip = grad_clip + self.coalesce = coalesce + self.bucket_size_mb = bucket_size_mb + self.loss_scale = loss_scale + self.distributed = distributed + + def before_run(self, runner): + """Preparing steps before Mixed Precision Training. + + 1. Make a master copy of fp32 weights for optimization. + 2. Convert the main model from fp32 to fp16. + + Args: + runner (:obj:`mmcv.Runner`): The underlines training runner. 
+ """ + # keep a copy of fp32 weights + runner.optimizer.param_groups = copy.deepcopy( + runner.optimizer.param_groups) + # convert model to fp16 + wrap_fp16_model(runner.model) + + @staticmethod + def copy_grads_to_fp32(fp16_net, fp32_weights): + """Copy gradients from fp16 model to fp32 weight copy.""" + for fp32_param, fp16_param in zip(fp32_weights, fp16_net.parameters()): + if fp16_param.grad is not None: + if fp32_param.grad is None: + fp32_param.grad = fp32_param.data.new(fp32_param.size()) + fp32_param.grad.copy_(fp16_param.grad) + + @staticmethod + def copy_params_to_fp16(fp16_net, fp32_weights): + """Copy updated params from fp32 weight copy to fp16 model.""" + for fp16_param, fp32_param in zip(fp16_net.parameters(), fp32_weights): + fp16_param.data.copy_(fp32_param.data) + + def after_train_iter(self, runner): + """Backward optimization steps for Mixed Precision Training. + + 1. Scale the loss by a scale factor. + 2. Backward the loss to obtain the gradients (fp16). + 3. Copy gradients from the model to the fp32 weight copy. + 4. Scale the gradients back and update the fp32 weight copy. + 5. Copy back the params from fp32 weight copy to the fp16 model. + + Args: + runner (:obj:`mmcv.Runner`): The underlines training runner. + """ + # clear grads of last iteration + runner.model.zero_grad() + runner.optimizer.zero_grad() + # scale the loss value + scaled_loss = runner.outputs['loss'] * self.loss_scale + scaled_loss.backward() + # copy fp16 grads in the model to fp32 params in the optimizer + fp32_weights = [] + for param_group in runner.optimizer.param_groups: + fp32_weights += param_group['params'] + self.copy_grads_to_fp32(runner.model, fp32_weights) + # allreduce grads + if self.distributed: + allreduce_grads(fp32_weights, self.coalesce, self.bucket_size_mb) + # scale the gradients back + for param in fp32_weights: + if param.grad is not None: + param.grad.div_(self.loss_scale) + if self.grad_clip is not None: + self.clip_grads(fp32_weights) + # update fp32 params + runner.optimizer.step() + # copy fp32 params to the fp16 model + self.copy_params_to_fp16(runner.model, fp32_weights) + + +def wrap_fp16_model(model): + """Wrap the FP32 model to FP16. + + 1. Convert FP32 model to FP16. + 2. Remain some necessary layers to be FP32, e.g., normalization layers. + + Args: + model (nn.Module): Model in FP32. + """ + # convert model to fp16 + model.half() + # patch the normalization layers to make it work in fp32 mode + patch_norm_fp32(model) + # set `fp16_enabled` flag + for m in model.modules(): + if hasattr(m, 'fp16_enabled'): + m.fp16_enabled = True + + +def patch_norm_fp32(module): + """Recursively convert normalization layers from FP16 to FP32. + + Args: + module (nn.Module): The modules to be converted in FP16. + + Returns: + nn.Module: The converted module, the normalization layers have been + converted to FP32. + """ + if isinstance(module, (_BatchNorm, nn.GroupNorm)): + module.float() + module.forward = patch_forward_method(module.forward, torch.half, + torch.float) + for child in module.children(): + patch_norm_fp32(child) + return module + + +def patch_forward_method(func, src_type, dst_type, convert_output=True): + """Patch the forward method of a module. + + Args: + func (callable): The original forward method. + src_type (torch.dtype): Type of input arguments to be converted from. + dst_type (torch.dtype): Type of input arguments to be converted to. + convert_output (bool): Whether to convert the output back to src_type. 
+ + Returns: + callable: The patched forward method. + """ + + def new_forward(*args, **kwargs): + output = func(*cast_tensor_type(args, src_type, dst_type), + **cast_tensor_type(kwargs, src_type, dst_type)) + if convert_output: + output = cast_tensor_type(output, dst_type, src_type) + return output + + return new_forward diff --git a/mmpose/core/fp16/utils.py b/mmpose/core/fp16/utils.py new file mode 100644 index 0000000..f1ec3d3 --- /dev/null +++ b/mmpose/core/fp16/utils.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import abc + +import numpy as np +import torch + + +def cast_tensor_type(inputs, src_type, dst_type): + """Recursively convert Tensor in inputs from src_type to dst_type. + + Args: + inputs: Inputs that to be casted. + src_type (torch.dtype): Source type. + dst_type (torch.dtype): Destination type. + + Returns: + The same type with inputs, but all contained Tensors have been cast. + """ + if isinstance(inputs, torch.Tensor): + return inputs.to(dst_type) + elif isinstance(inputs, str): + return inputs + elif isinstance(inputs, np.ndarray): + return inputs + elif isinstance(inputs, abc.Mapping): + return type(inputs)({ + k: cast_tensor_type(v, src_type, dst_type) + for k, v in inputs.items() + }) + elif isinstance(inputs, abc.Iterable): + return type(inputs)( + cast_tensor_type(item, src_type, dst_type) for item in inputs) + + return inputs diff --git a/mmpose/core/optimizer/__init__.py b/mmpose/core/optimizer/__init__.py new file mode 100644 index 0000000..4340ffc --- /dev/null +++ b/mmpose/core/optimizer/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .builder import OPTIMIZERS, build_optimizers + +__all__ = ['build_optimizers', 'OPTIMIZERS'] diff --git a/mmpose/core/optimizer/builder.py b/mmpose/core/optimizer/builder.py new file mode 100644 index 0000000..7d6accd --- /dev/null +++ b/mmpose/core/optimizer/builder.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.runner import build_optimizer +from mmcv.utils import Registry + +OPTIMIZERS = Registry('optimizers') + + +def build_optimizers(model, cfgs): + """Build multiple optimizers from configs. + + If `cfgs` contains several dicts for optimizers, then a dict for each + constructed optimizers will be returned. + If `cfgs` only contains one optimizer config, the constructed optimizer + itself will be returned. + + For example, + + 1) Multiple optimizer configs: + + .. code-block:: python + + optimizer_cfg = dict( + model1=dict(type='SGD', lr=lr), + model2=dict(type='SGD', lr=lr)) + + The return dict is + ``dict('model1': torch.optim.Optimizer, 'model2': torch.optim.Optimizer)`` + + 2) Single optimizer config: + + .. code-block:: python + + optimizer_cfg = dict(type='SGD', lr=lr) + + The return is ``torch.optim.Optimizer``. + + Args: + model (:obj:`nn.Module`): The model with parameters to be optimized. + cfgs (dict): The config dict of the optimizer. + + Returns: + dict[:obj:`torch.optim.Optimizer`] | :obj:`torch.optim.Optimizer`: + The initialized optimizers. 
+ """ + optimizers = {} + if hasattr(model, 'module'): + model = model.module + # determine whether 'cfgs' has several dicts for optimizers + if all(isinstance(v, dict) for v in cfgs.values()): + for key, cfg in cfgs.items(): + cfg_ = cfg.copy() + module = getattr(model, key) + optimizers[key] = build_optimizer(module, cfg_) + return optimizers + + return build_optimizer(model, cfgs) diff --git a/mmpose/core/post_processing/__init__.py b/mmpose/core/post_processing/__init__.py new file mode 100644 index 0000000..1ee6858 --- /dev/null +++ b/mmpose/core/post_processing/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .nms import oks_iou, oks_nms, soft_oks_nms +from .one_euro_filter import OneEuroFilter +from .post_transforms import (affine_transform, flip_back, fliplr_joints, + fliplr_regression, get_affine_transform, + get_warp_matrix, rotate_point, transform_preds, + warp_affine_joints) + +__all__ = [ + 'oks_nms', 'soft_oks_nms', 'affine_transform', 'rotate_point', 'flip_back', + 'fliplr_joints', 'fliplr_regression', 'transform_preds', + 'get_affine_transform', 'get_warp_matrix', 'warp_affine_joints', + 'OneEuroFilter', 'oks_iou' +] diff --git a/mmpose/core/post_processing/group.py b/mmpose/core/post_processing/group.py new file mode 100644 index 0000000..6235dbc --- /dev/null +++ b/mmpose/core/post_processing/group.py @@ -0,0 +1,410 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/princeton-vl/pose-ae-train/ +# Original licence: Copyright (c) 2017, umich-vl, under BSD 3-Clause License. +# ------------------------------------------------------------------------------ + +import numpy as np +import torch +from munkres import Munkres + +from mmpose.core.evaluation import post_dark_udp + + +def _py_max_match(scores): + """Apply munkres algorithm to get the best match. + + Args: + scores(np.ndarray): cost matrix. + + Returns: + np.ndarray: best match. + """ + m = Munkres() + tmp = m.compute(scores) + tmp = np.array(tmp).astype(int) + return tmp + + +def _match_by_tag(inp, params): + """Match joints by tags. Use Munkres algorithm to calculate the best match + for keypoints grouping. + + Note: + number of keypoints: K + max number of people in an image: M (M=30 by default) + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + inp(tuple): + tag_k (np.ndarray[KxMxL]): tag corresponding to the + top k values of feature map per keypoint. + loc_k (np.ndarray[KxMx2]): top k locations of the + feature maps for keypoint. + val_k (np.ndarray[KxM]): top k value of the + feature maps per keypoint. + params(Params): class Params(). + + Returns: + np.ndarray: result of pose groups. 
+ """ + assert isinstance(params, _Params), 'params should be class _Params()' + + tag_k, loc_k, val_k = inp + + default_ = np.zeros((params.num_joints, 3 + tag_k.shape[2]), + dtype=np.float32) + + joint_dict = {} + tag_dict = {} + for i in range(params.num_joints): + idx = params.joint_order[i] + + tags = tag_k[idx] + joints = np.concatenate((loc_k[idx], val_k[idx, :, None], tags), 1) + mask = joints[:, 2] > params.detection_threshold + tags = tags[mask] + joints = joints[mask] + + if joints.shape[0] == 0: + continue + + if i == 0 or len(joint_dict) == 0: + for tag, joint in zip(tags, joints): + key = tag[0] + joint_dict.setdefault(key, np.copy(default_))[idx] = joint + tag_dict[key] = [tag] + else: + grouped_keys = list(joint_dict.keys())[:params.max_num_people] + grouped_tags = [np.mean(tag_dict[i], axis=0) for i in grouped_keys] + + if (params.ignore_too_much + and len(grouped_keys) == params.max_num_people): + continue + + diff = joints[:, None, 3:] - np.array(grouped_tags)[None, :, :] + diff_normed = np.linalg.norm(diff, ord=2, axis=2) + diff_saved = np.copy(diff_normed) + + if params.use_detection_val: + diff_normed = np.round(diff_normed) * 100 - joints[:, 2:3] + + num_added = diff.shape[0] + num_grouped = diff.shape[1] + + if num_added > num_grouped: + diff_normed = np.concatenate( + (diff_normed, + np.zeros((num_added, num_added - num_grouped), + dtype=np.float32) + 1e10), + axis=1) + + pairs = _py_max_match(diff_normed) + for row, col in pairs: + if (row < num_added and col < num_grouped + and diff_saved[row][col] < params.tag_threshold): + key = grouped_keys[col] + joint_dict[key][idx] = joints[row] + tag_dict[key].append(tags[row]) + else: + key = tags[row][0] + joint_dict.setdefault(key, np.copy(default_))[idx] = \ + joints[row] + tag_dict[key] = [tags[row]] + + results = np.array([joint_dict[i] for i in joint_dict]).astype(np.float32) + return results + + +class _Params: + """A class of parameter. + + Args: + cfg(Config): config. + """ + + def __init__(self, cfg): + self.num_joints = cfg['num_joints'] + self.max_num_people = cfg['max_num_people'] + + self.detection_threshold = cfg['detection_threshold'] + self.tag_threshold = cfg['tag_threshold'] + self.use_detection_val = cfg['use_detection_val'] + self.ignore_too_much = cfg['ignore_too_much'] + + if self.num_joints == 17: + self.joint_order = [ + i - 1 for i in + [1, 2, 3, 4, 5, 6, 7, 12, 13, 8, 9, 10, 11, 14, 15, 16, 17] + ] + else: + self.joint_order = list(np.arange(self.num_joints)) + + +class HeatmapParser: + """The heatmap parser for post processing.""" + + def __init__(self, cfg): + self.params = _Params(cfg) + self.tag_per_joint = cfg['tag_per_joint'] + self.pool = torch.nn.MaxPool2d(cfg['nms_kernel'], 1, + cfg['nms_padding']) + self.use_udp = cfg.get('use_udp', False) + self.score_per_joint = cfg.get('score_per_joint', False) + + def nms(self, heatmaps): + """Non-Maximum Suppression for heatmaps. + + Args: + heatmap(torch.Tensor): Heatmaps before nms. + + Returns: + torch.Tensor: Heatmaps after nms. + """ + + maxm = self.pool(heatmaps) + maxm = torch.eq(maxm, heatmaps).float() + heatmaps = heatmaps * maxm + + return heatmaps + + def match(self, tag_k, loc_k, val_k): + """Group keypoints to human poses in a batch. + + Args: + tag_k (np.ndarray[NxKxMxL]): tag corresponding to the + top k values of feature map per keypoint. + loc_k (np.ndarray[NxKxMx2]): top k locations of the + feature maps for keypoint. + val_k (np.ndarray[NxKxM]): top k value of the + feature maps per keypoint. 
+ + Returns: + list + """ + + def _match(x): + return _match_by_tag(x, self.params) + + return list(map(_match, zip(tag_k, loc_k, val_k))) + + def top_k(self, heatmaps, tags): + """Find top_k values in an image. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + max number of people: M + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + heatmaps (torch.Tensor[NxKxHxW]) + tags (torch.Tensor[NxKxHxWxL]) + + Returns: + dict: A dict containing top_k values. + + - tag_k (np.ndarray[NxKxMxL]): + tag corresponding to the top k values of + feature map per keypoint. + - loc_k (np.ndarray[NxKxMx2]): + top k location of feature map per keypoint. + - val_k (np.ndarray[NxKxM]): + top k value of feature map per keypoint. + """ + heatmaps = self.nms(heatmaps) + N, K, H, W = heatmaps.size() + heatmaps = heatmaps.view(N, K, -1) + val_k, ind = heatmaps.topk(self.params.max_num_people, dim=2) + + tags = tags.view(tags.size(0), tags.size(1), W * H, -1) + if not self.tag_per_joint: + tags = tags.expand(-1, self.params.num_joints, -1, -1) + + tag_k = torch.stack( + [torch.gather(tags[..., i], 2, ind) for i in range(tags.size(3))], + dim=3) + + x = ind % W + y = ind // W + + ind_k = torch.stack((x, y), dim=3) + + results = { + 'tag_k': tag_k.cpu().numpy(), + 'loc_k': ind_k.cpu().numpy(), + 'val_k': val_k.cpu().numpy() + } + + return results + + @staticmethod + def adjust(results, heatmaps): + """Adjust the coordinates for better accuracy. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + + Args: + results (list(np.ndarray)): Keypoint predictions. + heatmaps (torch.Tensor[NxKxHxW]): Heatmaps. + """ + _, _, H, W = heatmaps.shape + for batch_id, people in enumerate(results): + for people_id, people_i in enumerate(people): + for joint_id, joint in enumerate(people_i): + if joint[2] > 0: + x, y = joint[0:2] + xx, yy = int(x), int(y) + tmp = heatmaps[batch_id][joint_id] + if tmp[min(H - 1, yy + 1), xx] > tmp[max(0, yy - 1), + xx]: + y += 0.25 + else: + y -= 0.25 + + if tmp[yy, min(W - 1, xx + 1)] > tmp[yy, + max(0, xx - 1)]: + x += 0.25 + else: + x -= 0.25 + results[batch_id][people_id, joint_id, + 0:2] = (x + 0.5, y + 0.5) + return results + + @staticmethod + def refine(heatmap, tag, keypoints, use_udp=False): + """Given initial keypoint predictions, we identify missing joints. + + Note: + number of keypoints: K + heatmap height: H + heatmap width: W + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + heatmap: np.ndarray(K, H, W). + tag: np.ndarray(K, H, W) | np.ndarray(K, H, W, L) + keypoints: np.ndarray of size (K, 3 + L) + last dim is (x, y, score, tag). + use_udp: bool-unbiased data processing + + Returns: + np.ndarray: The refined keypoints. 
+ """ + + K, H, W = heatmap.shape + if len(tag.shape) == 3: + tag = tag[..., None] + + tags = [] + for i in range(K): + if keypoints[i, 2] > 0: + # save tag value of detected keypoint + x, y = keypoints[i][:2].astype(int) + x = np.clip(x, 0, W - 1) + y = np.clip(y, 0, H - 1) + tags.append(tag[i, y, x]) + + # mean tag of current detected people + prev_tag = np.mean(tags, axis=0) + results = [] + + for _heatmap, _tag in zip(heatmap, tag): + # distance of all tag values with mean tag of + # current detected people + distance_tag = (((_tag - + prev_tag[None, None, :])**2).sum(axis=2)**0.5) + norm_heatmap = _heatmap - np.round(distance_tag) + + # find maximum position + y, x = np.unravel_index(np.argmax(norm_heatmap), _heatmap.shape) + xx = x.copy() + yy = y.copy() + # detection score at maximum position + val = _heatmap[y, x] + if not use_udp: + # offset by 0.5 + x += 0.5 + y += 0.5 + + # add a quarter offset + if _heatmap[yy, min(W - 1, xx + 1)] > _heatmap[yy, max(0, xx - 1)]: + x += 0.25 + else: + x -= 0.25 + + if _heatmap[min(H - 1, yy + 1), xx] > _heatmap[max(0, yy - 1), xx]: + y += 0.25 + else: + y -= 0.25 + + results.append((x, y, val)) + results = np.array(results) + + if results is not None: + for i in range(K): + # add keypoint if it is not detected + if results[i, 2] > 0 and keypoints[i, 2] == 0: + keypoints[i, :3] = results[i, :3] + + return keypoints + + def parse(self, heatmaps, tags, adjust=True, refine=True): + """Group keypoints into poses given heatmap and tag. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + heatmaps (torch.Tensor[NxKxHxW]): model output heatmaps. + tags (torch.Tensor[NxKxHxWxL]): model output tagmaps. + + Returns: + tuple: A tuple containing keypoint grouping results. + + - results (list(np.ndarray)): Pose results. + - scores (list/list(np.ndarray)): Score of people. + """ + results = self.match(**self.top_k(heatmaps, tags)) + + if adjust: + if self.use_udp: + for i in range(len(results)): + if results[i].shape[0] > 0: + results[i][..., :2] = post_dark_udp( + results[i][..., :2].copy(), heatmaps[i:i + 1, :]) + else: + results = self.adjust(results, heatmaps) + + if self.score_per_joint: + scores = [i[:, 2] for i in results[0]] + else: + scores = [i[:, 2].mean() for i in results[0]] + + if refine: + results = results[0] + # for every detected person + for i in range(len(results)): + heatmap_numpy = heatmaps[0].cpu().numpy() + tag_numpy = tags[0].cpu().numpy() + if not self.tag_per_joint: + tag_numpy = np.tile(tag_numpy, + (self.params.num_joints, 1, 1, 1)) + results[i] = self.refine( + heatmap_numpy, tag_numpy, results[i], use_udp=self.use_udp) + results = [results] + + return results, scores diff --git a/mmpose/core/post_processing/nms.py b/mmpose/core/post_processing/nms.py new file mode 100644 index 0000000..86a0ab3 --- /dev/null +++ b/mmpose/core/post_processing/nms.py @@ -0,0 +1,207 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch +# Original licence: Copyright (c) Microsoft, under the MIT License. +# ------------------------------------------------------------------------------ + +import numpy as np + + +def nms(dets, thr): + """Greedily select boxes with high confidence and overlap <= thr. + + Args: + dets: [[x1, y1, x2, y2, score]]. + thr: Retain overlap < thr. + + Returns: + list: Indexes to keep. 
+ """ + if len(dets) == 0: + return [] + + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while len(order) > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thr)[0] + order = order[inds + 1] + + return keep + + +def oks_iou(g, d, a_g, a_d, sigmas=None, vis_thr=None): + """Calculate oks ious. + + Args: + g: Ground truth keypoints. + d: Detected keypoints. + a_g: Area of the ground truth object. + a_d: Area of the detected object. + sigmas: standard deviation of keypoint labelling. + vis_thr: threshold of the keypoint visibility. + + Returns: + list: The oks ious. + """ + if sigmas is None: + sigmas = np.array([ + .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, + .87, .87, .89, .89 + ]) / 10.0 + vars = (sigmas * 2)**2 + xg = g[0::3] + yg = g[1::3] + vg = g[2::3] + ious = np.zeros(len(d), dtype=np.float32) + for n_d in range(0, len(d)): + xd = d[n_d, 0::3] + yd = d[n_d, 1::3] + vd = d[n_d, 2::3] + dx = xd - xg + dy = yd - yg + e = (dx**2 + dy**2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2 + if vis_thr is not None: + ind = list(vg > vis_thr) and list(vd > vis_thr) + e = e[ind] + ious[n_d] = np.sum(np.exp(-e)) / len(e) if len(e) != 0 else 0.0 + return ious + + +def oks_nms(kpts_db, thr, sigmas=None, vis_thr=None, score_per_joint=False): + """OKS NMS implementations. + + Args: + kpts_db: keypoints. + thr: Retain overlap < thr. + sigmas: standard deviation of keypoint labelling. + vis_thr: threshold of the keypoint visibility. + score_per_joint: the input scores (in kpts_db) are per joint scores + + Returns: + np.ndarray: indexes to keep. + """ + if len(kpts_db) == 0: + return [] + + if score_per_joint: + scores = np.array([k['score'].mean() for k in kpts_db]) + else: + scores = np.array([k['score'] for k in kpts_db]) + + kpts = np.array([k['keypoints'].flatten() for k in kpts_db]) + areas = np.array([k['area'] for k in kpts_db]) + + order = scores.argsort()[::-1] + + keep = [] + while len(order) > 0: + i = order[0] + keep.append(i) + + oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], + sigmas, vis_thr) + + inds = np.where(oks_ovr <= thr)[0] + order = order[inds + 1] + + keep = np.array(keep) + + return keep + + +def _rescore(overlap, scores, thr, type='gaussian'): + """Rescoring mechanism gaussian or linear. + + Args: + overlap: calculated ious + scores: target scores. + thr: retain oks overlap < thr. + type: 'gaussian' or 'linear' + + Returns: + np.ndarray: indexes to keep + """ + assert len(overlap) == len(scores) + assert type in ['gaussian', 'linear'] + + if type == 'linear': + inds = np.where(overlap >= thr)[0] + scores[inds] = scores[inds] * (1 - overlap[inds]) + else: + scores = scores * np.exp(-overlap**2 / thr) + + return scores + + +def soft_oks_nms(kpts_db, + thr, + max_dets=20, + sigmas=None, + vis_thr=None, + score_per_joint=False): + """Soft OKS NMS implementations. + + Args: + kpts_db + thr: retain oks overlap < thr. + max_dets: max number of detections to keep. + sigmas: Keypoint labelling uncertainty. 
+ score_per_joint: the input scores (in kpts_db) are per joint scores + + Returns: + np.ndarray: indexes to keep. + """ + if len(kpts_db) == 0: + return [] + + if score_per_joint: + scores = np.array([k['score'].mean() for k in kpts_db]) + else: + scores = np.array([k['score'] for k in kpts_db]) + + kpts = np.array([k['keypoints'].flatten() for k in kpts_db]) + areas = np.array([k['area'] for k in kpts_db]) + + order = scores.argsort()[::-1] + scores = scores[order] + + keep = np.zeros(max_dets, dtype=np.intp) + keep_cnt = 0 + while len(order) > 0 and keep_cnt < max_dets: + i = order[0] + + oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], + sigmas, vis_thr) + + order = order[1:] + scores = _rescore(oks_ovr, scores[1:], thr) + + tmp = scores.argsort()[::-1] + order = order[tmp] + scores = scores[tmp] + + keep[keep_cnt] = i + keep_cnt += 1 + + keep = keep[:keep_cnt] + + return keep diff --git a/mmpose/core/post_processing/one_euro_filter.py b/mmpose/core/post_processing/one_euro_filter.py new file mode 100644 index 0000000..01ffa5f --- /dev/null +++ b/mmpose/core/post_processing/one_euro_filter.py @@ -0,0 +1,102 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/HoBeom/OneEuroFilter-Numpy +# Original licence: Copyright (c) HoBeom Jeon, under the MIT License. +# ------------------------------------------------------------------------------ +from time import time + +import numpy as np + + +def smoothing_factor(t_e, cutoff): + r = 2 * np.pi * cutoff * t_e + return r / (r + 1) + + +def exponential_smoothing(a, x, x_prev): + return a * x + (1 - a) * x_prev + + +class OneEuroFilter: + + def __init__(self, + x0, + dx0=0.0, + min_cutoff=1.7, + beta=0.3, + d_cutoff=30.0, + fps=None): + """One Euro Filter for keypoints smoothing. + + Args: + x0 (np.ndarray[K, 2]): Initialize keypoints value + dx0 (float): 0.0 + min_cutoff (float): parameter for one euro filter + beta (float): parameter for one euro filter + d_cutoff (float): Input data FPS + fps (float): Video FPS for video inference + """ + + # The parameters. + self.data_shape = x0.shape + self.min_cutoff = np.full(x0.shape, min_cutoff) + self.beta = np.full(x0.shape, beta) + self.d_cutoff = np.full(x0.shape, d_cutoff) + # Previous values. + self.x_prev = x0.astype(np.float32) + self.dx_prev = np.full(x0.shape, dx0) + self.mask_prev = np.ma.masked_where(x0 <= 0, x0) + self.realtime = True + if fps is None: + # Using in realtime inference + self.t_e = None + self.skip_frame_factor = d_cutoff + else: + # fps using video inference + self.realtime = False + self.d_cutoff = np.full(x0.shape, float(fps)) + self.t_prev = time() + + def __call__(self, x, t_e=1.0): + """Compute the filtered signal. + + Hyper-parameters (cutoff, beta) are from `VNect + `__ . + + Realtime Camera fps (d_cutoff) default 30.0 + + Args: + x (np.ndarray[K, 2]): keypoints results in frame + t_e (Optional): video skip frame count for posetrack + evaluation + """ + assert x.shape == self.data_shape + + t = 0 + if self.realtime: + t = time() + t_e = (t - self.t_prev) * self.skip_frame_factor + t_e = np.full(x.shape, t_e) + + # missing keypoints mask + mask = np.ma.masked_where(x <= 0, x) + + # The filtered derivative of the signal. + a_d = smoothing_factor(t_e, self.d_cutoff) + dx = (x - self.x_prev) / t_e + dx_hat = exponential_smoothing(a_d, dx, self.dx_prev) + + # The filtered signal. 
+ cutoff = self.min_cutoff + self.beta * np.abs(dx_hat) + a = smoothing_factor(t_e, cutoff) + x_hat = exponential_smoothing(a, x, self.x_prev) + + # missing keypoints remove + np.copyto(x_hat, -10, where=mask.mask) + + # Memorize the previous values. + self.x_prev = x_hat + self.dx_prev = dx_hat + self.t_prev = t + self.mask_prev = mask + + return x_hat diff --git a/mmpose/core/post_processing/post_transforms.py b/mmpose/core/post_processing/post_transforms.py new file mode 100644 index 0000000..93063fb --- /dev/null +++ b/mmpose/core/post_processing/post_transforms.py @@ -0,0 +1,366 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch +# Original licence: Copyright (c) Microsoft, under the MIT License. +# ------------------------------------------------------------------------------ + +import math + +import cv2 +import numpy as np +import torch + + +def fliplr_joints(joints_3d, joints_3d_visible, img_width, flip_pairs): + """Flip human joints horizontally. + + Note: + - num_keypoints: K + + Args: + joints_3d (np.ndarray([K, 3])): Coordinates of keypoints. + joints_3d_visible (np.ndarray([K, 1])): Visibility of keypoints. + img_width (int): Image width. + flip_pairs (list[tuple]): Pairs of keypoints which are mirrored + (for example, left ear and right ear). + + Returns: + tuple: Flipped human joints. + + - joints_3d_flipped (np.ndarray([K, 3])): Flipped joints. + - joints_3d_visible_flipped (np.ndarray([K, 1])): Joint visibility. + """ + + assert len(joints_3d) == len(joints_3d_visible) + assert img_width > 0 + + joints_3d_flipped = joints_3d.copy() + joints_3d_visible_flipped = joints_3d_visible.copy() + + # Swap left-right parts + for left, right in flip_pairs: + joints_3d_flipped[left, :] = joints_3d[right, :] + joints_3d_flipped[right, :] = joints_3d[left, :] + + joints_3d_visible_flipped[left, :] = joints_3d_visible[right, :] + joints_3d_visible_flipped[right, :] = joints_3d_visible[left, :] + + # Flip horizontally + joints_3d_flipped[:, 0] = img_width - 1 - joints_3d_flipped[:, 0] + joints_3d_flipped = joints_3d_flipped * joints_3d_visible_flipped + + return joints_3d_flipped, joints_3d_visible_flipped + + +def fliplr_regression(regression, + flip_pairs, + center_mode='static', + center_x=0.5, + center_index=0): + """Flip human joints horizontally. + + Note: + - batch_size: N + - num_keypoint: K + + Args: + regression (np.ndarray([..., K, C])): Coordinates of keypoints, where K + is the joint number and C is the dimension. Example shapes are: + + - [N, K, C]: a batch of keypoints where N is the batch size. + - [N, T, K, C]: a batch of pose sequences, where T is the frame + number. + flip_pairs (list[tuple()]): Pairs of keypoints which are mirrored + (for example, left ear -- right ear). + center_mode (str): The mode to set the center location on the x-axis + to flip around. Options are: + + - static: use a static x value (see center_x also) + - root: use a root joint (see center_index also) + center_x (float): Set the x-axis location of the flip center. Only used + when center_mode=static. + center_index (int): Set the index of the root joint, whose x location + will be used as the flip center. Only used when center_mode=root. + + Returns: + np.ndarray([..., K, C]): Flipped joints. 
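A sketch of horizontal flipping with COCO-style flip pairs; only three pairs are listed here for brevity, a real dataset config enumerates all of them:

    import numpy as np
    from mmpose.core.post_processing import fliplr_joints

    flip_pairs = [(1, 2), (3, 4), (5, 6)]        # e.g. left/right eye, ear, shoulder
    joints = np.zeros((17, 3), dtype=np.float32)
    joints[1] = [40., 50., 0.]                   # left-eye joint at x = 40
    vis = np.ones((17, 1), dtype=np.float32)

    flipped, vis_flipped = fliplr_joints(joints, vis, img_width=192,
                                         flip_pairs=flip_pairs)
    print(flipped[2, 0])                         # 151.0: mirrored into the right-eye slot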
+ """ + assert regression.ndim >= 2, f'Invalid pose shape {regression.shape}' + + allowed_center_mode = {'static', 'root'} + assert center_mode in allowed_center_mode, 'Get invalid center_mode ' \ + f'{center_mode}, allowed choices are {allowed_center_mode}' + + if center_mode == 'static': + x_c = center_x + elif center_mode == 'root': + assert regression.shape[-2] > center_index + x_c = regression[..., center_index:center_index + 1, 0] + + regression_flipped = regression.copy() + # Swap left-right parts + for left, right in flip_pairs: + regression_flipped[..., left, :] = regression[..., right, :] + regression_flipped[..., right, :] = regression[..., left, :] + + # Flip horizontally + regression_flipped[..., 0] = x_c * 2 - regression_flipped[..., 0] + return regression_flipped + + +def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'): + """Flip the flipped heatmaps back to the original form. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + output_flipped (np.ndarray[N, K, H, W]): The output heatmaps obtained + from the flipped images. + flip_pairs (list[tuple()): Pairs of keypoints which are mirrored + (for example, left ear -- right ear). + target_type (str): GaussianHeatmap or CombinedTarget + + Returns: + np.ndarray: heatmaps that flipped back to the original image + """ + assert output_flipped.ndim == 4, \ + 'output_flipped should be [batch_size, num_keypoints, height, width]' + shape_ori = output_flipped.shape + channels = 1 + if target_type.lower() == 'CombinedTarget'.lower(): + channels = 3 + output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...] + output_flipped = output_flipped.reshape(shape_ori[0], -1, channels, + shape_ori[2], shape_ori[3]) + output_flipped_back = output_flipped.copy() + + # Swap left-right parts + for left, right in flip_pairs: + output_flipped_back[:, left, ...] = output_flipped[:, right, ...] + output_flipped_back[:, right, ...] = output_flipped[:, left, ...] + output_flipped_back = output_flipped_back.reshape(shape_ori) + # Flip horizontally + output_flipped_back = output_flipped_back[..., ::-1] + return output_flipped_back + + +def transform_preds(coords, center, scale, output_size, use_udp=False): + """Get final keypoint predictions from heatmaps and apply scaling and + translation to map them back to the image. + + Note: + num_keypoints: K + + Args: + coords (np.ndarray[K, ndims]): + + * If ndims=2, corrds are predicted keypoint location. + * If ndims=4, corrds are composed of (x, y, scores, tags) + * If ndims=5, corrds are composed of (x, y, scores, tags, + flipped_tags) + + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + output_size (np.ndarray[2, ] | list(2,)): Size of the + destination heatmaps. + use_udp (bool): Use unbiased data processing + + Returns: + np.ndarray: Predicted coordinates in the images. + """ + assert coords.shape[1] in (2, 4, 5) + assert len(center) == 2 + assert len(scale) == 2 + assert len(output_size) == 2 + + # Recover the scale which is normalized by a factor of 200. 
+ scale = scale * 200.0 + + if use_udp: + scale_x = scale[0] / (output_size[0] - 1.0) + scale_y = scale[1] / (output_size[1] - 1.0) + else: + scale_x = scale[0] / output_size[0] + scale_y = scale[1] / output_size[1] + + target_coords = np.ones_like(coords) + target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5 + target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5 + + return target_coords + + +def get_affine_transform(center, + scale, + rot, + output_size, + shift=(0., 0.), + inv=False): + """Get the affine transform matrix, given the center/scale/rot/output_size. + + Args: + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + rot (float): Rotation angle (degree). + output_size (np.ndarray[2, ] | list(2,)): Size of the + destination heatmaps. + shift (0-100%): Shift translation ratio wrt the width/height. + Default (0., 0.). + inv (bool): Option to inverse the affine transform direction. + (inv=False: src->dst or inv=True: dst->src) + + Returns: + np.ndarray: The transform matrix. + """ + assert len(center) == 2 + assert len(scale) == 2 + assert len(output_size) == 2 + assert len(shift) == 2 + + # pixel_std is 200. + scale_tmp = scale * 200.0 + + shift = np.array(shift) + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = rotate_point([0., src_w * -0.5], rot_rad) + dst_dir = np.array([0., dst_w * -0.5]) + + src = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + src[2, :] = _get_3rd_point(src[0, :], src[1, :]) + + dst = np.zeros((3, 2), dtype=np.float32) + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +def affine_transform(pt, trans_mat): + """Apply an affine transformation to the points. + + Args: + pt (np.ndarray): a 2 dimensional point to be transformed + trans_mat (np.ndarray): 2x3 matrix of an affine transform + + Returns: + np.ndarray: Transformed points. + """ + assert len(pt) == 2 + new_pt = np.array(trans_mat) @ np.array([pt[0], pt[1], 1.]) + + return new_pt + + +def _get_3rd_point(a, b): + """To calculate the affine matrix, three pairs of points are required. This + function is used to get the 3rd point, given 2D points a & b. + + The 3rd point is defined by rotating vector `a - b` by 90 degrees + anticlockwise, using b as the rotation center. + + Args: + a (np.ndarray): point(x,y) + b (np.ndarray): point(x,y) + + Returns: + np.ndarray: The 3rd point. + """ + assert len(a) == 2 + assert len(b) == 2 + direction = a - b + third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32) + + return third_pt + + +def rotate_point(pt, angle_rad): + """Rotate a point by an angle. + + Args: + pt (list[float]): 2 dimensional point to be rotated + angle_rad (float): rotation angle by radian + + Returns: + list[float]: Rotated point. 
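A sketch of how the affine helpers are combined to crop a person box into model input space; the 256x192 input size mirrors the pose configs in this patch, while the image, center and scale values are arbitrary:

    import cv2
    import numpy as np
    from mmpose.core.post_processing import affine_transform, get_affine_transform

    img = np.zeros((480, 640, 3), dtype=np.uint8)
    center = np.array([320., 240.], dtype=np.float32)
    scale = np.array([1.0, 1.28], dtype=np.float32)    # bbox (w, h) / 200
    image_size = [192, 256]                            # model input (w, h)

    trans = get_affine_transform(center, scale, rot=0., output_size=image_size)
    crop = cv2.warpAffine(img, trans, (image_size[0], image_size[1]))
    print(crop.shape)                                  # (256, 192, 3)

    # the same matrix maps individual points, e.g. the bbox center
    print(affine_transform([320., 240.], trans))       # ~[ 96. 128.]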
+ """ + assert len(pt) == 2 + sn, cs = np.sin(angle_rad), np.cos(angle_rad) + new_x = pt[0] * cs - pt[1] * sn + new_y = pt[0] * sn + pt[1] * cs + rotated_pt = [new_x, new_y] + + return rotated_pt + + +def get_warp_matrix(theta, size_input, size_dst, size_target): + """Calculate the transformation matrix under the constraint of unbiased. + Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased + Data Processing for Human Pose Estimation (CVPR 2020). + + Args: + theta (float): Rotation angle in degrees. + size_input (np.ndarray): Size of input image [w, h]. + size_dst (np.ndarray): Size of output image [w, h]. + size_target (np.ndarray): Size of ROI in input plane [w, h]. + + Returns: + np.ndarray: A matrix for transformation. + """ + theta = np.deg2rad(theta) + matrix = np.zeros((2, 3), dtype=np.float32) + scale_x = size_dst[0] / size_target[0] + scale_y = size_dst[1] / size_target[1] + matrix[0, 0] = math.cos(theta) * scale_x + matrix[0, 1] = -math.sin(theta) * scale_x + matrix[0, 2] = scale_x * (-0.5 * size_input[0] * math.cos(theta) + + 0.5 * size_input[1] * math.sin(theta) + + 0.5 * size_target[0]) + matrix[1, 0] = math.sin(theta) * scale_y + matrix[1, 1] = math.cos(theta) * scale_y + matrix[1, 2] = scale_y * (-0.5 * size_input[0] * math.sin(theta) - + 0.5 * size_input[1] * math.cos(theta) + + 0.5 * size_target[1]) + return matrix + + +def warp_affine_joints(joints, mat): + """Apply affine transformation defined by the transform matrix on the + joints. + + Args: + joints (np.ndarray[..., 2]): Origin coordinate of joints. + mat (np.ndarray[3, 2]): The affine matrix. + + Returns: + np.ndarray[..., 2]: Result coordinate of joints. + """ + joints = np.array(joints) + shape = joints.shape + joints = joints.reshape(-1, 2) + return np.dot( + np.concatenate((joints, joints[:, 0:1] * 0 + 1), axis=1), + mat.T).reshape(shape) + + +def affine_transform_torch(pts, t): + npts = pts.shape[0] + pts_homo = torch.cat([pts, torch.ones(npts, 1, device=pts.device)], dim=1) + out = torch.mm(t, torch.t(pts_homo)) + return torch.t(out[:2, :]) diff --git a/mmpose/core/utils/__init__.py b/mmpose/core/utils/__init__.py new file mode 100644 index 0000000..bd6c027 --- /dev/null +++ b/mmpose/core/utils/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .dist_utils import allreduce_grads +from .regularizations import WeightNormClipHook + +__all__ = ['allreduce_grads', 'WeightNormClipHook'] diff --git a/mmpose/core/utils/dist_utils.py b/mmpose/core/utils/dist_utils.py new file mode 100644 index 0000000..e76e591 --- /dev/null +++ b/mmpose/core/utils/dist_utils.py @@ -0,0 +1,51 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from collections import OrderedDict + +import torch.distributed as dist +from torch._utils import (_flatten_dense_tensors, _take_tensors, + _unflatten_dense_tensors) + + +def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1): + """Allreduce parameters as a whole.""" + if bucket_size_mb > 0: + bucket_size_bytes = bucket_size_mb * 1024 * 1024 + buckets = _take_tensors(tensors, bucket_size_bytes) + else: + buckets = OrderedDict() + for tensor in tensors: + tp = tensor.type() + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(tensor) + buckets = buckets.values() + + for bucket in buckets: + flat_tensors = _flatten_dense_tensors(bucket) + dist.all_reduce(flat_tensors) + flat_tensors.div_(world_size) + for tensor, synced in zip( + bucket, _unflatten_dense_tensors(flat_tensors, bucket)): + tensor.copy_(synced) + + +def allreduce_grads(params, coalesce=True, bucket_size_mb=-1): + """Allreduce gradients. + + Args: + params (list[torch.Parameters]): List of parameters of a model + coalesce (bool, optional): Whether allreduce parameters as a whole. + Default: True. + bucket_size_mb (int, optional): Size of bucket, the unit is MB. + Default: -1. + """ + grads = [ + param.grad.data for param in params + if param.requires_grad and param.grad is not None + ] + world_size = dist.get_world_size() + if coalesce: + _allreduce_coalesced(grads, world_size, bucket_size_mb) + else: + for tensor in grads: + dist.all_reduce(tensor.div_(world_size)) diff --git a/mmpose/core/utils/regularizations.py b/mmpose/core/utils/regularizations.py new file mode 100644 index 0000000..d8c7449 --- /dev/null +++ b/mmpose/core/utils/regularizations.py @@ -0,0 +1,86 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod, abstractproperty + +import torch + + +class PytorchModuleHook(metaclass=ABCMeta): + """Base class for PyTorch module hook registers. + + An instance of a subclass of PytorchModuleHook can be used to + register hook to a pytorch module using the `register` method like: + hook_register.register(module) + + Subclasses should add/overwrite the following methods: + - __init__ + - hook + - hook_type + """ + + @abstractmethod + def hook(self, *args, **kwargs): + """Hook function.""" + + @abstractproperty + def hook_type(self) -> str: + """Hook type Subclasses should overwrite this function to return a + string value in. + + {`forward`, `forward_pre`, `backward`} + """ + + def register(self, module): + """Register the hook function to the module. + + Args: + module (pytorch module): the module to register the hook. + + Returns: + handle (torch.utils.hooks.RemovableHandle): a handle to remove + the hook by calling handle.remove() + """ + assert isinstance(module, torch.nn.Module) + + if self.hook_type == 'forward': + h = module.register_forward_hook(self.hook) + elif self.hook_type == 'forward_pre': + h = module.register_forward_pre_hook(self.hook) + elif self.hook_type == 'backward': + h = module.register_backward_hook(self.hook) + else: + raise ValueError(f'Invalid hook type {self.hook}') + + return h + + +class WeightNormClipHook(PytorchModuleHook): + """Apply weight norm clip regularization. + + The module's parameter will be clip to a given maximum norm before each + forward pass. + + Args: + max_norm (float): The maximum norm of the parameter. + module_param_names (str|list): The parameter name (or name list) to + apply weight norm clip. 
+ """ + + def __init__(self, max_norm=1.0, module_param_names='weight'): + self.module_param_names = module_param_names if isinstance( + module_param_names, list) else [module_param_names] + self.max_norm = max_norm + + @property + def hook_type(self): + return 'forward_pre' + + def hook(self, module, _input): + for name in self.module_param_names: + assert name in module._parameters, f'{name} is not a parameter' \ + f' of the module {type(module)}' + param = module._parameters[name] + + with torch.no_grad(): + m = param.norm().item() + if m > self.max_norm: + param.mul_(self.max_norm / (m + 1e-6)) diff --git a/mmpose/core/visualization/__init__.py b/mmpose/core/visualization/__init__.py new file mode 100644 index 0000000..9705494 --- /dev/null +++ b/mmpose/core/visualization/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .effects import apply_bugeye_effect, apply_sunglasses_effect +from .image import (imshow_bboxes, imshow_keypoints, imshow_keypoints_3d, + imshow_mesh_3d) + +__all__ = [ + 'imshow_keypoints', + 'imshow_keypoints_3d', + 'imshow_bboxes', + 'apply_bugeye_effect', + 'apply_sunglasses_effect', + 'imshow_mesh_3d', +] diff --git a/mmpose/core/visualization/effects.py b/mmpose/core/visualization/effects.py new file mode 100644 index 0000000..d3add7d --- /dev/null +++ b/mmpose/core/visualization/effects.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import cv2 +import numpy as np + + +def apply_bugeye_effect(img, + pose_results, + left_eye_index, + right_eye_index, + kpt_thr=0.5): + """Apply bug-eye effect. + + Args: + img (np.ndarray): Image data. + pose_results (list[dict]): The pose estimation results containing: + - "bbox" ([K, 4(or 5)]): detection bbox in + [x1, y1, x2, y2, (score)] + - "keypoints" ([K,3]): keypoint detection result in [x, y, score] + left_eye_index (int): Keypoint index of left eye + right_eye_index (int): Keypoint index of right eye + kpt_thr (float): The score threshold of required keypoints. + """ + + xx, yy = np.meshgrid(np.arange(img.shape[1]), np.arange(img.shape[0])) + xx = xx.astype(np.float32) + yy = yy.astype(np.float32) + + for pose in pose_results: + bbox = pose['bbox'] + kpts = pose['keypoints'] + + if kpts[left_eye_index, 2] < kpt_thr or kpts[right_eye_index, + 2] < kpt_thr: + continue + + kpt_leye = kpts[left_eye_index, :2] + kpt_reye = kpts[right_eye_index, :2] + for xc, yc in [kpt_leye, kpt_reye]: + + # distortion parameters + k1 = 0.001 + epe = 1e-5 + + scale = (bbox[2] - bbox[0])**2 + (bbox[3] - bbox[1])**2 + r2 = ((xx - xc)**2 + (yy - yc)**2) + r2 = (r2 + epe) / scale # normalized by bbox scale + + xx = (xx - xc) / (1 + k1 / r2) + xc + yy = (yy - yc) / (1 + k1 / r2) + yc + + img = cv2.remap( + img, + xx, + yy, + interpolation=cv2.INTER_AREA, + borderMode=cv2.BORDER_REPLICATE) + return img + + +def apply_sunglasses_effect(img, + pose_results, + sunglasses_img, + left_eye_index, + right_eye_index, + kpt_thr=0.5): + """Apply sunglasses effect. + + Args: + img (np.ndarray): Image data. + pose_results (list[dict]): The pose estimation results containing: + - "keypoints" ([K,3]): keypoint detection result in [x, y, score] + sunglasses_img (np.ndarray): Sunglasses image with white background. + left_eye_index (int): Keypoint index of left eye + right_eye_index (int): Keypoint index of right eye + kpt_thr (float): The score threshold of required keypoints. 
+ """ + + hm, wm = sunglasses_img.shape[:2] + # anchor points in the sunglasses mask + pts_src = np.array([[0.3 * wm, 0.3 * hm], [0.3 * wm, 0.7 * hm], + [0.7 * wm, 0.3 * hm], [0.7 * wm, 0.7 * hm]], + dtype=np.float32) + + for pose in pose_results: + kpts = pose['keypoints'] + + if kpts[left_eye_index, 2] < kpt_thr or kpts[right_eye_index, + 2] < kpt_thr: + continue + + kpt_leye = kpts[left_eye_index, :2] + kpt_reye = kpts[right_eye_index, :2] + # orthogonal vector to the left-to-right eyes + vo = 0.5 * (kpt_reye - kpt_leye)[::-1] * [-1, 1] + + # anchor points in the image by eye positions + pts_tar = np.vstack( + [kpt_reye + vo, kpt_reye - vo, kpt_leye + vo, kpt_leye - vo]) + + h_mat, _ = cv2.findHomography(pts_src, pts_tar) + patch = cv2.warpPerspective( + sunglasses_img, + h_mat, + dsize=(img.shape[1], img.shape[0]), + borderValue=(255, 255, 255)) + # mask the white background area in the patch with a threshold 200 + mask = cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY) + mask = (mask < 200).astype(np.uint8) + img = cv2.copyTo(patch, mask, img) + + return img diff --git a/mmpose/core/visualization/image.py b/mmpose/core/visualization/image.py new file mode 100644 index 0000000..8acd10b --- /dev/null +++ b/mmpose/core/visualization/image.py @@ -0,0 +1,442 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +import os +import warnings + +import cv2 +import mmcv +import numpy as np +from matplotlib import pyplot as plt +from mmcv.utils.misc import deprecated_api_warning +from mmcv.visualization.color import color_val + +try: + import trimesh + has_trimesh = True +except (ImportError, ModuleNotFoundError): + has_trimesh = False + +try: + os.environ['PYOPENGL_PLATFORM'] = 'osmesa' + import pyrender + has_pyrender = True +except (ImportError, ModuleNotFoundError): + has_pyrender = False + + +def imshow_bboxes(img, + bboxes, + labels=None, + colors='green', + text_color='white', + thickness=1, + font_scale=0.5, + show=True, + win_name='', + wait_time=0, + out_file=None): + """Draw bboxes with labels (optional) on an image. This is a wrapper of + mmcv.imshow_bboxes. + + Args: + img (str or ndarray): The image to be displayed. + bboxes (ndarray): ndarray of shape (k, 4), each row is a bbox in + format [x1, y1, x2, y2]. + labels (str or list[str], optional): labels of each bbox. + colors (list[str or tuple or :obj:`Color`]): A list of colors. + text_color (str or tuple or :obj:`Color`): Color of texts. + thickness (int): Thickness of lines. + font_scale (float): Font scales of texts. + show (bool): Whether to show the image. + win_name (str): The window name. + wait_time (int): Value of waitKey param. + out_file (str, optional): The filename to write the image. + + Returns: + ndarray: The image with bboxes drawn on it. 
+ """ + + # adapt to mmcv.imshow_bboxes input format + bboxes = np.split( + bboxes, bboxes.shape[0], axis=0) if bboxes.shape[0] > 0 else [] + if not isinstance(colors, list): + colors = [colors for _ in range(len(bboxes))] + colors = [mmcv.color_val(c) for c in colors] + assert len(bboxes) == len(colors) + + img = mmcv.imshow_bboxes( + img, + bboxes, + colors, + top_k=-1, + thickness=thickness, + show=False, + out_file=None) + + if labels is not None: + if not isinstance(labels, list): + labels = [labels for _ in range(len(bboxes))] + assert len(labels) == len(bboxes) + + for bbox, label, color in zip(bboxes, labels, colors): + if label is None: + continue + bbox_int = bbox[0, :4].astype(np.int32) + # roughly estimate the proper font size + text_size, text_baseline = cv2.getTextSize(label, + cv2.FONT_HERSHEY_DUPLEX, + font_scale, thickness) + text_x1 = bbox_int[0] + text_y1 = max(0, bbox_int[1] - text_size[1] - text_baseline) + text_x2 = bbox_int[0] + text_size[0] + text_y2 = text_y1 + text_size[1] + text_baseline + cv2.rectangle(img, (text_x1, text_y1), (text_x2, text_y2), color, + cv2.FILLED) + cv2.putText(img, label, (text_x1, text_y2 - text_baseline), + cv2.FONT_HERSHEY_DUPLEX, font_scale, + mmcv.color_val(text_color), thickness) + + if show: + mmcv.imshow(img, win_name, wait_time) + if out_file is not None: + mmcv.imwrite(img, out_file) + return img + + +@deprecated_api_warning({'pose_limb_color': 'pose_link_color'}) +def imshow_keypoints(img, + pose_result, + skeleton=None, + kpt_score_thr=0.3, + pose_kpt_color=None, + pose_link_color=None, + radius=4, + thickness=1, + show_keypoint_weight=False): + """Draw keypoints and links on an image. + + Args: + img (str or Tensor): The image to draw poses on. If an image array + is given, id will be modified in-place. + pose_result (list[kpts]): The poses to draw. Each element kpts is + a set of K keypoints as an Kx3 numpy.ndarray, where each + keypoint is represented as x, y, score. + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. If None, + the keypoint will not be drawn. + pose_link_color (np.array[Mx3]): Color of M links. If None, the + links will not be drawn. + thickness (int): Thickness of lines. 
+ """ + + img = mmcv.imread(img) + img_h, img_w, _ = img.shape + + for kpts in pose_result: + + kpts = np.array(kpts, copy=False) + + # draw each point on image + if pose_kpt_color is not None: + assert len(pose_kpt_color) == len(kpts) + for kid, kpt in enumerate(kpts): + x_coord, y_coord, kpt_score = int(kpt[0]), int(kpt[1]), kpt[2] + if kpt_score > kpt_score_thr: + color = tuple(int(c) for c in pose_kpt_color[kid]) + if show_keypoint_weight: + img_copy = img.copy() + cv2.circle(img_copy, (int(x_coord), int(y_coord)), + radius, color, -1) + transparency = max(0, min(1, kpt_score)) + cv2.addWeighted( + img_copy, + transparency, + img, + 1 - transparency, + 0, + dst=img) + else: + cv2.circle(img, (int(x_coord), int(y_coord)), radius, + color, -1) + + # draw links + if skeleton is not None and pose_link_color is not None: + assert len(pose_link_color) == len(skeleton) + for sk_id, sk in enumerate(skeleton): + pos1 = (int(kpts[sk[0], 0]), int(kpts[sk[0], 1])) + pos2 = (int(kpts[sk[1], 0]), int(kpts[sk[1], 1])) + if (pos1[0] > 0 and pos1[0] < img_w and pos1[1] > 0 + and pos1[1] < img_h and pos2[0] > 0 and pos2[0] < img_w + and pos2[1] > 0 and pos2[1] < img_h + and kpts[sk[0], 2] > kpt_score_thr + and kpts[sk[1], 2] > kpt_score_thr): + color = tuple(int(c) for c in pose_link_color[sk_id]) + if show_keypoint_weight: + img_copy = img.copy() + X = (pos1[0], pos2[0]) + Y = (pos1[1], pos2[1]) + mX = np.mean(X) + mY = np.mean(Y) + length = ((Y[0] - Y[1])**2 + (X[0] - X[1])**2)**0.5 + angle = math.degrees( + math.atan2(Y[0] - Y[1], X[0] - X[1])) + stickwidth = 2 + polygon = cv2.ellipse2Poly( + (int(mX), int(mY)), + (int(length / 2), int(stickwidth)), int(angle), 0, + 360, 1) + cv2.fillConvexPoly(img_copy, polygon, color) + transparency = max( + 0, min(1, 0.5 * (kpts[sk[0], 2] + kpts[sk[1], 2]))) + cv2.addWeighted( + img_copy, + transparency, + img, + 1 - transparency, + 0, + dst=img) + else: + cv2.line(img, pos1, pos2, color, thickness=thickness) + + return img + + +def imshow_keypoints_3d( + pose_result, + img=None, + skeleton=None, + pose_kpt_color=None, + pose_link_color=None, + vis_height=400, + kpt_score_thr=0.3, + num_instances=-1, + *, + axis_azimuth=70, + axis_limit=1.7, + axis_dist=10.0, + axis_elev=15.0, +): + """Draw 3D keypoints and links in 3D coordinates. + + Args: + pose_result (list[dict]): 3D pose results containing: + - "keypoints_3d" ([K,4]): 3D keypoints + - "title" (str): Optional. A string to specify the title of the + visualization of this pose result + img (str|np.ndarray): Opptional. The image or image path to show input + image and/or 2D pose. Note that the image should be given in BGR + channel order. + skeleton (list of [idx_i,idx_j]): Skeleton described by a list of + links, each is a pair of joint indices. + pose_kpt_color (np.ndarray[Nx3]`): Color of N keypoints. If None, do + not nddraw keypoints. + pose_link_color (np.array[Mx3]): Color of M links. If None, do not + draw links. + vis_height (int): The image height of the visualization. The width + will be N*vis_height depending on the number of visualized + items. + kpt_score_thr (float): Minimum score of keypoints to be shown. + Default: 0.3. + num_instances (int): Number of instances to be shown in 3D. If smaller + than 0, all the instances in the pose_result will be shown. + Otherwise, pad or truncate the pose_result to a length of + num_instances. + axis_azimuth (float): axis azimuth angle for 3D visualizations. + axis_dist (float): axis distance for 3D visualizations. 
+ axis_elev (float): axis elevation view angle for 3D visualizations. + axis_limit (float): The axis limit to visualize 3d pose. The xyz + range will be set as: + - x: [x_c - axis_limit/2, x_c + axis_limit/2] + - y: [y_c - axis_limit/2, y_c + axis_limit/2] + - z: [0, axis_limit] + Where x_c, y_c is the mean value of x and y coordinates + figsize: (float): figure size in inch. + """ + + show_img = img is not None + if num_instances < 0: + num_instances = len(pose_result) + else: + if len(pose_result) > num_instances: + pose_result = pose_result[:num_instances] + elif len(pose_result) < num_instances: + pose_result += [dict()] * (num_instances - len(pose_result)) + num_axis = num_instances + 1 if show_img else num_instances + + plt.ioff() + fig = plt.figure(figsize=(vis_height * num_axis * 0.01, vis_height * 0.01)) + + if show_img: + img = mmcv.imread(img, channel_order='bgr') + img = mmcv.bgr2rgb(img) + img = mmcv.imrescale(img, scale=vis_height / img.shape[0]) + + ax_img = fig.add_subplot(1, num_axis, 1) + ax_img.get_xaxis().set_visible(False) + ax_img.get_yaxis().set_visible(False) + ax_img.set_axis_off() + ax_img.set_title('Input') + ax_img.imshow(img, aspect='equal') + + for idx, res in enumerate(pose_result): + dummy = len(res) == 0 + kpts = np.zeros((1, 3)) if dummy else res['keypoints_3d'] + if kpts.shape[1] == 3: + kpts = np.concatenate([kpts, np.ones((kpts.shape[0], 1))], axis=1) + valid = kpts[:, 3] >= kpt_score_thr + + ax_idx = idx + 2 if show_img else idx + 1 + ax = fig.add_subplot(1, num_axis, ax_idx, projection='3d') + ax.view_init( + elev=axis_elev, + azim=axis_azimuth, + ) + x_c = np.mean(kpts[valid, 0]) if sum(valid) > 0 else 0 + y_c = np.mean(kpts[valid, 1]) if sum(valid) > 0 else 0 + ax.set_xlim3d([x_c - axis_limit / 2, x_c + axis_limit / 2]) + ax.set_ylim3d([y_c - axis_limit / 2, y_c + axis_limit / 2]) + ax.set_zlim3d([0, axis_limit]) + ax.set_aspect('auto') + ax.set_xticks([]) + ax.set_yticks([]) + ax.set_zticks([]) + ax.set_xticklabels([]) + ax.set_yticklabels([]) + ax.set_zticklabels([]) + ax.dist = axis_dist + + if not dummy and pose_kpt_color is not None: + pose_kpt_color = np.array(pose_kpt_color) + assert len(pose_kpt_color) == len(kpts) + x_3d, y_3d, z_3d = np.split(kpts[:, :3], [1, 2], axis=1) + # matplotlib uses RGB color in [0, 1] value range + _color = pose_kpt_color[..., ::-1] / 255. + ax.scatter( + x_3d[valid], + y_3d[valid], + z_3d[valid], + marker='o', + color=_color[valid], + ) + + if not dummy and skeleton is not None and pose_link_color is not None: + pose_link_color = np.array(pose_link_color) + assert len(pose_link_color) == len(skeleton) + for link, link_color in zip(skeleton, pose_link_color): + link_indices = [_i for _i in link] + xs_3d = kpts[link_indices, 0] + ys_3d = kpts[link_indices, 1] + zs_3d = kpts[link_indices, 2] + kpt_score = kpts[link_indices, 3] + if kpt_score.min() > kpt_score_thr: + # matplotlib uses RGB color in [0, 1] value range + _color = link_color[::-1] / 255. + ax.plot(xs_3d, ys_3d, zs_3d, color=_color, zdir='z') + + if 'title' in res: + ax.set_title(res['title']) + + # convert figure to numpy array + fig.tight_layout() + fig.canvas.draw() + img_w, img_h = fig.canvas.get_width_height() + img_vis = np.frombuffer( + fig.canvas.tostring_rgb(), dtype=np.uint8).reshape(img_h, img_w, -1) + img_vis = mmcv.rgb2bgr(img_vis) + + plt.close(fig) + + return img_vis + + +def imshow_mesh_3d(img, + vertices, + faces, + camera_center, + focal_length, + colors=(76, 76, 204)): + """Render 3D meshes on background image. 
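+
+    This function relies on the optional ``trimesh`` and ``pyrender``
+    packages; if either is unavailable, the input image is returned
+    unchanged with a warning.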
+ + Args: + img(np.ndarray): Background image. + vertices (list of np.ndarray): Vetrex coordinates in camera space. + faces (list of np.ndarray): Faces of meshes. + camera_center ([2]): Center pixel. + focal_length ([2]): Focal length of camera. + colors (list[str or tuple or Color]): A list of mesh colors. + """ + + H, W, C = img.shape + + if not has_pyrender: + warnings.warn('pyrender package is not installed.') + return img + + if not has_trimesh: + warnings.warn('trimesh package is not installed.') + return img + + try: + renderer = pyrender.OffscreenRenderer( + viewport_width=W, viewport_height=H) + except (ImportError, RuntimeError): + warnings.warn('pyrender package is not installed correctly.') + return img + + if not isinstance(colors, list): + colors = [colors for _ in range(len(vertices))] + colors = [color_val(c) for c in colors] + + depth_map = np.ones([H, W]) * np.inf + output_img = img + for idx in range(len(vertices)): + color = colors[idx] + color = [c / 255.0 for c in color] + color.append(1.0) + vert = vertices[idx] + face = faces[idx] + + material = pyrender.MetallicRoughnessMaterial( + metallicFactor=0.2, alphaMode='OPAQUE', baseColorFactor=color) + + mesh = trimesh.Trimesh(vert, face) + rot = trimesh.transformations.rotation_matrix( + np.radians(180), [1, 0, 0]) + mesh.apply_transform(rot) + mesh = pyrender.Mesh.from_trimesh(mesh, material=material) + + scene = pyrender.Scene(ambient_light=(0.5, 0.5, 0.5)) + scene.add(mesh, 'mesh') + + camera_pose = np.eye(4) + camera = pyrender.IntrinsicsCamera( + fx=focal_length[0], + fy=focal_length[1], + cx=camera_center[0], + cy=camera_center[1], + zfar=1e5) + scene.add(camera, pose=camera_pose) + + light = pyrender.DirectionalLight(color=[1.0, 1.0, 1.0], intensity=1) + light_pose = np.eye(4) + + light_pose[:3, 3] = np.array([0, -1, 1]) + scene.add(light, pose=light_pose) + + light_pose[:3, 3] = np.array([0, 1, 1]) + scene.add(light, pose=light_pose) + + light_pose[:3, 3] = np.array([1, 1, 2]) + scene.add(light, pose=light_pose) + + color, rend_depth = renderer.render( + scene, flags=pyrender.RenderFlags.RGBA) + + valid_mask = (rend_depth < depth_map) * (rend_depth > 0) + depth_map[valid_mask] = rend_depth[valid_mask] + valid_mask = valid_mask[:, :, None] + output_img = ( + valid_mask * color[:, :, :3] + (1 - valid_mask) * output_img) + + return output_img diff --git a/mmpose/datasets/__init__.py b/mmpose/datasets/__init__.py new file mode 100644 index 0000000..1b9e7cf --- /dev/null +++ b/mmpose/datasets/__init__.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .builder import DATASETS, PIPELINES, build_dataloader, build_dataset +from .dataset_info import DatasetInfo +from .pipelines import Compose +from .samplers import DistributedSampler + +from .datasets import ( # isort:skip + AnimalATRWDataset, AnimalFlyDataset, AnimalHorse10Dataset, + AnimalLocustDataset, AnimalMacaqueDataset, AnimalPoseDataset, + AnimalZebraDataset, Body3DH36MDataset, BottomUpAicDataset, + BottomUpCocoDataset, BottomUpCocoWholeBodyDataset, + BottomUpCrowdPoseDataset, BottomUpMhpDataset, DeepFashionDataset, + Face300WDataset, FaceAFLWDataset, FaceCocoWholeBodyDataset, + FaceCOFWDataset, FaceWFLWDataset, FreiHandDataset, + HandCocoWholeBodyDataset, InterHand2DDataset, InterHand3DDataset, + MeshAdversarialDataset, MeshH36MDataset, MeshMixDataset, MoshDataset, + OneHand10KDataset, PanopticDataset, TopDownAicDataset, TopDownCocoDataset, + TopDownCocoWholeBodyDataset, TopDownCrowdPoseDataset, + TopDownFreiHandDataset, TopDownH36MDataset, TopDownJhmdbDataset, + TopDownMhpDataset, TopDownMpiiDataset, TopDownMpiiTrbDataset, + TopDownOCHumanDataset, TopDownOneHand10KDataset, TopDownPanopticDataset, + TopDownPoseTrack18Dataset, TopDownPoseTrack18VideoDataset) + +__all__ = [ + 'TopDownCocoDataset', 'BottomUpCocoDataset', 'BottomUpMhpDataset', + 'BottomUpAicDataset', 'BottomUpCocoWholeBodyDataset', 'TopDownMpiiDataset', + 'TopDownMpiiTrbDataset', 'OneHand10KDataset', 'PanopticDataset', + 'HandCocoWholeBodyDataset', 'FreiHandDataset', 'InterHand2DDataset', + 'InterHand3DDataset', 'TopDownOCHumanDataset', 'TopDownAicDataset', + 'TopDownCocoWholeBodyDataset', 'MeshH36MDataset', 'MeshMixDataset', + 'MoshDataset', 'MeshAdversarialDataset', 'TopDownCrowdPoseDataset', + 'BottomUpCrowdPoseDataset', 'TopDownFreiHandDataset', + 'TopDownOneHand10KDataset', 'TopDownPanopticDataset', + 'TopDownPoseTrack18Dataset', 'TopDownJhmdbDataset', 'TopDownMhpDataset', + 'DeepFashionDataset', 'Face300WDataset', 'FaceAFLWDataset', + 'FaceWFLWDataset', 'FaceCOFWDataset', 'FaceCocoWholeBodyDataset', + 'Body3DH36MDataset', 'AnimalHorse10Dataset', 'AnimalMacaqueDataset', + 'AnimalFlyDataset', 'AnimalLocustDataset', 'AnimalZebraDataset', + 'AnimalATRWDataset', 'AnimalPoseDataset', 'TopDownH36MDataset', + 'TopDownPoseTrack18VideoDataset', 'build_dataloader', 'build_dataset', + 'Compose', 'DistributedSampler', 'DATASETS', 'PIPELINES', 'DatasetInfo' +] diff --git a/mmpose/datasets/builder.py b/mmpose/datasets/builder.py new file mode 100644 index 0000000..990ba85 --- /dev/null +++ b/mmpose/datasets/builder.py @@ -0,0 +1,162 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +import platform +import random +from functools import partial + +import numpy as np +from mmcv.parallel import collate +from mmcv.runner import get_dist_info +from mmcv.utils import Registry, build_from_cfg, is_seq_of +from mmcv.utils.parrots_wrapper import _get_dataloader +from torch.utils.data.dataset import ConcatDataset + +from .samplers import DistributedSampler + +if platform.system() != 'Windows': + # https://github.com/pytorch/pytorch/issues/973 + import resource + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + base_soft_limit = rlimit[0] + hard_limit = rlimit[1] + soft_limit = min(max(4096, base_soft_limit), hard_limit) + resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) + +DATASETS = Registry('dataset') +PIPELINES = Registry('pipeline') + + +def _concat_dataset(cfg, default_args=None): + types = cfg['type'] + ann_files = cfg['ann_file'] + img_prefixes = cfg.get('img_prefix', None) + dataset_infos = cfg.get('dataset_info', None) + + num_joints = cfg['data_cfg'].get('num_joints', None) + dataset_channel = cfg['data_cfg'].get('dataset_channel', None) + + datasets = [] + num_dset = len(ann_files) + for i in range(num_dset): + cfg_copy = copy.deepcopy(cfg) + cfg_copy['ann_file'] = ann_files[i] + + if isinstance(types, (list, tuple)): + cfg_copy['type'] = types[i] + if isinstance(img_prefixes, (list, tuple)): + cfg_copy['img_prefix'] = img_prefixes[i] + if isinstance(dataset_infos, (list, tuple)): + cfg_copy['dataset_info'] = dataset_infos[i] + + if isinstance(num_joints, (list, tuple)): + cfg_copy['data_cfg']['num_joints'] = num_joints[i] + + if is_seq_of(dataset_channel, list): + cfg_copy['data_cfg']['dataset_channel'] = dataset_channel[i] + + datasets.append(build_dataset(cfg_copy, default_args)) + + return ConcatDataset(datasets) + + +def build_dataset(cfg, default_args=None): + """Build a dataset from config dict. + + Args: + cfg (dict): Config dict. It should at least contain the key "type". + default_args (dict, optional): Default initialization arguments. + Default: None. + + Returns: + Dataset: The constructed dataset. + """ + from .dataset_wrappers import RepeatDataset + + if isinstance(cfg, (list, tuple)): + dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) + elif cfg['type'] == 'ConcatDataset': + dataset = ConcatDataset( + [build_dataset(c, default_args) for c in cfg['datasets']]) + elif cfg['type'] == 'RepeatDataset': + dataset = RepeatDataset( + build_dataset(cfg['dataset'], default_args), cfg['times']) + elif isinstance(cfg.get('ann_file'), (list, tuple)): + dataset = _concat_dataset(cfg, default_args) + else: + dataset = build_from_cfg(cfg, DATASETS, default_args) + return dataset + + +def build_dataloader(dataset, + samples_per_gpu, + workers_per_gpu, + num_gpus=1, + dist=True, + shuffle=True, + seed=None, + drop_last=True, + pin_memory=True, + **kwargs): + """Build PyTorch DataLoader. + + In distributed training, each GPU/process has a dataloader. + In non-distributed training, there is only one dataloader for all GPUs. + + Args: + dataset (Dataset): A PyTorch dataset. + samples_per_gpu (int): Number of training samples on each GPU, i.e., + batch size of each GPU. + workers_per_gpu (int): How many subprocesses to use for data loading + for each GPU. + num_gpus (int): Number of GPUs. Only used in non-distributed training. + dist (bool): Distributed training/test or not. Default: True. + shuffle (bool): Whether to shuffle the data at every epoch. + Default: True. 
+ drop_last (bool): Whether to drop the last incomplete batch in epoch. + Default: True + pin_memory (bool): Whether to use pin_memory in DataLoader. + Default: True + kwargs: any keyword argument to be used to initialize DataLoader + + Returns: + DataLoader: A PyTorch dataloader. + """ + rank, world_size = get_dist_info() + if dist: + sampler = DistributedSampler( + dataset, world_size, rank, shuffle=shuffle, seed=seed) + shuffle = False + batch_size = samples_per_gpu + num_workers = workers_per_gpu + else: + sampler = None + batch_size = num_gpus * samples_per_gpu + num_workers = num_gpus * workers_per_gpu + + init_fn = partial( + worker_init_fn, num_workers=num_workers, rank=rank, + seed=seed) if seed is not None else None + + _, DataLoader = _get_dataloader() + data_loader = DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + num_workers=num_workers, + collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), + pin_memory=pin_memory, + shuffle=shuffle, + worker_init_fn=init_fn, + drop_last=drop_last, + **kwargs) + + return data_loader + + +def worker_init_fn(worker_id, num_workers, rank, seed): + """Init the random seed for various workers.""" + # The seed of each worker equals to + # num_worker * rank + worker_id + user_seed + worker_seed = num_workers * rank + worker_id + seed + np.random.seed(worker_seed) + random.seed(worker_seed) diff --git a/mmpose/datasets/dataset_info.py b/mmpose/datasets/dataset_info.py new file mode 100644 index 0000000..ef0d62e --- /dev/null +++ b/mmpose/datasets/dataset_info.py @@ -0,0 +1,104 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + + +class DatasetInfo: + + def __init__(self, dataset_info): + self._dataset_info = dataset_info + self.dataset_name = self._dataset_info['dataset_name'] + self.paper_info = self._dataset_info['paper_info'] + self.keypoint_info = self._dataset_info['keypoint_info'] + self.skeleton_info = self._dataset_info['skeleton_info'] + self.joint_weights = np.array( + self._dataset_info['joint_weights'], dtype=np.float32)[:, None] + + self.sigmas = np.array(self._dataset_info['sigmas']) + + self._parse_keypoint_info() + self._parse_skeleton_info() + + def _parse_skeleton_info(self): + """Parse skeleton information. + + - link_num (int): number of links. + - skeleton (list((2,))): list of links (id). + - skeleton_name (list((2,))): list of links (name). + - pose_link_color (np.ndarray): the color of the link for + visualization. + """ + self.link_num = len(self.skeleton_info.keys()) + self.pose_link_color = [] + + self.skeleton_name = [] + self.skeleton = [] + for skid in self.skeleton_info.keys(): + link = self.skeleton_info[skid]['link'] + self.skeleton_name.append(link) + self.skeleton.append([ + self.keypoint_name2id[link[0]], self.keypoint_name2id[link[1]] + ]) + self.pose_link_color.append(self.skeleton_info[skid].get( + 'color', [255, 128, 0])) + self.pose_link_color = np.array(self.pose_link_color) + + def _parse_keypoint_info(self): + """Parse keypoint information. + + - keypoint_num (int): number of keypoints. + - keypoint_id2name (dict): mapping keypoint id to keypoint name. + - keypoint_name2id (dict): mapping keypoint name to keypoint id. + - upper_body_ids (list): a list of keypoints that belong to the + upper body. + - lower_body_ids (list): a list of keypoints that belong to the + lower body. 
+ - flip_index (list): list of flip index (id) + - flip_pairs (list((2,))): list of flip pairs (id) + - flip_index_name (list): list of flip index (name) + - flip_pairs_name (list((2,))): list of flip pairs (name) + - pose_kpt_color (np.ndarray): the color of the keypoint for + visualization. + """ + + self.keypoint_num = len(self.keypoint_info.keys()) + self.keypoint_id2name = {} + self.keypoint_name2id = {} + + self.pose_kpt_color = [] + self.upper_body_ids = [] + self.lower_body_ids = [] + + self.flip_index_name = [] + self.flip_pairs_name = [] + + for kid in self.keypoint_info.keys(): + + keypoint_name = self.keypoint_info[kid]['name'] + self.keypoint_id2name[kid] = keypoint_name + self.keypoint_name2id[keypoint_name] = kid + self.pose_kpt_color.append(self.keypoint_info[kid].get( + 'color', [255, 128, 0])) + + type = self.keypoint_info[kid].get('type', '') + if type == 'upper': + self.upper_body_ids.append(kid) + elif type == 'lower': + self.lower_body_ids.append(kid) + else: + pass + + swap_keypoint = self.keypoint_info[kid].get('swap', '') + if swap_keypoint == keypoint_name or swap_keypoint == '': + self.flip_index_name.append(keypoint_name) + else: + self.flip_index_name.append(swap_keypoint) + if [swap_keypoint, keypoint_name] not in self.flip_pairs_name: + self.flip_pairs_name.append([keypoint_name, swap_keypoint]) + + self.flip_pairs = [[ + self.keypoint_name2id[pair[0]], self.keypoint_name2id[pair[1]] + ] for pair in self.flip_pairs_name] + self.flip_index = [ + self.keypoint_name2id[name] for name in self.flip_index_name + ] + self.pose_kpt_color = np.array(self.pose_kpt_color) diff --git a/mmpose/datasets/dataset_wrappers.py b/mmpose/datasets/dataset_wrappers.py new file mode 100644 index 0000000..aaaa173 --- /dev/null +++ b/mmpose/datasets/dataset_wrappers.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .builder import DATASETS + + +@DATASETS.register_module() +class RepeatDataset: + """A wrapper of repeated dataset. + + The length of repeated dataset will be `times` larger than the original + dataset. This is useful when the data loading time is long but the dataset + is small. Using RepeatDataset can reduce the data loading time between + epochs. + + Args: + dataset (:obj:`Dataset`): The dataset to be repeated. + times (int): Repeat times. + """ + + def __init__(self, dataset, times): + self.dataset = dataset + self.times = times + + self._ori_len = len(self.dataset) + + def __getitem__(self, idx): + """Get data.""" + return self.dataset[idx % self._ori_len] + + def __len__(self): + """Length after repetition.""" + return self.times * self._ori_len diff --git a/mmpose/datasets/datasets/__init__.py b/mmpose/datasets/datasets/__init__.py new file mode 100644 index 0000000..f3839e5 --- /dev/null +++ b/mmpose/datasets/datasets/__init__.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from ...deprecated import (TopDownFreiHandDataset, TopDownOneHand10KDataset, + TopDownPanopticDataset) +from .animal import (AnimalATRWDataset, AnimalFlyDataset, AnimalHorse10Dataset, + AnimalLocustDataset, AnimalMacaqueDataset, + AnimalPoseDataset, AnimalZebraDataset) +from .body3d import Body3DH36MDataset, Body3DMviewDirectPanopticDataset +from .bottom_up import (BottomUpAicDataset, BottomUpCocoDataset, + BottomUpCocoWholeBodyDataset, BottomUpCrowdPoseDataset, + BottomUpMhpDataset) +from .face import (Face300WDataset, FaceAFLWDataset, FaceCocoWholeBodyDataset, + FaceCOFWDataset, FaceWFLWDataset) +from .fashion import DeepFashionDataset +from .hand import (FreiHandDataset, HandCocoWholeBodyDataset, + InterHand2DDataset, InterHand3DDataset, OneHand10KDataset, + PanopticDataset) +from .mesh import (MeshAdversarialDataset, MeshH36MDataset, MeshMixDataset, + MoshDataset) +from .top_down import (TopDownAicDataset, TopDownCocoDataset, + TopDownCocoWholeBodyDataset, TopDownCrowdPoseDataset, + TopDownH36MDataset, TopDownHalpeDataset, + TopDownJhmdbDataset, TopDownMhpDataset, + TopDownMpiiDataset, TopDownMpiiTrbDataset, + TopDownOCHumanDataset, TopDownPoseTrack18Dataset, + TopDownPoseTrack18VideoDataset) + +__all__ = [ + 'TopDownCocoDataset', 'BottomUpCocoDataset', 'BottomUpMhpDataset', + 'BottomUpAicDataset', 'BottomUpCocoWholeBodyDataset', 'TopDownMpiiDataset', + 'TopDownMpiiTrbDataset', 'OneHand10KDataset', 'PanopticDataset', + 'HandCocoWholeBodyDataset', 'FreiHandDataset', 'InterHand2DDataset', + 'InterHand3DDataset', 'TopDownOCHumanDataset', 'TopDownAicDataset', + 'TopDownCocoWholeBodyDataset', 'MeshH36MDataset', 'MeshMixDataset', + 'MoshDataset', 'MeshAdversarialDataset', 'TopDownCrowdPoseDataset', + 'BottomUpCrowdPoseDataset', 'TopDownFreiHandDataset', + 'TopDownOneHand10KDataset', 'TopDownPanopticDataset', + 'TopDownPoseTrack18Dataset', 'TopDownJhmdbDataset', 'TopDownMhpDataset', + 'DeepFashionDataset', 'Face300WDataset', 'FaceAFLWDataset', + 'FaceWFLWDataset', 'FaceCOFWDataset', 'FaceCocoWholeBodyDataset', + 'Body3DH36MDataset', 'AnimalHorse10Dataset', 'AnimalMacaqueDataset', + 'AnimalFlyDataset', 'AnimalLocustDataset', 'AnimalZebraDataset', + 'AnimalATRWDataset', 'AnimalPoseDataset', 'TopDownH36MDataset', + 'TopDownHalpeDataset', 'TopDownPoseTrack18VideoDataset', + 'Body3DMviewDirectPanopticDataset' +] diff --git a/mmpose/datasets/datasets/animal/__init__.py b/mmpose/datasets/datasets/animal/__init__.py new file mode 100644 index 0000000..185b935 --- /dev/null +++ b/mmpose/datasets/datasets/animal/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .animal_ap10k_dataset import AnimalAP10KDataset +from .animal_atrw_dataset import AnimalATRWDataset +from .animal_fly_dataset import AnimalFlyDataset +from .animal_horse10_dataset import AnimalHorse10Dataset +from .animal_locust_dataset import AnimalLocustDataset +from .animal_macaque_dataset import AnimalMacaqueDataset +from .animal_pose_dataset import AnimalPoseDataset +from .animal_zebra_dataset import AnimalZebraDataset + +__all__ = [ + 'AnimalHorse10Dataset', 'AnimalMacaqueDataset', 'AnimalFlyDataset', + 'AnimalLocustDataset', 'AnimalZebraDataset', 'AnimalATRWDataset', + 'AnimalPoseDataset', 'AnimalAP10KDataset' +] diff --git a/mmpose/datasets/datasets/animal/animal_ap10k_dataset.py b/mmpose/datasets/datasets/animal/animal_ap10k_dataset.py new file mode 100644 index 0000000..11a1e73 --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_ap10k_dataset.py @@ -0,0 +1,367 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning +from xtcocotools.cocoeval import COCOeval + +from ....core.post_processing import oks_nms, soft_oks_nms +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class AnimalAP10KDataset(Kpt2dSviewRgbImgTopDownDataset): + """AP-10K dataset for animal pose estimation. + + "AP-10K: A Benchmark for Animal Pose Estimation in the Wild" + Neurips Dataset Track'2021. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + AP-10K keypoint indexes:: + + 0: 'L_Eye', + 1: 'R_Eye', + 2: 'Nose', + 3: 'Neck', + 4: 'root of tail', + 5: 'L_Shoulder', + 6: 'L_Elbow', + 7: 'L_F_Paw', + 8: 'R_Shoulder', + 9: 'R_Elbow', + 10: 'R_F_Paw, + 11: 'L_Hip', + 12: 'L_Knee', + 13: 'L_B_Paw', + 14: 'R_Hip', + 15: 'R_Knee', + 16: 'R_B_Paw' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/ap10k.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.ann_info['use_different_joint_weights'] = False + self.db, self.id2Cat = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db, id2Cat = self._load_coco_keypoint_annotations() + return gt_db, id2Cat + + def _load_coco_keypoint_annotations(self): + """Ground truth bbox and keypoints.""" + gt_db, id2Cat = [], dict() + for img_id in self.img_ids: + db_tmp, id2Cat_tmp = self._load_coco_keypoint_annotation_kernel( + img_id) + gt_db.extend(db_tmp) + id2Cat.update({img_id: id2Cat_tmp}) + return gt_db, id2Cat + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. 
+ + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = self.ann_info['num_joints'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + id2Cat = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'bbox': obj['clean_bbox'][:4], + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + category = obj['category_id'] + id2Cat.append({ + 'image_file': image_file, + 'bbox_id': bbox_id, + 'category': category, + }) + bbox_id = bbox_id + 1 + + return rec, id2Cat + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mAP', **kwargs): + """Evaluate coco keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['data/coco/val2017\ + /000000393226.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap + - bbox_id (list(int)). + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'mAP'. + + Returns: + dict: Evaluation results for evaluation metric. 
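+
+        Example (illustrative sketch; assumes ``dataset`` is an already
+        constructed ``AnimalAP10KDataset`` and ``results`` is a list of
+        prediction dicts in the format described above)::
+
+            >>> name_value = dataset.evaluate(results, metric='mAP')
+            >>> print(name_value['AP'])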
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['mAP'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = defaultdict(list) + + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + cat = self.id2Cat[image_id][bbox_ids[i]]['category'] + kpts[image_id].append({ + 'keypoints': preds[i], + 'center': boxes[i][0:2], + 'scale': boxes[i][2:4], + 'area': boxes[i][4], + 'score': boxes[i][5], + 'image_id': image_id, + 'bbox_id': bbox_ids[i], + 'category': cat + }) + kpts = self._sort_and_unique_bboxes(kpts) + + # rescoring and oks nms + num_joints = self.ann_info['num_joints'] + vis_thr = self.vis_thr + oks_thr = self.oks_thr + valid_kpts = [] + for image_id in kpts.keys(): + img_kpts = kpts[image_id] + for n_p in img_kpts: + box_score = n_p['score'] + kpt_score = 0 + valid_num = 0 + for n_jt in range(0, num_joints): + t_s = n_p['keypoints'][n_jt][2] + if t_s > vis_thr: + kpt_score = kpt_score + t_s + valid_num = valid_num + 1 + if valid_num != 0: + kpt_score = kpt_score / valid_num + # rescoring + n_p['score'] = kpt_score * box_score + + if self.use_nms: + nms = soft_oks_nms if self.soft_nms else oks_nms + keep = nms(list(img_kpts), oks_thr, sigmas=self.sigmas) + valid_kpts.append([img_kpts[_keep] for _keep in keep]) + else: + valid_kpts.append(img_kpts) + + self._write_coco_keypoint_results(valid_kpts, res_file) + + info_str = self._do_python_keypoint_eval(res_file) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + def _write_coco_keypoint_results(self, keypoints, res_file): + """Write results into a json file.""" + data_pack = [{ + 'cat_id': self._class_to_coco_ind[cls], + 'cls_ind': cls_ind, + 'cls': cls, + 'ann_type': 'keypoints', + 'keypoints': keypoints + } for cls_ind, cls in enumerate(self.classes) + if not cls == '__background__'] + + results = self._coco_keypoint_results_one_category_kernel(data_pack[0]) + + with open(res_file, 'w') as f: + json.dump(results, f, sort_keys=True, indent=4) + + def _coco_keypoint_results_one_category_kernel(self, data_pack): + """Get coco keypoint results.""" + keypoints = data_pack['keypoints'] + cat_results = [] + + for img_kpts in keypoints: + if len(img_kpts) == 0: + continue + + _key_points = np.array( + [img_kpt['keypoints'] for img_kpt in img_kpts]) + key_points = _key_points.reshape(-1, + self.ann_info['num_joints'] * 3) + + result = [{ + 'image_id': img_kpt['image_id'], + 'category_id': img_kpt['category'], + 'keypoints': key_point.tolist(), + 'score': float(img_kpt['score']), + 'center': img_kpt['center'].tolist(), + 'scale': img_kpt['scale'].tolist() + } for img_kpt, key_point in zip(img_kpts, key_points)] + + cat_results.extend(result) + + return cat_results + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval(self.coco, coco_det, 'keypoints', self.sigmas) + coco_eval.params.useSegm = None + coco_eval.evaluate() + 
coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + for img_id, persons in kpts.items(): + num = len(persons) + kpts[img_id] = sorted(kpts[img_id], key=lambda x: x[key]) + for i in range(num - 1, 0, -1): + if kpts[img_id][i][key] == kpts[img_id][i - 1][key]: + del kpts[img_id][i] + + return kpts diff --git a/mmpose/datasets/datasets/animal/animal_atrw_dataset.py b/mmpose/datasets/datasets/animal/animal_atrw_dataset.py new file mode 100644 index 0000000..edfd3f9 --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_atrw_dataset.py @@ -0,0 +1,353 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning +from xtcocotools.cocoeval import COCOeval + +from ....core.post_processing import oks_nms, soft_oks_nms +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class AnimalATRWDataset(Kpt2dSviewRgbImgTopDownDataset): + """ATRW dataset for animal pose estimation. + + "ATRW: A Benchmark for Amur Tiger Re-identification in the Wild" + ACM MM'2020. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + ATRW keypoint indexes:: + + 0: "left_ear", + 1: "right_ear", + 2: "nose", + 3: "right_shoulder", + 4: "right_front_paw", + 5: "left_shoulder", + 6: "left_front_paw", + 7: "right_hip", + 8: "right_knee", + 9: "right_back_paw", + 10: "left_hip", + 11: "left_knee", + 12: "left_back_paw", + 13: "tail", + 14: "center" + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/atrw.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db = self._load_coco_keypoint_annotations() + return gt_db + + def _load_coco_keypoint_annotations(self): + """Ground truth bbox and keypoints.""" + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. + + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = self.ann_info['num_joints'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4], padding=1.0) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'bbox': obj['clean_bbox'][:4], + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + + return rec + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mAP', **kwargs): + """Evaluate coco keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. 
+ - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['data/coco/val2017\ + /000000393226.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap + - bbox_id (list(int)). + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'mAP'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['mAP'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = defaultdict(list) + + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + kpts[image_id].append({ + 'keypoints': preds[i], + 'center': boxes[i][0:2], + 'scale': boxes[i][2:4], + 'area': boxes[i][4], + 'score': boxes[i][5], + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + # rescoring and oks nms + num_joints = self.ann_info['num_joints'] + vis_thr = self.vis_thr + oks_thr = self.oks_thr + valid_kpts = [] + for image_id in kpts.keys(): + img_kpts = kpts[image_id] + for n_p in img_kpts: + box_score = n_p['score'] + kpt_score = 0 + valid_num = 0 + for n_jt in range(0, num_joints): + t_s = n_p['keypoints'][n_jt][2] + if t_s > vis_thr: + kpt_score = kpt_score + t_s + valid_num = valid_num + 1 + if valid_num != 0: + kpt_score = kpt_score / valid_num + # rescoring + n_p['score'] = kpt_score * box_score + + if self.use_nms: + nms = soft_oks_nms if self.soft_nms else oks_nms + keep = nms(list(img_kpts), oks_thr, sigmas=self.sigmas) + valid_kpts.append([img_kpts[_keep] for _keep in keep]) + else: + valid_kpts.append(img_kpts) + + self._write_coco_keypoint_results(valid_kpts, res_file) + + info_str = self._do_python_keypoint_eval(res_file) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + def _write_coco_keypoint_results(self, keypoints, res_file): + """Write results into a json file.""" + data_pack = [{ + 'cat_id': self._class_to_coco_ind[cls], + 'cls_ind': cls_ind, + 'cls': cls, + 'ann_type': 'keypoints', + 'keypoints': keypoints + } for cls_ind, cls in enumerate(self.classes) + if not cls == '__background__'] + + results = self._coco_keypoint_results_one_category_kernel(data_pack[0]) + + with open(res_file, 'w') as f: + json.dump(results, f, sort_keys=True, indent=4) + + def _coco_keypoint_results_one_category_kernel(self, data_pack): + """Get coco keypoint results.""" + cat_id = data_pack['cat_id'] + keypoints = data_pack['keypoints'] + cat_results = [] + + for img_kpts in keypoints: + if len(img_kpts) == 0: + continue + + _key_points = np.array( + [img_kpt['keypoints'] for img_kpt in img_kpts]) + key_points = _key_points.reshape(-1, + self.ann_info['num_joints'] * 3) + + result = [{ + 'image_id': img_kpt['image_id'], + 'category_id': cat_id, + 'keypoints': key_point.tolist(), + 'score': 
float(img_kpt['score']), + 'center': img_kpt['center'].tolist(), + 'scale': img_kpt['scale'].tolist() + } for img_kpt, key_point in zip(img_kpts, key_points)] + + cat_results.extend(result) + + return cat_results + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval(self.coco, coco_det, 'keypoints', self.sigmas) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + for img_id, persons in kpts.items(): + num = len(persons) + kpts[img_id] = sorted(kpts[img_id], key=lambda x: x[key]) + for i in range(num - 1, 0, -1): + if kpts[img_id][i][key] == kpts[img_id][i - 1][key]: + del kpts[img_id][i] + + return kpts diff --git a/mmpose/datasets/datasets/animal/animal_base_dataset.py b/mmpose/datasets/datasets/animal/animal_base_dataset.py new file mode 100644 index 0000000..e191882 --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_base_dataset.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + +from torch.utils.data import Dataset + + +class AnimalBaseDataset(Dataset, metaclass=ABCMeta): + """This class has been deprecated and replaced by + Kpt2dSviewRgbImgTopDownDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'AnimalBaseDataset has been replaced by ' + 'Kpt2dSviewRgbImgTopDownDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/663 for details.') + ) diff --git a/mmpose/datasets/datasets/animal/animal_fly_dataset.py b/mmpose/datasets/datasets/animal/animal_fly_dataset.py new file mode 100644 index 0000000..f414117 --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_fly_dataset.py @@ -0,0 +1,215 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class AnimalFlyDataset(Kpt2dSviewRgbImgTopDownDataset): + """AnimalFlyDataset for animal pose estimation. + + "Fast animal pose estimation using deep neural networks" + Nature methods'2019. More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Vinegar Fly keypoint indexes:: + + 0: "head", + 1: "eyeL", + 2: "eyeR", + 3: "neck", + 4: "thorax", + 5: "abdomen", + 6: "forelegR1", + 7: "forelegR2", + 8: "forelegR3", + 9: "forelegR4", + 10: "midlegR1", + 11: "midlegR2", + 12: "midlegR3", + 13: "midlegR4", + 14: "hindlegR1", + 15: "hindlegR2", + 16: "hindlegR3", + 17: "hindlegR4", + 18: "forelegL1", + 19: "forelegL2", + 20: "forelegL3", + 21: "forelegL4", + 22: "midlegL1", + 23: "midlegL2", + 24: "midlegL3", + 25: "midlegL4", + 26: "hindlegL1", + 27: "hindlegL2", + 28: "hindlegL3", + 29: "hindlegL4", + 30: "wingL", + 31: "wingR" + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. 
+ data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/fly.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # the ori image is 192x192 + center, scale = self._xywh2cs(0, 0, 192, 192, 0.8) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate Fly keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['Test/source/0.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + + res_folder (str): Path of directory to save the results. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
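+
+        Example (illustrative sketch; ``dataset`` is assumed to be an
+        already built ``AnimalFlyDataset`` and ``results`` a list of
+        prediction dicts as described above)::
+
+            >>> name_value = dataset.evaluate(results,
+            ...                               metric=['PCK', 'EPE'])
+            >>> print(name_value['PCK'])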
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/animal/animal_horse10_dataset.py b/mmpose/datasets/datasets/animal/animal_horse10_dataset.py new file mode 100644 index 0000000..d2bf198 --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_horse10_dataset.py @@ -0,0 +1,220 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class AnimalHorse10Dataset(Kpt2dSviewRgbImgTopDownDataset): + """AnimalHorse10Dataset for animal pose estimation. + + "Pretraining boosts out-of-domain robustness for pose estimation" + WACV'2021. More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Horse-10 keypoint indexes:: + + 0: 'Nose', + 1: 'Eye', + 2: 'Nearknee', + 3: 'Nearfrontfetlock', + 4: 'Nearfrontfoot', + 5: 'Offknee', + 6: 'Offfrontfetlock', + 7: 'Offfrontfoot', + 8: 'Shoulder', + 9: 'Midshoulder', + 10: 'Elbow', + 11: 'Girth', + 12: 'Wither', + 13: 'Nearhindhock', + 14: 'Nearhindfetlock', + 15: 'Nearhindfoot', + 16: 'Hip', + 17: 'Stifle', + 18: 'Offhindhock', + 19: 'Offhindfetlock', + 20: 'Offhindfoot', + 21: 'Ischium' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/horse10.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # use 1.25 padded bbox as input + center, scale = self._xywh2cs(*obj['bbox'][:4], 1.25) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + def _get_normalize_factor(self, gts): + """Get inter-ocular distance as the normalize factor, measured as the + Euclidean distance between the outer corners of the eyes. + + Args: + gts (np.ndarray[N, K, 2]): Groundtruth keypoint location. + + Returns: + np.ndarray[N, 2]: normalized factor + """ + + interocular = np.linalg.norm( + gts[:, 0, :] - gts[:, 1, :], axis=1, keepdims=True) + return np.tile(interocular, [1, 2]) + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate horse-10 keypoint results. The pose prediction results will + be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['Test/source/0.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'NME'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/animal/animal_locust_dataset.py b/mmpose/datasets/datasets/animal/animal_locust_dataset.py new file mode 100644 index 0000000..95fb6ac --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_locust_dataset.py @@ -0,0 +1,218 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class AnimalLocustDataset(Kpt2dSviewRgbImgTopDownDataset): + """AnimalLocustDataset for animal pose estimation. + + "DeepPoseKit, a software toolkit for fast and robust animal + pose estimation using deep learning" Elife'2019. + More details can be found in the paper. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Desert Locust keypoint indexes:: + + 0: "head", + 1: "neck", + 2: "thorax", + 3: "abdomen1", + 4: "abdomen2", + 5: "anttipL", + 6: "antbaseL", + 7: "eyeL", + 8: "forelegL1", + 9: "forelegL2", + 10: "forelegL3", + 11: "forelegL4", + 12: "midlegL1", + 13: "midlegL2", + 14: "midlegL3", + 15: "midlegL4", + 16: "hindlegL1", + 17: "hindlegL2", + 18: "hindlegL3", + 19: "hindlegL4", + 20: "anttipR", + 21: "antbaseR", + 22: "eyeR", + 23: "forelegR1", + 24: "forelegR2", + 25: "forelegR3", + 26: "forelegR4", + 27: "midlegR1", + 28: "midlegR2", + 29: "midlegR3", + 30: "midlegR4", + 31: "hindlegR1", + 32: "hindlegR2", + 33: "hindlegR3", + 34: "hindlegR4" + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/locust.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # the ori image is 160x160 + center, scale = self._xywh2cs(0, 0, 160, 160, 0.8) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate Fly keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['Test/source/0.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/animal/animal_macaque_dataset.py b/mmpose/datasets/datasets/animal/animal_macaque_dataset.py new file mode 100644 index 0000000..359feca --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_macaque_dataset.py @@ -0,0 +1,355 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning +from xtcocotools.cocoeval import COCOeval + +from ....core.post_processing import oks_nms, soft_oks_nms +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class AnimalMacaqueDataset(Kpt2dSviewRgbImgTopDownDataset): + """MacaquePose dataset for animal pose estimation. + + "MacaquePose: A novel ‘in the wild’ macaque monkey pose dataset + for markerless motion capture" bioRxiv'2020. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Macaque keypoint indexes:: + + 0: 'nose', + 1: 'left_eye', + 2: 'right_eye', + 3: 'left_ear', + 4: 'right_ear', + 5: 'left_shoulder', + 6: 'right_shoulder', + 7: 'left_elbow', + 8: 'right_elbow', + 9: 'left_wrist', + 10: 'right_wrist', + 11: 'left_hip', + 12: 'right_hip', + 13: 'left_knee', + 14: 'right_knee', + 15: 'left_ankle', + 16: 'right_ankle' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/macaque.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db = self._load_coco_keypoint_annotations() + return gt_db + + def _load_coco_keypoint_annotations(self): + """Ground truth bbox and keypoints.""" + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. + + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = self.ann_info['num_joints'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'bbox': obj['clean_bbox'][:4], + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + + return rec + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mAP', **kwargs): + """Evaluate coco keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + batch_size: N + num_keypoints: K + heatmap height: H + heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. 
+ - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['data/coco/val2017\ + /000000393226.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap + - bbox_id (list(int)). + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'mAP'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['mAP'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = defaultdict(list) + + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + kpts[image_id].append({ + 'keypoints': preds[i], + 'center': boxes[i][0:2], + 'scale': boxes[i][2:4], + 'area': boxes[i][4], + 'score': boxes[i][5], + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + # rescoring and oks nms + num_joints = self.ann_info['num_joints'] + vis_thr = self.vis_thr + oks_thr = self.oks_thr + valid_kpts = [] + for image_id in kpts.keys(): + img_kpts = kpts[image_id] + for n_p in img_kpts: + box_score = n_p['score'] + kpt_score = 0 + valid_num = 0 + for n_jt in range(0, num_joints): + t_s = n_p['keypoints'][n_jt][2] + if t_s > vis_thr: + kpt_score = kpt_score + t_s + valid_num = valid_num + 1 + if valid_num != 0: + kpt_score = kpt_score / valid_num + # rescoring + n_p['score'] = kpt_score * box_score + + if self.use_nms: + nms = soft_oks_nms if self.soft_nms else oks_nms + keep = nms(list(img_kpts), oks_thr, sigmas=self.sigmas) + valid_kpts.append([img_kpts[_keep] for _keep in keep]) + else: + valid_kpts.append(img_kpts) + + self._write_coco_keypoint_results(valid_kpts, res_file) + + info_str = self._do_python_keypoint_eval(res_file) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + def _write_coco_keypoint_results(self, keypoints, res_file): + """Write results into a json file.""" + data_pack = [{ + 'cat_id': self._class_to_coco_ind[cls], + 'cls_ind': cls_ind, + 'cls': cls, + 'ann_type': 'keypoints', + 'keypoints': keypoints + } for cls_ind, cls in enumerate(self.classes) + if not cls == '__background__'] + + results = self._coco_keypoint_results_one_category_kernel(data_pack[0]) + + with open(res_file, 'w') as f: + json.dump(results, f, sort_keys=True, indent=4) + + def _coco_keypoint_results_one_category_kernel(self, data_pack): + """Get coco keypoint results.""" + cat_id = data_pack['cat_id'] + keypoints = data_pack['keypoints'] + cat_results = [] + + for img_kpts in keypoints: + if len(img_kpts) == 0: + continue + + _key_points = np.array( + [img_kpt['keypoints'] for img_kpt in img_kpts]) + key_points = _key_points.reshape(-1, + self.ann_info['num_joints'] * 3) + + result = [{ + 'image_id': img_kpt['image_id'], + 'category_id': cat_id, + 'keypoints': key_point.tolist(), + 'score': 
float(img_kpt['score']), + 'center': img_kpt['center'].tolist(), + 'scale': img_kpt['scale'].tolist() + } for img_kpt, key_point in zip(img_kpts, key_points)] + + cat_results.extend(result) + + return cat_results + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval(self.coco, coco_det, 'keypoints', self.sigmas) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + for img_id, persons in kpts.items(): + num = len(persons) + kpts[img_id] = sorted(kpts[img_id], key=lambda x: x[key]) + for i in range(num - 1, 0, -1): + if kpts[img_id][i][key] == kpts[img_id][i - 1][key]: + del kpts[img_id][i] + + return kpts diff --git a/mmpose/datasets/datasets/animal/animal_pose_dataset.py b/mmpose/datasets/datasets/animal/animal_pose_dataset.py new file mode 100644 index 0000000..4ced570 --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_pose_dataset.py @@ -0,0 +1,359 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning +from xtcocotools.cocoeval import COCOeval + +from ....core.post_processing import oks_nms, soft_oks_nms +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class AnimalPoseDataset(Kpt2dSviewRgbImgTopDownDataset): + """Animal-Pose dataset for animal pose estimation. + + "Cross-domain Adaptation For Animal Pose Estimation" ICCV'2019 + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Animal-Pose keypoint indexes:: + + 0: 'L_Eye', + 1: 'R_Eye', + 2: 'L_EarBase', + 3: 'R_EarBase', + 4: 'Nose', + 5: 'Throat', + 6: 'TailBase', + 7: 'Withers', + 8: 'L_F_Elbow', + 9: 'R_F_Elbow', + 10: 'L_B_Elbow', + 11: 'R_B_Elbow', + 12: 'L_F_Knee', + 13: 'R_F_Knee', + 14: 'L_B_Knee', + 15: 'R_B_Knee', + 16: 'L_F_Paw', + 17: 'R_F_Paw', + 18: 'L_B_Paw', + 19: 'R_B_Paw' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/animalpose.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db = self._load_coco_keypoint_annotations() + return gt_db + + def _load_coco_keypoint_annotations(self): + """Ground truth bbox and keypoints.""" + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. + + Note: + bbox:[x1, y1, w, h] + + Args: + img_id: coco image id + + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = self.ann_info['num_joints'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'bbox': obj['clean_bbox'][:4], + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + + return rec + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mAP', **kwargs): + """Evaluate coco keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. 
+ - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['data/coco/val2017\ + /000000393226.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap + - bbox_id (list(int)). + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'mAP'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['mAP'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = defaultdict(list) + + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + kpts[image_id].append({ + 'keypoints': preds[i], + 'center': boxes[i][0:2], + 'scale': boxes[i][2:4], + 'area': boxes[i][4], + 'score': boxes[i][5], + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + # rescoring and oks nms + num_joints = self.ann_info['num_joints'] + vis_thr = self.vis_thr + oks_thr = self.oks_thr + valid_kpts = [] + for image_id in kpts.keys(): + img_kpts = kpts[image_id] + for n_p in img_kpts: + box_score = n_p['score'] + kpt_score = 0 + valid_num = 0 + for n_jt in range(0, num_joints): + t_s = n_p['keypoints'][n_jt][2] + if t_s > vis_thr: + kpt_score = kpt_score + t_s + valid_num = valid_num + 1 + if valid_num != 0: + kpt_score = kpt_score / valid_num + # rescoring + n_p['score'] = kpt_score * box_score + + if self.use_nms: + nms = soft_oks_nms if self.soft_nms else oks_nms + keep = nms(list(img_kpts), oks_thr, sigmas=self.sigmas) + valid_kpts.append([img_kpts[_keep] for _keep in keep]) + else: + valid_kpts.append(img_kpts) + + self._write_coco_keypoint_results(valid_kpts, res_file) + + info_str = self._do_python_keypoint_eval(res_file) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + def _write_coco_keypoint_results(self, keypoints, res_file): + """Write results into a json file.""" + data_pack = [{ + 'cat_id': self._class_to_coco_ind[cls], + 'cls_ind': cls_ind, + 'cls': cls, + 'ann_type': 'keypoints', + 'keypoints': keypoints + } for cls_ind, cls in enumerate(self.classes) + if not cls == '__background__'] + + results = self._coco_keypoint_results_one_category_kernel(data_pack[0]) + + with open(res_file, 'w') as f: + json.dump(results, f, sort_keys=True, indent=4) + + def _coco_keypoint_results_one_category_kernel(self, data_pack): + """Get coco keypoint results.""" + cat_id = data_pack['cat_id'] + keypoints = data_pack['keypoints'] + cat_results = [] + + for img_kpts in keypoints: + if len(img_kpts) == 0: + continue + + _key_points = np.array( + [img_kpt['keypoints'] for img_kpt in img_kpts]) + key_points = _key_points.reshape(-1, + self.ann_info['num_joints'] * 3) + + result = [{ + 'image_id': img_kpt['image_id'], + 'category_id': cat_id, + 'keypoints': key_point.tolist(), + 'score': 
float(img_kpt['score']), + 'center': img_kpt['center'].tolist(), + 'scale': img_kpt['scale'].tolist() + } for img_kpt, key_point in zip(img_kpts, key_points)] + + cat_results.extend(result) + + return cat_results + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval(self.coco, coco_det, 'keypoints', self.sigmas) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + for img_id, persons in kpts.items(): + num = len(persons) + kpts[img_id] = sorted(kpts[img_id], key=lambda x: x[key]) + for i in range(num - 1, 0, -1): + if kpts[img_id][i][key] == kpts[img_id][i - 1][key]: + del kpts[img_id][i] + + return kpts diff --git a/mmpose/datasets/datasets/animal/animal_zebra_dataset.py b/mmpose/datasets/datasets/animal/animal_zebra_dataset.py new file mode 100644 index 0000000..9c5e3b7 --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_zebra_dataset.py @@ -0,0 +1,193 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class AnimalZebraDataset(Kpt2dSviewRgbImgTopDownDataset): + """AnimalZebraDataset for animal pose estimation. + + "DeepPoseKit, a software toolkit for fast and robust animal + pose estimation using deep learning" Elife'2019. + More details can be found in the paper. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Desert Locust keypoint indexes:: + + 0: "snout", + 1: "head", + 2: "neck", + 3: "forelegL1", + 4: "forelegR1", + 5: "hindlegL1", + 6: "hindlegR1", + 7: "tailbase", + 8: "tailtip" + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/zebra.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # the ori image is 160x160 + center, scale = self._xywh2cs(0, 0, 160, 160, 0.8) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate Fly keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['Test/source/0.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/base/__init__.py b/mmpose/datasets/datasets/base/__init__.py new file mode 100644 index 0000000..e5f9a08 --- /dev/null +++ b/mmpose/datasets/datasets/base/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .kpt_2d_sview_rgb_img_bottom_up_dataset import \ + Kpt2dSviewRgbImgBottomUpDataset +from .kpt_2d_sview_rgb_img_top_down_dataset import \ + Kpt2dSviewRgbImgTopDownDataset +from .kpt_2d_sview_rgb_vid_top_down_dataset import \ + Kpt2dSviewRgbVidTopDownDataset +from .kpt_3d_mview_rgb_img_direct_dataset import Kpt3dMviewRgbImgDirectDataset +from .kpt_3d_sview_kpt_2d_dataset import Kpt3dSviewKpt2dDataset +from .kpt_3d_sview_rgb_img_top_down_dataset import \ + Kpt3dSviewRgbImgTopDownDataset + +__all__ = [ + 'Kpt3dMviewRgbImgDirectDataset', 'Kpt2dSviewRgbImgTopDownDataset', + 'Kpt3dSviewRgbImgTopDownDataset', 'Kpt2dSviewRgbImgBottomUpDataset', + 'Kpt3dSviewKpt2dDataset', 'Kpt2dSviewRgbVidTopDownDataset' +] diff --git a/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_bottom_up_dataset.py b/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_bottom_up_dataset.py new file mode 100644 index 0000000..9930621 --- /dev/null +++ b/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_bottom_up_dataset.py @@ -0,0 +1,188 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from abc import ABCMeta, abstractmethod + +import numpy as np +import xtcocotools +from torch.utils.data import Dataset +from xtcocotools.coco import COCO + +from mmpose.datasets import DatasetInfo +from mmpose.datasets.pipelines import Compose + + +class Kpt2dSviewRgbImgBottomUpDataset(Dataset, metaclass=ABCMeta): + """Base class for bottom-up datasets. + + All datasets should subclass it. + All subclasses should overwrite: + Methods:`_get_single` + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + coco_style (bool): Whether the annotation json is coco-style. + Default: True + test_mode (bool): Store True when building test or + validation dataset. Default: False. 
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + coco_style=True, + test_mode=False): + + self.image_info = {} + self.ann_info = {} + + self.ann_file = ann_file + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + + # bottom-up + self.base_size = data_cfg['base_size'] + self.base_sigma = data_cfg['base_sigma'] + self.int_sigma = False + + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + self.ann_info['num_scales'] = data_cfg['num_scales'] + self.ann_info['scale_aware_sigma'] = data_cfg['scale_aware_sigma'] + + self.ann_info['inference_channel'] = data_cfg['inference_channel'] + self.ann_info['dataset_channel'] = data_cfg['dataset_channel'] + + self.use_nms = data_cfg.get('use_nms', False) + self.soft_nms = data_cfg.get('soft_nms', True) + self.oks_thr = data_cfg.get('oks_thr', 0.9) + + if dataset_info is None: + raise ValueError( + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.') + + dataset_info = DatasetInfo(dataset_info) + + assert self.ann_info['num_joints'] == dataset_info.keypoint_num + self.ann_info['flip_pairs'] = dataset_info.flip_pairs + self.ann_info['flip_index'] = dataset_info.flip_index + self.ann_info['upper_body_ids'] = dataset_info.upper_body_ids + self.ann_info['lower_body_ids'] = dataset_info.lower_body_ids + self.ann_info['joint_weights'] = dataset_info.joint_weights + self.ann_info['skeleton'] = dataset_info.skeleton + self.sigmas = dataset_info.sigmas + self.dataset_name = dataset_info.dataset_name + + if coco_style: + self.coco = COCO(ann_file) + if 'categories' in self.coco.dataset: + cats = [ + cat['name'] + for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + self.classes = ['__background__'] + cats + self.num_classes = len(self.classes) + self._class_to_ind = dict( + zip(self.classes, range(self.num_classes))) + self._class_to_coco_ind = dict( + zip(cats, self.coco.getCatIds())) + self._coco_ind_to_class_ind = dict( + (self._class_to_coco_ind[cls], self._class_to_ind[cls]) + for cls in self.classes[1:]) + self.img_ids = self.coco.getImgIds() + if not test_mode: + self.img_ids = [ + img_id for img_id in self.img_ids if + len(self.coco.getAnnIds(imgIds=img_id, iscrowd=None)) > 0 + ] + self.num_images = len(self.img_ids) + self.id2name, self.name2id = self._get_mapping_id_name( + self.coco.imgs) + + self.pipeline = Compose(self.pipeline) + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. 
+ """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + def _get_mask(self, anno, idx): + """Get ignore masks to mask out losses.""" + coco = self.coco + img_info = coco.loadImgs(self.img_ids[idx])[0] + + m = np.zeros((img_info['height'], img_info['width']), dtype=np.float32) + + for obj in anno: + if 'segmentation' in obj: + if obj['iscrowd']: + rle = xtcocotools.mask.frPyObjects(obj['segmentation'], + img_info['height'], + img_info['width']) + m += xtcocotools.mask.decode(rle) + elif obj['num_keypoints'] == 0: + rles = xtcocotools.mask.frPyObjects( + obj['segmentation'], img_info['height'], + img_info['width']) + for rle in rles: + m += xtcocotools.mask.decode(rle) + + return m < 0.5 + + @abstractmethod + def _get_single(self, idx): + """Get anno for a single image.""" + raise NotImplementedError + + @abstractmethod + def evaluate(self, results, *args, **kwargs): + """Evaluate keypoint results.""" + + def prepare_train_img(self, idx): + """Prepare image for training given the index.""" + results = copy.deepcopy(self._get_single(idx)) + results['ann_info'] = self.ann_info + return self.pipeline(results) + + def prepare_test_img(self, idx): + """Prepare image for testing given the index.""" + results = copy.deepcopy(self._get_single(idx)) + results['ann_info'] = self.ann_info + return self.pipeline(results) + + def __len__(self): + """Get dataset length.""" + return len(self.img_ids) + + def __getitem__(self, idx): + """Get the sample for either training or testing given index.""" + if self.test_mode: + return self.prepare_test_img(idx) + + return self.prepare_train_img(idx) diff --git a/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py b/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py new file mode 100644 index 0000000..fb281f1 --- /dev/null +++ b/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py @@ -0,0 +1,287 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from abc import ABCMeta, abstractmethod + +import json_tricks as json +import numpy as np +from torch.utils.data import Dataset +from xtcocotools.coco import COCO + +from mmpose.core.evaluation.top_down_eval import (keypoint_auc, keypoint_epe, + keypoint_nme, + keypoint_pck_accuracy) +from mmpose.datasets import DatasetInfo +from mmpose.datasets.pipelines import Compose + + +class Kpt2dSviewRgbImgTopDownDataset(Dataset, metaclass=ABCMeta): + """Base class for keypoint 2D top-down pose estimation with single-view RGB + image as the input. + + All fashion datasets should subclass it. + All subclasses should overwrite: + Methods:`_get_db`, 'evaluate' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + coco_style (bool): Whether the annotation json is coco-style. + Default: True + test_mode (bool): Store True when building test or + validation dataset. Default: False. 
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + coco_style=True, + test_mode=False): + + self.image_info = {} + self.ann_info = {} + + self.ann_file = ann_file + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + + self.ann_info['inference_channel'] = data_cfg['inference_channel'] + self.ann_info['num_output_channels'] = data_cfg['num_output_channels'] + self.ann_info['dataset_channel'] = data_cfg['dataset_channel'] + + self.ann_info['max_num_joints'] = data_cfg.get('max_num_joints', None) + self.ann_info['dataset_idx'] = data_cfg.get('dataset_idx', 0) + + self.ann_info['use_different_joint_weights'] = data_cfg.get( + 'use_different_joint_weights', False) + + if dataset_info is None: + raise ValueError( + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.') + + dataset_info = DatasetInfo(dataset_info) + + assert self.ann_info['num_joints'] == dataset_info.keypoint_num + self.ann_info['flip_pairs'] = dataset_info.flip_pairs + self.ann_info['flip_index'] = dataset_info.flip_index + self.ann_info['upper_body_ids'] = dataset_info.upper_body_ids + self.ann_info['lower_body_ids'] = dataset_info.lower_body_ids + self.ann_info['joint_weights'] = dataset_info.joint_weights + self.ann_info['skeleton'] = dataset_info.skeleton + self.sigmas = dataset_info.sigmas + self.dataset_name = dataset_info.dataset_name + + if coco_style: + self.coco = COCO(ann_file) + if 'categories' in self.coco.dataset: + cats = [ + cat['name'] + for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + self.classes = ['__background__'] + cats + self.num_classes = len(self.classes) + self._class_to_ind = dict( + zip(self.classes, range(self.num_classes))) + self._class_to_coco_ind = dict( + zip(cats, self.coco.getCatIds())) + self._coco_ind_to_class_ind = dict( + (self._class_to_coco_ind[cls], self._class_to_ind[cls]) + for cls in self.classes[1:]) + self.img_ids = self.coco.getImgIds() + self.num_images = len(self.img_ids) + self.id2name, self.name2id = self._get_mapping_id_name( + self.coco.imgs) + + self.db = [] + + self.pipeline = Compose(self.pipeline) + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + def _xywh2cs(self, x, y, w, h, padding=1.25): + """This encodes bbox(x,y,w,h) into (center, scale) + + Args: + x, y, w, h (float): left, top, width and height + padding (float): bounding box padding factor + + Returns: + center (np.ndarray[float32](2,)): center of the bbox (x, y). + scale (np.ndarray[float32](2,)): scale of the bbox w & h. 
+ """ + aspect_ratio = self.ann_info['image_size'][0] / self.ann_info[ + 'image_size'][1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + + if (not self.test_mode) and np.random.rand() < 0.3: + center += 0.4 * (np.random.rand(2) - 0.5) * [w, h] + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + # padding to include proper amount of context + scale = scale * padding + + return center, scale + + def _get_normalize_factor(self, gts, *args, **kwargs): + """Get the normalize factor. generally inter-ocular distance measured + as the Euclidean distance between the outer corners of the eyes is + used. This function should be overrode, to measure NME. + + Args: + gts (np.ndarray[N, K, 2]): Groundtruth keypoint location. + + Returns: + np.ndarray[N, 2]: normalized factor + """ + return np.ones([gts.shape[0], 2], dtype=np.float32) + + @abstractmethod + def _get_db(self): + """Load dataset.""" + raise NotImplementedError + + @abstractmethod + def evaluate(self, results, *args, **kwargs): + """Evaluate keypoint results.""" + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def _report_metric(self, + res_file, + metrics, + pck_thr=0.2, + pckh_thr=0.7, + auc_nor=30): + """Keypoint evaluation. + + Args: + res_file (str): Json file stored prediction results. + metrics (str | list[str]): Metric to be performed. + Options: 'PCK', 'PCKh', 'AUC', 'EPE', 'NME'. + pck_thr (float): PCK threshold, default as 0.2. + pckh_thr (float): PCKh threshold, default as 0.7. + auc_nor (float): AUC normalization factor, default as 30 pixel. + + Returns: + List: Evaluation results for evaluation metric. 
+ """ + info_str = [] + + with open(res_file, 'r') as fin: + preds = json.load(fin) + assert len(preds) == len(self.db) + + outputs = [] + gts = [] + masks = [] + box_sizes = [] + threshold_bbox = [] + threshold_head_box = [] + + for pred, item in zip(preds, self.db): + outputs.append(np.array(pred['keypoints'])[:, :-1]) + gts.append(np.array(item['joints_3d'])[:, :-1]) + masks.append((np.array(item['joints_3d_visible'])[:, 0]) > 0) + if 'PCK' in metrics: + bbox = np.array(item['bbox']) + bbox_thr = np.max(bbox[2:]) + threshold_bbox.append(np.array([bbox_thr, bbox_thr])) + if 'PCKh' in metrics: + head_box_thr = item['head_size'] + threshold_head_box.append( + np.array([head_box_thr, head_box_thr])) + box_sizes.append(item.get('box_size', 1)) + + outputs = np.array(outputs) + gts = np.array(gts) + masks = np.array(masks) + threshold_bbox = np.array(threshold_bbox) + threshold_head_box = np.array(threshold_head_box) + box_sizes = np.array(box_sizes).reshape([-1, 1]) + + if 'PCK' in metrics: + _, pck, _ = keypoint_pck_accuracy(outputs, gts, masks, pck_thr, + threshold_bbox) + info_str.append(('PCK', pck)) + + if 'PCKh' in metrics: + _, pckh, _ = keypoint_pck_accuracy(outputs, gts, masks, pckh_thr, + threshold_head_box) + info_str.append(('PCKh', pckh)) + + if 'AUC' in metrics: + info_str.append(('AUC', keypoint_auc(outputs, gts, masks, + auc_nor))) + + if 'EPE' in metrics: + info_str.append(('EPE', keypoint_epe(outputs, gts, masks))) + + if 'NME' in metrics: + normalize_factor = self._get_normalize_factor( + gts=gts, box_sizes=box_sizes) + info_str.append( + ('NME', keypoint_nme(outputs, gts, masks, normalize_factor))) + + return info_str + + def __len__(self): + """Get the size of the dataset.""" + return len(self.db) + + def __getitem__(self, idx): + """Get the sample given index.""" + results = copy.deepcopy(self.db[idx]) + results['ann_info'] = self.ann_info + return self.pipeline(results) + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts diff --git a/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_vid_top_down_dataset.py b/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_vid_top_down_dataset.py new file mode 100644 index 0000000..e529270 --- /dev/null +++ b/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_vid_top_down_dataset.py @@ -0,0 +1,200 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from abc import ABCMeta, abstractmethod + +import numpy as np +from torch.utils.data import Dataset +from xtcocotools.coco import COCO + +from mmpose.datasets import DatasetInfo +from mmpose.datasets.pipelines import Compose + + +class Kpt2dSviewRgbVidTopDownDataset(Dataset, metaclass=ABCMeta): + """Base class for keypoint 2D top-down pose estimation with single-view RGB + video as the input. + + All fashion datasets should subclass it. + All subclasses should overwrite: + Methods:`_get_db`, 'evaluate' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where videos/images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + coco_style (bool): Whether the annotation json is coco-style. + Default: True + test_mode (bool): Store True when building test or + validation dataset. Default: False. 
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + coco_style=True, + test_mode=False): + + self.image_info = {} + self.ann_info = {} + + self.ann_file = ann_file + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + + self.ann_info['inference_channel'] = data_cfg['inference_channel'] + self.ann_info['num_output_channels'] = data_cfg['num_output_channels'] + self.ann_info['dataset_channel'] = data_cfg['dataset_channel'] + + self.ann_info['use_different_joint_weights'] = data_cfg.get( + 'use_different_joint_weights', False) + + if dataset_info is None: + raise ValueError( + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.') + + dataset_info = DatasetInfo(dataset_info) + + assert self.ann_info['num_joints'] == dataset_info.keypoint_num + self.ann_info['flip_pairs'] = dataset_info.flip_pairs + self.ann_info['flip_index'] = dataset_info.flip_index + self.ann_info['upper_body_ids'] = dataset_info.upper_body_ids + self.ann_info['lower_body_ids'] = dataset_info.lower_body_ids + self.ann_info['joint_weights'] = dataset_info.joint_weights + self.ann_info['skeleton'] = dataset_info.skeleton + self.sigmas = dataset_info.sigmas + self.dataset_name = dataset_info.dataset_name + + if coco_style: + self.coco = COCO(ann_file) + if 'categories' in self.coco.dataset: + cats = [ + cat['name'] + for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + self.classes = ['__background__'] + cats + self.num_classes = len(self.classes) + self._class_to_ind = dict( + zip(self.classes, range(self.num_classes))) + self._class_to_coco_ind = dict( + zip(cats, self.coco.getCatIds())) + self._coco_ind_to_class_ind = dict( + (self._class_to_coco_ind[cls], self._class_to_ind[cls]) + for cls in self.classes[1:]) + self.img_ids = self.coco.getImgIds() + self.num_images = len(self.img_ids) + self.id2name, self.name2id = self._get_mapping_id_name( + self.coco.imgs) + + self.db = [] + + self.pipeline = Compose(self.pipeline) + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + def _xywh2cs(self, x, y, w, h, padding=1.25): + """This encodes bbox(x,y,w,h) into (center, scale) + + Args: + x, y, w, h (float): left, top, width and height + padding (float): bounding box padding factor + + Returns: + center (np.ndarray[float32](2,)): center of the bbox (x, y). + scale (np.ndarray[float32](2,)): scale of the bbox w & h. 
+ """ + aspect_ratio = self.ann_info['image_size'][0] / self.ann_info[ + 'image_size'][1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + + if (not self.test_mode) and np.random.rand() < 0.3: + center += 0.4 * (np.random.rand(2) - 0.5) * [w, h] + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + # padding to include proper amount of context + scale = scale * padding + + return center, scale + + @abstractmethod + def _get_db(self): + """Load dataset.""" + + @abstractmethod + def evaluate(self, results, *args, **kwargs): + """Evaluate keypoint results.""" + + @staticmethod + @abstractmethod + def _write_keypoint_results(keypoint_results, gt_folder, pred_folder): + """Write results into a json file.""" + + @abstractmethod + def _do_keypoint_eval(self, gt_folder, pred_folder): + """Keypoint evaluation. + Args: + gt_folder (str): The folder of the json files storing + ground truth keypoint annotations. + pred_folder (str): The folder of the json files storing + prediction results. + + Returns: + List: Evaluation results for evaluation metric. + """ + + def __len__(self): + """Get the size of the dataset.""" + return len(self.db) + + def __getitem__(self, idx): + """Get the sample given index.""" + results = copy.deepcopy(self.db[idx]) + results['ann_info'] = self.ann_info + return self.pipeline(results) + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + for img_id, persons in kpts.items(): + num = len(persons) + kpts[img_id] = sorted(kpts[img_id], key=lambda x: x[key]) + for i in range(num - 1, 0, -1): + if kpts[img_id][i][key] == kpts[img_id][i - 1][key]: + del kpts[img_id][i] + + return kpts diff --git a/mmpose/datasets/datasets/base/kpt_3d_mview_rgb_img_direct_dataset.py b/mmpose/datasets/datasets/base/kpt_3d_mview_rgb_img_direct_dataset.py new file mode 100644 index 0000000..94cc1c2 --- /dev/null +++ b/mmpose/datasets/datasets/base/kpt_3d_mview_rgb_img_direct_dataset.py @@ -0,0 +1,143 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from abc import ABCMeta, abstractmethod + +import json_tricks as json +import numpy as np +from torch.utils.data import Dataset + +from mmpose.datasets import DatasetInfo +from mmpose.datasets.pipelines import Compose + + +class Kpt3dMviewRgbImgDirectDataset(Dataset, metaclass=ABCMeta): + """Base class for keypoint 3D top-down pose estimation with multi-view RGB + images as the input. + + All subclasses should overwrite: + Methods:`_get_db`, 'evaluate' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. 
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + self.image_info = {} + self.ann_info = {} + + self.ann_file = ann_file + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + + self.ann_info['space_size'] = data_cfg['space_size'] + self.ann_info['space_center'] = data_cfg['space_center'] + self.ann_info['cube_size'] = data_cfg['cube_size'] + self.ann_info['scale_aware_sigma'] = data_cfg.get( + 'scale_aware_sigma', False) + + if dataset_info is None: + raise ValueError( + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.') + + dataset_info = DatasetInfo(dataset_info) + + assert self.ann_info['num_joints'] <= dataset_info.keypoint_num + self.ann_info['flip_pairs'] = dataset_info.flip_pairs + self.ann_info['num_scales'] = 1 + self.ann_info['flip_index'] = dataset_info.flip_index + self.ann_info['upper_body_ids'] = dataset_info.upper_body_ids + self.ann_info['lower_body_ids'] = dataset_info.lower_body_ids + self.ann_info['joint_weights'] = dataset_info.joint_weights + self.ann_info['skeleton'] = dataset_info.skeleton + self.sigmas = dataset_info.sigmas + self.dataset_name = dataset_info.dataset_name + + self.load_config(data_cfg) + + self.db = [] + + self.pipeline = Compose(self.pipeline) + + def load_config(self, data_cfg): + """Initialize dataset attributes according to the config. + + Override this method to set dataset specific attributes. + """ + self.num_joints = data_cfg['num_joints'] + self.num_cameras = data_cfg['num_cameras'] + self.seq_frame_interval = data_cfg.get('seq_frame_interval', 1) + self.subset = data_cfg.get('subset', 'train') + self.need_2d_label = data_cfg.get('need_2d_label', False) + self.need_camera_param = True + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + @abstractmethod + def _get_db(self): + """Load dataset.""" + raise NotImplementedError + + @abstractmethod + def evaluate(self, results, *args, **kwargs): + """Evaluate keypoint results.""" + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def __len__(self): + """Get the size of the dataset.""" + return len(self.db) // self.num_cameras + + def __getitem__(self, idx): + """Get the sample given index.""" + results = {} + # return self.pipeline(results) + for c in range(self.num_cameras): + result = copy.deepcopy(self.db[self.num_cameras * idx + c]) + result['ann_info'] = self.ann_info + results[c] = result + + return self.pipeline(results) diff --git a/mmpose/datasets/datasets/base/kpt_3d_sview_kpt_2d_dataset.py b/mmpose/datasets/datasets/base/kpt_3d_sview_kpt_2d_dataset.py new file mode 100644 index 0000000..dbdb998 --- /dev/null +++ b/mmpose/datasets/datasets/base/kpt_3d_sview_kpt_2d_dataset.py @@ -0,0 +1,226 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import copy +from abc import ABCMeta, abstractmethod + +import numpy as np +from torch.utils.data import Dataset + +from mmpose.datasets import DatasetInfo +from mmpose.datasets.pipelines import Compose + + +class Kpt3dSviewKpt2dDataset(Dataset, metaclass=ABCMeta): + """Base class for 3D human pose datasets. + + Subclasses should consider overwriting following methods: + - load_config + - load_annotations + - build_sample_indices + - evaluate + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + - num_joints: Number of joints. + - seq_len: Number of frames in a sequence. Default: 1. + - seq_frame_interval: Extract frames from the video at certain + intervals. Default: 1. + - causal: If set to True, the rightmost input frame will be the + target frame. Otherwise, the middle input frame will be the + target frame. Default: True. + - temporal_padding: Whether to pad the video so that poses will be + predicted for every frame in the video. Default: False + - subset: Reduce dataset size by fraction. Default: 1. + - need_2d_label: Whether need 2D joint labels or not. + Default: False. + + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + self.ann_file = ann_file + self.img_prefix = img_prefix + self.data_cfg = copy.deepcopy(data_cfg) + self.pipeline = pipeline + self.test_mode = test_mode + self.ann_info = {} + + if dataset_info is None: + raise ValueError( + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.') + + dataset_info = DatasetInfo(dataset_info) + + self.load_config(self.data_cfg) + + self.ann_info['num_joints'] = data_cfg['num_joints'] + assert self.ann_info['num_joints'] == dataset_info.keypoint_num + self.ann_info['flip_pairs'] = dataset_info.flip_pairs + self.ann_info['upper_body_ids'] = dataset_info.upper_body_ids + self.ann_info['lower_body_ids'] = dataset_info.lower_body_ids + self.ann_info['joint_weights'] = dataset_info.joint_weights + self.ann_info['skeleton'] = dataset_info.skeleton + self.sigmas = dataset_info.sigmas + self.dataset_name = dataset_info.dataset_name + + self.data_info = self.load_annotations() + self.sample_indices = self.build_sample_indices() + self.pipeline = Compose(pipeline) + + self.name2id = { + name: i + for i, name in enumerate(self.data_info['imgnames']) + } + + def load_config(self, data_cfg): + """Initialize dataset attributes according to the config. + + Override this method to set dataset specific attributes. 
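+ The default implementation reads ``num_joints``, ``seq_len``, ``seq_frame_interval``, ``causal``, ``temporal_padding``, ``subset`` and ``need_2d_label`` from ``data_cfg`` and sets ``need_camera_param`` to False.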
+ """ + + self.num_joints = data_cfg['num_joints'] + self.seq_len = data_cfg.get('seq_len', 1) + self.seq_frame_interval = data_cfg.get('seq_frame_interval', 1) + self.causal = data_cfg.get('causal', True) + self.temporal_padding = data_cfg.get('temporal_padding', False) + self.subset = data_cfg.get('subset', 1) + self.need_2d_label = data_cfg.get('need_2d_label', False) + self.need_camera_param = False + + def load_annotations(self): + """Load data annotation.""" + data = np.load(self.ann_file) + + # get image info + _imgnames = data['imgname'] + num_imgs = len(_imgnames) + num_joints = self.ann_info['num_joints'] + + if 'scale' in data: + _scales = data['scale'].astype(np.float32) + else: + _scales = np.zeros(num_imgs, dtype=np.float32) + + if 'center' in data: + _centers = data['center'].astype(np.float32) + else: + _centers = np.zeros((num_imgs, 2), dtype=np.float32) + + # get 3D pose + if 'S' in data.keys(): + _joints_3d = data['S'].astype(np.float32) + else: + _joints_3d = np.zeros((num_imgs, num_joints, 4), dtype=np.float32) + + # get 2D pose + if 'part' in data.keys(): + _joints_2d = data['part'].astype(np.float32) + else: + _joints_2d = np.zeros((num_imgs, num_joints, 3), dtype=np.float32) + + data_info = { + 'imgnames': _imgnames, + 'joints_3d': _joints_3d, + 'joints_2d': _joints_2d, + 'scales': _scales, + 'centers': _centers, + } + + return data_info + + def build_sample_indices(self): + """Build sample indices. + + The default method creates sample indices that each sample is a single + frame (i.e. seq_len=1). Override this method in the subclass to define + how frames are sampled to form data samples. + + Outputs: + sample_indices [list(tuple)]: the frame indices of each sample. + For a sample, all frames will be treated as an input sequence, + and the ground-truth pose of the last frame will be the target. 
+ """ + sample_indices = [] + if self.seq_len == 1: + num_imgs = len(self.ann_info['imgnames']) + sample_indices = [(idx, ) for idx in range(num_imgs)] + else: + raise NotImplementedError('Multi-frame data sample unsupported!') + return sample_indices + + @abstractmethod + def evaluate(self, results, *args, **kwargs): + """Evaluate keypoint results.""" + + def prepare_data(self, idx): + """Get data sample.""" + data = self.data_info + + frame_ids = self.sample_indices[idx] + assert len(frame_ids) == self.seq_len + + # get the 3D/2D pose sequence + _joints_3d = data['joints_3d'][frame_ids] + _joints_2d = data['joints_2d'][frame_ids] + + # get the image info + _imgnames = data['imgnames'][frame_ids] + _centers = data['centers'][frame_ids] + _scales = data['scales'][frame_ids] + if _scales.ndim == 1: + _scales = np.stack([_scales, _scales], axis=1) + + target_idx = -1 if self.causal else int(self.seq_len) // 2 + + results = { + 'input_2d': _joints_2d[:, :, :2], + 'input_2d_visible': _joints_2d[:, :, -1:], + 'input_3d': _joints_3d[:, :, :3], + 'input_3d_visible': _joints_3d[:, :, -1:], + 'target': _joints_3d[target_idx, :, :3], + 'target_visible': _joints_3d[target_idx, :, -1:], + 'image_paths': _imgnames, + 'target_image_path': _imgnames[target_idx], + 'scales': _scales, + 'centers': _centers, + } + + if self.need_2d_label: + results['target_2d'] = _joints_2d[target_idx, :, :2] + + if self.need_camera_param: + _cam_param = self.get_camera_param(_imgnames[0]) + results['camera_param'] = _cam_param + # get image size from camera parameters + if 'w' in _cam_param and 'h' in _cam_param: + results['image_width'] = _cam_param['w'] + results['image_height'] = _cam_param['h'] + + return results + + def __len__(self): + """Get the size of the dataset.""" + return len(self.sample_indices) + + def __getitem__(self, idx): + """Get a sample with given index.""" + results = copy.deepcopy(self.prepare_data(idx)) + results['ann_info'] = self.ann_info + return self.pipeline(results) + + def get_camera_param(self, imgname): + """Get camera parameters of a frame by its image name.""" + raise NotImplementedError diff --git a/mmpose/datasets/datasets/base/kpt_3d_sview_rgb_img_top_down_dataset.py b/mmpose/datasets/datasets/base/kpt_3d_sview_rgb_img_top_down_dataset.py new file mode 100644 index 0000000..af01e81 --- /dev/null +++ b/mmpose/datasets/datasets/base/kpt_3d_sview_rgb_img_top_down_dataset.py @@ -0,0 +1,256 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from abc import ABCMeta, abstractmethod + +import json_tricks as json +import numpy as np +from torch.utils.data import Dataset +from xtcocotools.coco import COCO + +from mmpose.datasets import DatasetInfo +from mmpose.datasets.pipelines import Compose + + +class Kpt3dSviewRgbImgTopDownDataset(Dataset, metaclass=ABCMeta): + """Base class for keypoint 3D top-down pose estimation with single-view RGB + image as the input. + + All fashion datasets should subclass it. + All subclasses should overwrite: + Methods:`_get_db`, 'evaluate' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + coco_style (bool): Whether the annotation json is coco-style. + Default: True + test_mode (bool): Store True when building test or + validation dataset. Default: False. 
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + coco_style=True, + test_mode=False): + + self.image_info = {} + self.ann_info = {} + + self.ann_file = ann_file + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + + self.ann_info['inference_channel'] = data_cfg['inference_channel'] + self.ann_info['num_output_channels'] = data_cfg['num_output_channels'] + self.ann_info['dataset_channel'] = data_cfg['dataset_channel'] + + if dataset_info is None: + raise ValueError( + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.') + + dataset_info = DatasetInfo(dataset_info) + + assert self.ann_info['num_joints'] == dataset_info.keypoint_num + self.ann_info['flip_pairs'] = dataset_info.flip_pairs + self.ann_info['flip_index'] = dataset_info.flip_index + self.ann_info['upper_body_ids'] = dataset_info.upper_body_ids + self.ann_info['lower_body_ids'] = dataset_info.lower_body_ids + self.ann_info['joint_weights'] = dataset_info.joint_weights + self.ann_info['skeleton'] = dataset_info.skeleton + self.sigmas = dataset_info.sigmas + self.dataset_name = dataset_info.dataset_name + + if coco_style: + self.coco = COCO(ann_file) + if 'categories' in self.coco.dataset: + cats = [ + cat['name'] + for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + self.classes = ['__background__'] + cats + self.num_classes = len(self.classes) + self._class_to_ind = dict( + zip(self.classes, range(self.num_classes))) + self._class_to_coco_ind = dict( + zip(cats, self.coco.getCatIds())) + self._coco_ind_to_class_ind = dict( + (self._class_to_coco_ind[cls], self._class_to_ind[cls]) + for cls in self.classes[1:]) + self.img_ids = self.coco.getImgIds() + self.num_images = len(self.img_ids) + self.id2name, self.name2id = self._get_mapping_id_name( + self.coco.imgs) + + self.db = [] + + self.pipeline = Compose(self.pipeline) + + @staticmethod + def _cam2pixel(cam_coord, f, c): + """Transform the joints from their camera coordinates to their pixel + coordinates. + + Note: + N: number of joints + + Args: + cam_coord (ndarray[N, 3]): 3D joints coordinates + in the camera coordinate system + f (ndarray[2]): focal length of x and y axis + c (ndarray[2]): principal point of x and y axis + + Returns: + img_coord (ndarray[N, 3]): the coordinates (x, y, 0) + in the image plane. + """ + x = cam_coord[:, 0] / (cam_coord[:, 2] + 1e-8) * f[0] + c[0] + y = cam_coord[:, 1] / (cam_coord[:, 2] + 1e-8) * f[1] + c[1] + z = np.zeros_like(x) + img_coord = np.concatenate((x[:, None], y[:, None], z[:, None]), 1) + return img_coord + + @staticmethod + def _world2cam(world_coord, R, T): + """Transform the joints from their world coordinates to their camera + coordinates. + + Note: + N: number of joints + + Args: + world_coord (ndarray[3, N]): 3D joints coordinates + in the world coordinate system + R (ndarray[3, 3]): camera rotation matrix + T (ndarray[3, 1]): camera position (x, y, z) + + Returns: + cam_coord (ndarray[3, N]): 3D joints coordinates + in the camera coordinate system + """ + cam_coord = np.dot(R, world_coord - T) + return cam_coord + + @staticmethod + def _pixel2cam(pixel_coord, f, c): + """Transform the joints from their pixel coordinates to their camera + coordinates. 
+ + Note: + N: number of joints + + Args: + pixel_coord (ndarray[N, 3]): 3D joints coordinates + in the pixel coordinate system + f (ndarray[2]): focal length of x and y axis + c (ndarray[2]): principal point of x and y axis + + Returns: + cam_coord (ndarray[N, 3]): 3D joints coordinates + in the camera coordinate system + """ + x = (pixel_coord[:, 0] - c[0]) / f[0] * pixel_coord[:, 2] + y = (pixel_coord[:, 1] - c[1]) / f[1] * pixel_coord[:, 2] + z = pixel_coord[:, 2] + cam_coord = np.concatenate((x[:, None], y[:, None], z[:, None]), 1) + return cam_coord + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + def _xywh2cs(self, x, y, w, h, padding=1.25): + """This encodes bbox(x,y,w,h) into (center, scale) + + Args: + x, y, w, h (float): left, top, width and height + padding (float): bounding box padding factor + + Returns: + center (np.ndarray[float32](2,)): center of the bbox (x, y). + scale (np.ndarray[float32](2,)): scale of the bbox w & h. + """ + aspect_ratio = self.ann_info['image_size'][0] / self.ann_info[ + 'image_size'][1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + + if (not self.test_mode) and np.random.rand() < 0.3: + center += 0.4 * (np.random.rand(2) - 0.5) * [w, h] + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + # padding to include proper amount of context + scale = scale * padding + + return center, scale + + @abstractmethod + def _get_db(self): + """Load dataset.""" + raise NotImplementedError + + @abstractmethod + def evaluate(self, results, *args, **kwargs): + """Evaluate keypoint results.""" + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def __len__(self): + """Get the size of the dataset.""" + return len(self.db) + + def __getitem__(self, idx): + """Get the sample given index.""" + results = copy.deepcopy(self.db[idx]) + results['ann_info'] = self.ann_info + return self.pipeline(results) + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts diff --git a/mmpose/datasets/datasets/body3d/__init__.py b/mmpose/datasets/datasets/body3d/__init__.py new file mode 100644 index 0000000..5bc25a9 --- /dev/null +++ b/mmpose/datasets/datasets/body3d/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .body3d_h36m_dataset import Body3DH36MDataset +from .body3d_mpi_inf_3dhp_dataset import Body3DMpiInf3dhpDataset +from .body3d_mview_direct_panoptic_dataset import \ + Body3DMviewDirectPanopticDataset +from .body3d_semi_supervision_dataset import Body3DSemiSupervisionDataset + +__all__ = [ + 'Body3DH36MDataset', 'Body3DSemiSupervisionDataset', + 'Body3DMpiInf3dhpDataset', 'Body3DMviewDirectPanopticDataset' +] diff --git a/mmpose/datasets/datasets/body3d/body3d_base_dataset.py b/mmpose/datasets/datasets/body3d/body3d_base_dataset.py new file mode 100644 index 0000000..10c2923 --- /dev/null +++ b/mmpose/datasets/datasets/body3d/body3d_base_dataset.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + +from torch.utils.data import Dataset + + +class Body3DBaseDataset(Dataset, metaclass=ABCMeta): + """This class has been deprecated and replaced by + Kpt3dSviewKpt2dDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'Body3DBaseDataset has been replaced by ' + 'Kpt3dSviewKpt2dDataset' + 'check https://github.com/open-mmlab/mmpose/pull/663 for details.') + ) diff --git a/mmpose/datasets/datasets/body3d/body3d_h36m_dataset.py b/mmpose/datasets/datasets/body3d/body3d_h36m_dataset.py new file mode 100644 index 0000000..ae4949d --- /dev/null +++ b/mmpose/datasets/datasets/body3d/body3d_h36m_dataset.py @@ -0,0 +1,343 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import mmcv +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.core.evaluation import keypoint_mpjpe +from mmpose.datasets.datasets.base import Kpt3dSviewKpt2dDataset +from ...builder import DATASETS + + +@DATASETS.register_module() +class Body3DH36MDataset(Kpt3dSviewKpt2dDataset): + """Human3.6M dataset for 3D human pose estimation. + + "Human3.6M: Large Scale Datasets and Predictive Methods for 3D Human + Sensing in Natural Environments", TPAMI`2014. + More details can be found in the `paper + `__. + + Human3.6M keypoint indexes:: + + 0: 'root (pelvis)', + 1: 'right_hip', + 2: 'right_knee', + 3: 'right_foot', + 4: 'left_hip', + 5: 'left_knee', + 6: 'left_foot', + 7: 'spine', + 8: 'thorax', + 9: 'neck_base', + 10: 'head', + 11: 'left_shoulder', + 12: 'left_elbow', + 13: 'left_wrist', + 14: 'right_shoulder', + 15: 'right_elbow', + 16: 'right_wrist' + + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + JOINT_NAMES = [ + 'Root', 'RHip', 'RKnee', 'RFoot', 'LHip', 'LKnee', 'LFoot', 'Spine', + 'Thorax', 'NeckBase', 'Head', 'LShoulder', 'LElbow', 'LWrist', + 'RShoulder', 'RElbow', 'RWrist' + ] + + # 2D joint source options: + # "gt": from the annotation file + # "detection": from a detection result file of 2D keypoint + # "pipeline": will be generate by the pipeline + SUPPORTED_JOINT_2D_SRC = {'gt', 'detection', 'pipeline'} + + # metric + ALLOWED_METRICS = {'mpjpe', 'p-mpjpe', 'n-mpjpe'} + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/h36m.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + def load_config(self, data_cfg): + super().load_config(data_cfg) + # h36m specific attributes + self.joint_2d_src = data_cfg.get('joint_2d_src', 'gt') + if self.joint_2d_src not in self.SUPPORTED_JOINT_2D_SRC: + raise ValueError( + f'Unsupported joint_2d_src "{self.joint_2d_src}". ' + f'Supported options are {self.SUPPORTED_JOINT_2D_SRC}') + + self.joint_2d_det_file = data_cfg.get('joint_2d_det_file', None) + + self.need_camera_param = data_cfg.get('need_camera_param', False) + if self.need_camera_param: + assert 'camera_param_file' in data_cfg + self.camera_param = self._load_camera_param( + data_cfg['camera_param_file']) + + # h36m specific annotation info + ann_info = {} + ann_info['use_different_joint_weights'] = False + # action filter + actions = data_cfg.get('actions', '_all_') + self.actions = set( + actions if isinstance(actions, (list, tuple)) else [actions]) + + # subject filter + subjects = data_cfg.get('subjects', '_all_') + self.subjects = set( + subjects if isinstance(subjects, (list, tuple)) else [subjects]) + + self.ann_info.update(ann_info) + + def load_annotations(self): + data_info = super().load_annotations() + + # get 2D joints + if self.joint_2d_src == 'gt': + data_info['joints_2d'] = data_info['joints_2d'] + elif self.joint_2d_src == 'detection': + data_info['joints_2d'] = self._load_joint_2d_detection( + self.joint_2d_det_file) + assert data_info['joints_2d'].shape[0] == data_info[ + 'joints_3d'].shape[0] + assert data_info['joints_2d'].shape[2] == 3 + elif self.joint_2d_src == 'pipeline': + # joint_2d will be generated in the pipeline + pass + else: + raise NotImplementedError( + f'Unhandled joint_2d_src option {self.joint_2d_src}') + + return data_info + + @staticmethod + def _parse_h36m_imgname(imgname): + """Parse imgname to get information of subject, action and camera. + + A typical h36m image filename is like: + S1_Directions_1.54138969_000001.jpg + """ + subj, rest = osp.basename(imgname).split('_', 1) + action, rest = rest.split('.', 1) + camera, rest = rest.split('_', 1) + + return subj, action, camera + + def build_sample_indices(self): + """Split original videos into sequences and build frame indices. + + This method overrides the default one in the base class. + """ + + # Group frames into videos. Assume that self.data_info is + # chronological. + video_frames = defaultdict(list) + for idx, imgname in enumerate(self.data_info['imgnames']): + subj, action, camera = self._parse_h36m_imgname(imgname) + + if '_all_' not in self.actions and action not in self.actions: + continue + + if '_all_' not in self.subjects and subj not in self.subjects: + continue + + video_frames[(subj, action, camera)].append(idx) + + # build sample indices + sample_indices = [] + _len = (self.seq_len - 1) * self.seq_frame_interval + 1 + _step = self.seq_frame_interval + for _, _indices in sorted(video_frames.items()): + n_frame = len(_indices) + + if self.temporal_padding: + # Pad the sequence so that every frame in the sequence will be + # predicted. 
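+ # pad_left/pad_right count how many times the first/last frame index
+ # is repeated so that, after the strided slice below, every sample
+ # still contains exactly seq_len frames near the video borders.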
+ if self.causal: + frames_left = self.seq_len - 1 + frames_right = 0 + else: + frames_left = (self.seq_len - 1) // 2 + frames_right = frames_left + for i in range(n_frame): + pad_left = max(0, frames_left - i // _step) + pad_right = max(0, + frames_right - (n_frame - 1 - i) // _step) + start = max(i % _step, i - frames_left * _step) + end = min(n_frame - (n_frame - 1 - i) % _step, + i + frames_right * _step + 1) + sample_indices.append([_indices[0]] * pad_left + + _indices[start:end:_step] + + [_indices[-1]] * pad_right) + else: + seqs_from_video = [ + _indices[i:(i + _len):_step] + for i in range(0, n_frame - _len + 1) + ] + sample_indices.extend(seqs_from_video) + + # reduce dataset size if self.subset < 1 + assert 0 < self.subset <= 1 + subset_size = int(len(sample_indices) * self.subset) + start = np.random.randint(0, len(sample_indices) - subset_size + 1) + end = start + subset_size + + return sample_indices[start:end] + + def _load_joint_2d_detection(self, det_file): + """"Load 2D joint detection results from file.""" + joints_2d = np.load(det_file).astype(np.float32) + + return joints_2d + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mpjpe', **kwargs): + metrics = metric if isinstance(metric, list) else [metric] + for _metric in metrics: + if _metric not in self.ALLOWED_METRICS: + raise ValueError( + f'Unsupported metric "{_metric}" for human3.6 dataset.' + f'Supported metrics are {self.ALLOWED_METRICS}') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + image_paths = result['target_image_paths'] + batch_size = len(image_paths) + for i in range(batch_size): + target_id = self.name2id[image_paths[i]] + kpts.append({ + 'keypoints': preds[i], + 'target_id': target_id, + }) + + mmcv.dump(kpts, res_file) + + name_value_tuples = [] + for _metric in metrics: + if _metric == 'mpjpe': + _nv_tuples = self._report_mpjpe(kpts) + elif _metric == 'p-mpjpe': + _nv_tuples = self._report_mpjpe(kpts, mode='p-mpjpe') + elif _metric == 'n-mpjpe': + _nv_tuples = self._report_mpjpe(kpts, mode='n-mpjpe') + else: + raise NotImplementedError + name_value_tuples.extend(_nv_tuples) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return OrderedDict(name_value_tuples) + + def _report_mpjpe(self, keypoint_results, mode='mpjpe'): + """Cauculate mean per joint position error (MPJPE) or its variants like + P-MPJPE or N-MPJPE. + + Args: + keypoint_results (list): Keypoint predictions. See + 'Body3DH36MDataset.evaluate' for details. + mode (str): Specify mpjpe variants. Supported options are: + + - ``'mpjpe'``: Standard MPJPE. + - ``'p-mpjpe'``: MPJPE after aligning prediction to groundtruth + via a rigid transformation (scale, rotation and + translation). + - ``'n-mpjpe'``: MPJPE after aligning prediction to groundtruth + in scale only. 
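+
+ For every visible ground-truth joint, the error is the Euclidean distance between the predicted and ground-truth positions after the selected alignment; the reported value is the mean over joints and samples.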
+ """ + + preds = [] + gts = [] + masks = [] + action_category_indices = defaultdict(list) + for idx, result in enumerate(keypoint_results): + pred = result['keypoints'] + target_id = result['target_id'] + gt, gt_visible = np.split( + self.data_info['joints_3d'][target_id], [3], axis=-1) + preds.append(pred) + gts.append(gt) + masks.append(gt_visible) + + action = self._parse_h36m_imgname( + self.data_info['imgnames'][target_id])[1] + action_category = action.split('_')[0] + action_category_indices[action_category].append(idx) + + preds = np.stack(preds) + gts = np.stack(gts) + masks = np.stack(masks).squeeze(-1) > 0 + + err_name = mode.upper() + if mode == 'mpjpe': + alignment = 'none' + elif mode == 'p-mpjpe': + alignment = 'procrustes' + elif mode == 'n-mpjpe': + alignment = 'scale' + else: + raise ValueError(f'Invalid mode: {mode}') + + error = keypoint_mpjpe(preds, gts, masks, alignment) + name_value_tuples = [(err_name, error)] + + for action_category, indices in action_category_indices.items(): + _error = keypoint_mpjpe(preds[indices], gts[indices], + masks[indices]) + name_value_tuples.append((f'{err_name}_{action_category}', _error)) + + return name_value_tuples + + def _load_camera_param(self, camera_param_file): + """Load camera parameters from file.""" + return mmcv.load(camera_param_file) + + def get_camera_param(self, imgname): + """Get camera parameters of a frame by its image name.""" + assert hasattr(self, 'camera_param') + subj, _, camera = self._parse_h36m_imgname(imgname) + return self.camera_param[(subj, camera)] diff --git a/mmpose/datasets/datasets/body3d/body3d_mpi_inf_3dhp_dataset.py b/mmpose/datasets/datasets/body3d/body3d_mpi_inf_3dhp_dataset.py new file mode 100644 index 0000000..4d06fcd --- /dev/null +++ b/mmpose/datasets/datasets/body3d/body3d_mpi_inf_3dhp_dataset.py @@ -0,0 +1,417 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import mmcv +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.core.evaluation import (keypoint_3d_auc, keypoint_3d_pck, + keypoint_mpjpe) +from mmpose.datasets.datasets.base import Kpt3dSviewKpt2dDataset +from ...builder import DATASETS + + +@DATASETS.register_module() +class Body3DMpiInf3dhpDataset(Kpt3dSviewKpt2dDataset): + """MPI-INF-3DHP dataset for 3D human pose estimation. + + "Monocular 3D Human Pose Estimation In The Wild Using Improved CNN + Supervision", 3DV'2017. + More details can be found in the `paper + `__. + + MPI-INF-3DHP keypoint indexes: + + 0: 'head_top', + 1: 'neck', + 2: 'right_shoulder', + 3: 'right_elbow', + 4: 'right_wrist', + 5: 'left_shoulder;, + 6: 'left_elbow', + 7: 'left_wrist', + 8: 'right_hip', + 9: 'right_knee', + 10: 'right_ankle', + 11: 'left_hip', + 12: 'left_knee', + 13: 'left_ankle', + 14: 'root (pelvis)', + 15: 'spine', + 16: 'head' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): Data configurations. Please refer to the docstring of + Body3DBaseDataset for common data attributes. Here are MPI-INF-3DHP + specific attributes. + - joint_2d_src: 2D joint source. Options include: + "gt": from the annotation file + "detection": from a detection result file of 2D keypoint + "pipeline": will be generate by the pipeline + Default: "gt". + - joint_2d_det_file: Path to the detection result file of 2D + keypoint. Only used when joint_2d_src == "detection". 
+ - need_camera_param: Whether need camera parameters or not. + Default: False. + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + JOINT_NAMES = [ + 'HeadTop', 'Neck', 'RShoulder', 'RElbow', 'RWrist', 'LShoulder', + 'LElbow', 'LWrist', 'RHip', 'RKnee', 'RAnkle', 'LHip', 'LKnee', + 'LAnkle', 'Root', 'Spine', 'Head' + ] + + # 2D joint source options: + # "gt": from the annotation file + # "detection": from a detection result file of 2D keypoint + # "pipeline": will be generate by the pipeline + SUPPORTED_JOINT_2D_SRC = {'gt', 'detection', 'pipeline'} + + # metric + ALLOWED_METRICS = { + 'mpjpe', 'p-mpjpe', '3dpck', 'p-3dpck', '3dauc', 'p-3dauc' + } + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/mpi_inf_3dhp.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + def load_config(self, data_cfg): + super().load_config(data_cfg) + # mpi-inf-3dhp specific attributes + self.joint_2d_src = data_cfg.get('joint_2d_src', 'gt') + if self.joint_2d_src not in self.SUPPORTED_JOINT_2D_SRC: + raise ValueError( + f'Unsupported joint_2d_src "{self.joint_2d_src}". ' + f'Supported options are {self.SUPPORTED_JOINT_2D_SRC}') + + self.joint_2d_det_file = data_cfg.get('joint_2d_det_file', None) + + self.need_camera_param = data_cfg.get('need_camera_param', False) + if self.need_camera_param: + assert 'camera_param_file' in data_cfg + self.camera_param = self._load_camera_param( + data_cfg['camera_param_file']) + + # mpi-inf-3dhp specific annotation info + ann_info = {} + ann_info['use_different_joint_weights'] = False + + self.ann_info.update(ann_info) + + def load_annotations(self): + data_info = super().load_annotations() + + # get 2D joints + if self.joint_2d_src == 'gt': + data_info['joints_2d'] = data_info['joints_2d'] + elif self.joint_2d_src == 'detection': + data_info['joints_2d'] = self._load_joint_2d_detection( + self.joint_2d_det_file) + assert data_info['joints_2d'].shape[0] == data_info[ + 'joints_3d'].shape[0] + assert data_info['joints_2d'].shape[2] == 3 + elif self.joint_2d_src == 'pipeline': + # joint_2d will be generated in the pipeline + pass + else: + raise NotImplementedError( + f'Unhandled joint_2d_src option {self.joint_2d_src}') + + return data_info + + @staticmethod + def _parse_mpi_inf_3dhp_imgname(imgname): + """Parse imgname to get information of subject, sequence and camera. + + A typical mpi-inf-3dhp training image filename is like: + S1_Seq1_Cam0_000001.jpg. A typical mpi-inf-3dhp testing image filename + is like: TS1_000001.jpg + """ + if imgname[0] == 'S': + subj, rest = imgname.split('_', 1) + seq, rest = rest.split('_', 1) + camera, rest = rest.split('_', 1) + return subj, seq, camera + else: + subj, rest = imgname.split('_', 1) + return subj, None, None + + def build_sample_indices(self): + """Split original videos into sequences and build frame indices. + + This method overrides the default one in the base class. + """ + + # Group frames into videos. 
Assume that self.data_info is + # chronological. + video_frames = defaultdict(list) + for idx, imgname in enumerate(self.data_info['imgnames']): + subj, seq, camera = self._parse_mpi_inf_3dhp_imgname(imgname) + if seq is not None: + video_frames[(subj, seq, camera)].append(idx) + else: + video_frames[subj].append(idx) + + # build sample indices + sample_indices = [] + _len = (self.seq_len - 1) * self.seq_frame_interval + 1 + _step = self.seq_frame_interval + for _, _indices in sorted(video_frames.items()): + n_frame = len(_indices) + + if self.temporal_padding: + # Pad the sequence so that every frame in the sequence will be + # predicted. + if self.causal: + frames_left = self.seq_len - 1 + frames_right = 0 + else: + frames_left = (self.seq_len - 1) // 2 + frames_right = frames_left + for i in range(n_frame): + pad_left = max(0, frames_left - i // _step) + pad_right = max(0, + frames_right - (n_frame - 1 - i) // _step) + start = max(i % _step, i - frames_left * _step) + end = min(n_frame - (n_frame - 1 - i) % _step, + i + frames_right * _step + 1) + sample_indices.append([_indices[0]] * pad_left + + _indices[start:end:_step] + + [_indices[-1]] * pad_right) + else: + seqs_from_video = [ + _indices[i:(i + _len):_step] + for i in range(0, n_frame - _len + 1) + ] + sample_indices.extend(seqs_from_video) + + # reduce dataset size if self.subset < 1 + assert 0 < self.subset <= 1 + subset_size = int(len(sample_indices) * self.subset) + start = np.random.randint(0, len(sample_indices) - subset_size + 1) + end = start + subset_size + + return sample_indices[start:end] + + def _load_joint_2d_detection(self, det_file): + """"Load 2D joint detection results from file.""" + joints_2d = np.load(det_file).astype(np.float32) + + return joints_2d + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mpjpe', **kwargs): + metrics = metric if isinstance(metric, list) else [metric] + for _metric in metrics: + if _metric not in self.ALLOWED_METRICS: + raise ValueError( + f'Unsupported metric "{_metric}" for mpi-inf-3dhp dataset.' + f'Supported metrics are {self.ALLOWED_METRICS}') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + image_paths = result['target_image_paths'] + batch_size = len(image_paths) + for i in range(batch_size): + target_id = self.name2id[image_paths[i]] + kpts.append({ + 'keypoints': preds[i], + 'target_id': target_id, + }) + + mmcv.dump(kpts, res_file) + + name_value_tuples = [] + for _metric in metrics: + if _metric == 'mpjpe': + _nv_tuples = self._report_mpjpe(kpts) + elif _metric == 'p-mpjpe': + _nv_tuples = self._report_mpjpe(kpts, mode='p-mpjpe') + elif _metric == '3dpck': + _nv_tuples = self._report_3d_pck(kpts) + elif _metric == 'p-3dpck': + _nv_tuples = self._report_3d_pck(kpts, mode='p-3dpck') + elif _metric == '3dauc': + _nv_tuples = self._report_3d_auc(kpts) + elif _metric == 'p-3dauc': + _nv_tuples = self._report_3d_auc(kpts, mode='p-3dauc') + else: + raise NotImplementedError + name_value_tuples.extend(_nv_tuples) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return OrderedDict(name_value_tuples) + + def _report_mpjpe(self, keypoint_results, mode='mpjpe'): + """Cauculate mean per joint position error (MPJPE) or its variants + P-MPJPE. 
+ + Args: + keypoint_results (list): Keypoint predictions. See + 'Body3DMpiInf3dhpDataset.evaluate' for details. + mode (str): Specify mpjpe variants. Supported options are: + - ``'mpjpe'``: Standard MPJPE. + - ``'p-mpjpe'``: MPJPE after aligning prediction to groundtruth + via a rigid transformation (scale, rotation and + translation). + """ + + preds = [] + gts = [] + for idx, result in enumerate(keypoint_results): + pred = result['keypoints'] + target_id = result['target_id'] + gt, gt_visible = np.split( + self.data_info['joints_3d'][target_id], [3], axis=-1) + preds.append(pred) + gts.append(gt) + + preds = np.stack(preds) + gts = np.stack(gts) + masks = np.ones_like(gts[:, :, 0], dtype=bool) + + err_name = mode.upper() + if mode == 'mpjpe': + alignment = 'none' + elif mode == 'p-mpjpe': + alignment = 'procrustes' + else: + raise ValueError(f'Invalid mode: {mode}') + + error = keypoint_mpjpe(preds, gts, masks, alignment) + name_value_tuples = [(err_name, error)] + + return name_value_tuples + + def _report_3d_pck(self, keypoint_results, mode='3dpck'): + """Cauculate Percentage of Correct Keypoints (3DPCK) w. or w/o + Procrustes alignment. + + Args: + keypoint_results (list): Keypoint predictions. See + 'Body3DMpiInf3dhpDataset.evaluate' for details. + mode (str): Specify mpjpe variants. Supported options are: + - ``'3dpck'``: Standard 3DPCK. + - ``'p-3dpck'``: 3DPCK after aligning prediction to groundtruth + via a rigid transformation (scale, rotation and + translation). + """ + + preds = [] + gts = [] + for idx, result in enumerate(keypoint_results): + pred = result['keypoints'] + target_id = result['target_id'] + gt, gt_visible = np.split( + self.data_info['joints_3d'][target_id], [3], axis=-1) + preds.append(pred) + gts.append(gt) + + preds = np.stack(preds) + gts = np.stack(gts) + masks = np.ones_like(gts[:, :, 0], dtype=bool) + + err_name = mode.upper() + if mode == '3dpck': + alignment = 'none' + elif mode == 'p-3dpck': + alignment = 'procrustes' + else: + raise ValueError(f'Invalid mode: {mode}') + + error = keypoint_3d_pck(preds, gts, masks, alignment) + name_value_tuples = [(err_name, error)] + + return name_value_tuples + + def _report_3d_auc(self, keypoint_results, mode='3dauc'): + """Cauculate the Area Under the Curve (AUC) computed for a range of + 3DPCK thresholds. + + Args: + keypoint_results (list): Keypoint predictions. See + 'Body3DMpiInf3dhpDataset.evaluate' for details. + mode (str): Specify mpjpe variants. Supported options are: + + - ``'3dauc'``: Standard 3DAUC. + - ``'p-3dauc'``: 3DAUC after aligning prediction to + groundtruth via a rigid transformation (scale, rotation and + translation). 
+ """ + + preds = [] + gts = [] + for idx, result in enumerate(keypoint_results): + pred = result['keypoints'] + target_id = result['target_id'] + gt, gt_visible = np.split( + self.data_info['joints_3d'][target_id], [3], axis=-1) + preds.append(pred) + gts.append(gt) + + preds = np.stack(preds) + gts = np.stack(gts) + masks = np.ones_like(gts[:, :, 0], dtype=bool) + + err_name = mode.upper() + if mode == '3dauc': + alignment = 'none' + elif mode == 'p-3dauc': + alignment = 'procrustes' + else: + raise ValueError(f'Invalid mode: {mode}') + + error = keypoint_3d_auc(preds, gts, masks, alignment) + name_value_tuples = [(err_name, error)] + + return name_value_tuples + + def _load_camera_param(self, camear_param_file): + """Load camera parameters from file.""" + return mmcv.load(camear_param_file) + + def get_camera_param(self, imgname): + """Get camera parameters of a frame by its image name.""" + assert hasattr(self, 'camera_param') + return self.camera_param[imgname[:-11]] diff --git a/mmpose/datasets/datasets/body3d/body3d_mview_direct_panoptic_dataset.py b/mmpose/datasets/datasets/body3d/body3d_mview_direct_panoptic_dataset.py new file mode 100644 index 0000000..b5bf92d --- /dev/null +++ b/mmpose/datasets/datasets/body3d/body3d_mview_direct_panoptic_dataset.py @@ -0,0 +1,493 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import glob +import json +import os.path as osp +import pickle +import tempfile +import warnings +from collections import OrderedDict + +import mmcv +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.core.camera import SimpleCamera +from mmpose.datasets.builder import DATASETS +from mmpose.datasets.datasets.base import Kpt3dMviewRgbImgDirectDataset + + +@DATASETS.register_module() +class Body3DMviewDirectPanopticDataset(Kpt3dMviewRgbImgDirectDataset): + """Panoptic dataset for direct multi-view human pose estimation. + + `Panoptic Studio: A Massively Multiview System for Social Motion + Capture' ICCV'2015 + More details can be found in the `paper + `__ . + + The dataset loads both 2D and 3D annotations as well as camera parameters. + + Panoptic keypoint indexes:: + + 'neck': 0, + 'nose': 1, + 'mid-hip': 2, + 'l-shoulder': 3, + 'l-elbow': 4, + 'l-wrist': 5, + 'l-hip': 6, + 'l-knee': 7, + 'l-ankle': 8, + 'r-shoulder': 9, + 'r-elbow': 10, + 'r-wrist': 11, + 'r-hip': 12, + 'r-knee': 13, + 'r-ankle': 14, + 'l-eye': 15, + 'l-ear': 16, + 'r-eye': 17, + 'r-ear': 18, + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + ALLOWED_METRICS = {'mpjpe', 'mAP'} + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/panoptic_body3d.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.load_config(data_cfg) + self.ann_info['use_different_joint_weights'] = False + + if ann_file is None: + self.db_file = osp.join( + img_prefix, f'group_{self.subset}_cam{self.num_cameras}.pkl') + else: + self.db_file = ann_file + + if osp.exists(self.db_file): + with open(self.db_file, 'rb') as f: + info = pickle.load(f) + assert info['sequence_list'] == self.seq_list + assert info['interval'] == self.seq_frame_interval + assert info['cam_list'] == self.cam_list + self.db = info['db'] + else: + self.db = self._get_db() + info = { + 'sequence_list': self.seq_list, + 'interval': self.seq_frame_interval, + 'cam_list': self.cam_list, + 'db': self.db + } + with open(self.db_file, 'wb') as f: + pickle.dump(info, f) + + self.db_size = len(self.db) + + print(f'=> load {len(self.db)} samples') + + def load_config(self, data_cfg): + """Initialize dataset attributes according to the config. + + Override this method to set dataset specific attributes. + """ + self.num_joints = data_cfg['num_joints'] + assert self.num_joints <= 19 + self.seq_list = data_cfg['seq_list'] + self.cam_list = data_cfg['cam_list'] + self.num_cameras = data_cfg['num_cameras'] + assert self.num_cameras == len(self.cam_list) + self.seq_frame_interval = data_cfg.get('seq_frame_interval', 1) + self.subset = data_cfg.get('subset', 'train') + self.need_camera_param = True + self.root_id = data_cfg.get('root_id', 0) + self.max_persons = data_cfg.get('max_num', 10) + + def _get_scale(self, raw_image_size): + heatmap_size = self.ann_info['heatmap_size'] + image_size = self.ann_info['image_size'] + assert heatmap_size[0][0] / heatmap_size[0][1] \ + == image_size[0] / image_size[1] + w, h = raw_image_size + w_resized, h_resized = image_size + if w / w_resized < h / h_resized: + w_pad = h / h_resized * w_resized + h_pad = h + else: + w_pad = w + h_pad = w / w_resized * h_resized + + scale = np.array([w_pad, h_pad], dtype=np.float32) + + return scale + + def _get_cam(self, seq): + """Get camera parameters. + + Args: + seq (str): Sequence name. + + Returns: Camera parameters. + """ + cam_file = osp.join(self.img_prefix, seq, + 'calibration_{:s}.json'.format(seq)) + with open(cam_file) as cfile: + calib = json.load(cfile) + + M = np.array([[1.0, 0.0, 0.0], [0.0, 0.0, -1.0], [0.0, 1.0, 0.0]]) + cameras = {} + for cam in calib['cameras']: + if (cam['panel'], cam['node']) in self.cam_list: + sel_cam = {} + R_w2c = np.array(cam['R']).dot(M) + T_w2c = np.array(cam['t']).reshape((3, 1)) * 10.0 # cm to mm + R_c2w = R_w2c.T + T_c2w = -R_w2c.T @ T_w2c + sel_cam['R'] = R_c2w.tolist() + sel_cam['T'] = T_c2w.tolist() + sel_cam['K'] = cam['K'][:2] + distCoef = cam['distCoef'] + sel_cam['k'] = [distCoef[0], distCoef[1], distCoef[4]] + sel_cam['p'] = [distCoef[2], distCoef[3]] + cameras[(cam['panel'], cam['node'])] = sel_cam + + return cameras + + def _get_db(self): + """Get dataset base. 
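+ One entry is created per selected camera view and per sampled frame, with 3D poses read from the per-sequence ``hdPose3d_stage1_coco19`` json files.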
+ + Returns: + dict: the dataset base (2D and 3D information) + """ + width = 1920 + height = 1080 + db = [] + sample_id = 0 + for seq in self.seq_list: + cameras = self._get_cam(seq) + curr_anno = osp.join(self.img_prefix, seq, + 'hdPose3d_stage1_coco19') + anno_files = sorted(glob.iglob('{:s}/*.json'.format(curr_anno))) + print(f'load sequence: {seq}', flush=True) + for i, file in enumerate(anno_files): + if i % self.seq_frame_interval == 0: + with open(file) as dfile: + bodies = json.load(dfile)['bodies'] + if len(bodies) == 0: + continue + + for k, cam_param in cameras.items(): + single_view_camera = SimpleCamera(cam_param) + postfix = osp.basename(file).replace('body3DScene', '') + prefix = '{:02d}_{:02d}'.format(k[0], k[1]) + image_file = osp.join(seq, 'hdImgs', prefix, + prefix + postfix) + image_file = image_file.replace('json', 'jpg') + + all_poses_3d = np.zeros( + (self.max_persons, self.num_joints, 3), + dtype=np.float32) + all_poses_vis_3d = np.zeros( + (self.max_persons, self.num_joints, 3), + dtype=np.float32) + all_roots_3d = np.zeros((self.max_persons, 3), + dtype=np.float32) + all_poses = np.zeros( + (self.max_persons, self.num_joints, 3), + dtype=np.float32) + + cnt = 0 + person_ids = -np.ones(self.max_persons, dtype=int) + for body in bodies: + if cnt >= self.max_persons: + break + pose3d = np.array(body['joints19']).reshape( + (-1, 4)) + pose3d = pose3d[:self.num_joints] + + joints_vis = pose3d[:, -1] > 0.1 + + if not joints_vis[self.root_id]: + continue + + # Coordinate transformation + M = np.array([[1.0, 0.0, 0.0], [0.0, 0.0, -1.0], + [0.0, 1.0, 0.0]]) + pose3d[:, 0:3] = pose3d[:, 0:3].dot(M) * 10.0 + + all_poses_3d[cnt] = pose3d[:, :3] + all_roots_3d[cnt] = pose3d[self.root_id, :3] + all_poses_vis_3d[cnt] = np.repeat( + np.reshape(joints_vis, (-1, 1)), 3, axis=1) + + pose2d = np.zeros((pose3d.shape[0], 3)) + # get pose_2d from pose_3d + pose2d[:, :2] = single_view_camera.world_to_pixel( + pose3d[:, :3]) + x_check = np.bitwise_and(pose2d[:, 0] >= 0, + pose2d[:, 0] <= width - 1) + y_check = np.bitwise_and( + pose2d[:, 1] >= 0, pose2d[:, 1] <= height - 1) + check = np.bitwise_and(x_check, y_check) + joints_vis[np.logical_not(check)] = 0 + pose2d[:, -1] = joints_vis + + all_poses[cnt] = pose2d + person_ids[cnt] = body['id'] + cnt += 1 + + if cnt > 0: + db.append({ + 'image_file': + osp.join(self.img_prefix, image_file), + 'joints_3d': + all_poses_3d, + 'person_ids': + person_ids, + 'joints_3d_visible': + all_poses_vis_3d, + 'joints': [all_poses], + 'roots_3d': + all_roots_3d, + 'camera': + cam_param, + 'num_persons': + cnt, + 'sample_id': + sample_id, + 'center': + np.array((width / 2, height / 2), + dtype=np.float32), + 'scale': + self._get_scale((width, height)) + }) + sample_id += 1 + return db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mpjpe', **kwargs): + """ + + Args: + results (list[dict]): Testing results containing the following + items: + - pose_3d (np.ndarray): predicted 3D human pose + - sample_id (np.ndarray): sample id of a frame. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Defaults: 'mpjpe'. 
+ **kwargs: + + Returns: + + """ + pose_3ds = np.concatenate([result['pose_3d'] for result in results], + axis=0) + sample_ids = [] + for result in results: + sample_ids.extend(result['sample_id']) + + _results = [ + dict(sample_id=sample_id, pose_3d=pose_3d) + for (sample_id, pose_3d) in zip(sample_ids, pose_3ds) + ] + _results = self._sort_and_unique_outputs(_results, key='sample_id') + + metrics = metric if isinstance(metric, list) else [metric] + for _metric in metrics: + if _metric not in self.ALLOWED_METRICS: + raise ValueError( + f'Unsupported metric "{_metric}"' + f'Supported metrics are {self.ALLOWED_METRICS}') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + mmcv.dump(_results, res_file) + + eval_list = [] + gt_num = self.db_size // self.num_cameras + assert len( + _results) == gt_num, f'number mismatch: {len(_results)}, {gt_num}' + + total_gt = 0 + for i in range(gt_num): + index = self.num_cameras * i + db_rec = copy.deepcopy(self.db[index]) + joints_3d = db_rec['joints_3d'] + joints_3d_vis = db_rec['joints_3d_visible'] + + if joints_3d_vis.sum() < 1: + continue + + pred = _results[i]['pose_3d'].copy() + pred = pred[pred[:, 0, 3] >= 0] + for pose in pred: + mpjpes = [] + for (gt, gt_vis) in zip(joints_3d, joints_3d_vis): + vis = gt_vis[:, 0] > 0 + if vis.sum() < 1: + break + mpjpe = np.mean( + np.sqrt( + np.sum((pose[vis, 0:3] - gt[vis])**2, axis=-1))) + mpjpes.append(mpjpe) + min_gt = np.argmin(mpjpes) + min_mpjpe = np.min(mpjpes) + score = pose[0, 4] + eval_list.append({ + 'mpjpe': float(min_mpjpe), + 'score': float(score), + 'gt_id': int(total_gt + min_gt) + }) + + total_gt += (joints_3d_vis[:, :, 0].sum(-1) >= 1).sum() + + mpjpe_threshold = np.arange(25, 155, 25) + aps = [] + ars = [] + for t in mpjpe_threshold: + ap, ar = self._eval_list_to_ap(eval_list, total_gt, t) + aps.append(ap) + ars.append(ar) + + name_value_tuples = [] + for _metric in metrics: + if _metric == 'mpjpe': + stats_names = ['RECALL 500mm', 'MPJPE 500mm'] + info_str = list( + zip(stats_names, [ + self._eval_list_to_recall(eval_list, total_gt), + self._eval_list_to_mpjpe(eval_list) + ])) + elif _metric == 'mAP': + stats_names = [ + 'AP 25', 'AP 50', 'AP 75', 'AP 100', 'AP 125', 'AP 150', + 'mAP', 'AR 25', 'AR 50', 'AR 75', 'AR 100', 'AR 125', + 'AR 150', 'mAR' + ] + mAP = np.array(aps).mean() + mAR = np.array(ars).mean() + info_str = list(zip(stats_names, aps + [mAP] + ars + [mAR])) + else: + raise NotImplementedError + name_value_tuples.extend(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return OrderedDict(name_value_tuples) + + @staticmethod + def _eval_list_to_ap(eval_list, total_gt, threshold): + """Get Average Precision (AP) and Average Recall at a certain + threshold.""" + + eval_list.sort(key=lambda k: k['score'], reverse=True) + total_num = len(eval_list) + + tp = np.zeros(total_num) + fp = np.zeros(total_num) + gt_det = [] + for i, item in enumerate(eval_list): + if item['mpjpe'] < threshold and item['gt_id'] not in gt_det: + tp[i] = 1 + gt_det.append(item['gt_id']) + else: + fp[i] = 1 + tp = np.cumsum(tp) + fp = np.cumsum(fp) + recall = tp / (total_gt + 1e-5) + precise = tp / (tp + fp + 1e-5) + for n in range(total_num - 2, -1, -1): + precise[n] = max(precise[n], precise[n + 1]) + + precise = np.concatenate(([0], precise, [0])) + recall = np.concatenate(([0], recall, [1])) + index = 
np.where(recall[1:] != recall[:-1])[0] + ap = np.sum((recall[index + 1] - recall[index]) * precise[index + 1]) + + return ap, recall[-2] + + @staticmethod + def _eval_list_to_mpjpe(eval_list, threshold=500): + """Get MPJPE within a certain threshold.""" + eval_list.sort(key=lambda k: k['score'], reverse=True) + gt_det = [] + + mpjpes = [] + for i, item in enumerate(eval_list): + if item['mpjpe'] < threshold and item['gt_id'] not in gt_det: + mpjpes.append(item['mpjpe']) + gt_det.append(item['gt_id']) + + return np.mean(mpjpes) if len(mpjpes) > 0 else np.inf + + @staticmethod + def _eval_list_to_recall(eval_list, total_gt, threshold=500): + """Get Recall at a certain threshold.""" + gt_ids = [e['gt_id'] for e in eval_list if e['mpjpe'] < threshold] + + return len(np.unique(gt_ids)) / total_gt + + def __getitem__(self, idx): + """Get the sample given index.""" + results = {} + for c in range(self.num_cameras): + result = copy.deepcopy(self.db[self.num_cameras * idx + c]) + result['ann_info'] = self.ann_info + width = 1920 + height = 1080 + result['mask'] = [np.ones((height, width), dtype=np.float32)] + results[c] = result + + return self.pipeline(results) + + @staticmethod + def _sort_and_unique_outputs(outputs, key='sample_id'): + """sort outputs and remove the repeated ones.""" + outputs = sorted(outputs, key=lambda x: x[key]) + num_outputs = len(outputs) + for i in range(num_outputs - 1, 0, -1): + if outputs[i][key] == outputs[i - 1][key]: + del outputs[i] + + return outputs diff --git a/mmpose/datasets/datasets/body3d/body3d_semi_supervision_dataset.py b/mmpose/datasets/datasets/body3d/body3d_semi_supervision_dataset.py new file mode 100644 index 0000000..491d549 --- /dev/null +++ b/mmpose/datasets/datasets/body3d/body3d_semi_supervision_dataset.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from torch.utils.data import Dataset + +from mmpose.datasets.builder import DATASETS, build_dataset + + +@DATASETS.register_module() +class Body3DSemiSupervisionDataset(Dataset): + """Mix Dataset for semi-supervised training in 3D human pose estimation + task. + + The dataset combines data from two datasets (a labeled one and an unlabeled + one) and return a dict containing data from two datasets. + + Args: + labeled_dataset (Dataset): Dataset with 3D keypoint annotations. + unlabeled_dataset (Dataset): Dataset without 3D keypoint annotations. + """ + + def __init__(self, labeled_dataset, unlabeled_dataset): + super().__init__() + self.labeled_dataset = build_dataset(labeled_dataset) + self.unlabeled_dataset = build_dataset(unlabeled_dataset) + self.length = len(self.unlabeled_dataset) + + def __len__(self): + """Get the size of the dataset.""" + return self.length + + def __getitem__(self, i): + """Given index, get the data from unlabeled dataset and randomly sample + an item from labeled dataset. + + Return a dict containing data from labeled and unlabeled dataset. + """ + data = self.unlabeled_dataset[i] + rand_ind = np.random.randint(0, len(self.labeled_dataset)) + labeled_data = self.labeled_dataset[rand_ind] + data.update(labeled_data) + return data diff --git a/mmpose/datasets/datasets/bottom_up/__init__.py b/mmpose/datasets/datasets/bottom_up/__init__.py new file mode 100644 index 0000000..2ac7937 --- /dev/null +++ b/mmpose/datasets/datasets/bottom_up/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .bottom_up_aic import BottomUpAicDataset +from .bottom_up_coco import BottomUpCocoDataset +from .bottom_up_coco_wholebody import BottomUpCocoWholeBodyDataset +from .bottom_up_crowdpose import BottomUpCrowdPoseDataset +from .bottom_up_mhp import BottomUpMhpDataset + +__all__ = [ + 'BottomUpCocoDataset', 'BottomUpCrowdPoseDataset', 'BottomUpMhpDataset', + 'BottomUpAicDataset', 'BottomUpCocoWholeBodyDataset' +] diff --git a/mmpose/datasets/datasets/bottom_up/bottom_up_aic.py b/mmpose/datasets/datasets/bottom_up/bottom_up_aic.py new file mode 100644 index 0000000..e56b725 --- /dev/null +++ b/mmpose/datasets/datasets/bottom_up/bottom_up_aic.py @@ -0,0 +1,105 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import json_tricks as json +from mmcv import Config +from xtcocotools.cocoeval import COCOeval + +from mmpose.datasets.builder import DATASETS +from .bottom_up_coco import BottomUpCocoDataset + + +@DATASETS.register_module() +class BottomUpAicDataset(BottomUpCocoDataset): + """Aic dataset for bottom-up pose estimation. + + "AI Challenger : A Large-scale Dataset for Going Deeper + in Image Understanding", arXiv'2017. + More details can be found in the `paper + `__ + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + AIC keypoint indexes:: + + 0: "right_shoulder", + 1: "right_elbow", + 2: "right_wrist", + 3: "left_shoulder", + 4: "left_elbow", + 5: "left_wrist", + 6: "right_hip", + 7: "right_knee", + 8: "right_ankle", + 9: "left_hip", + 10: "left_knee", + 11: "left_ankle", + 12: "head_top", + 13: "neck" + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/aic.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(BottomUpCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + print(f'=> num_images: {self.num_images}') + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + with open(res_file, 'r') as file: + res_json = json.load(file) + if not res_json: + info_str = list(zip(stats_names, [ + 0, + ] * len(stats_names))) + return info_str + + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval( + self.coco, coco_det, 'keypoints', self.sigmas, use_area=False) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/bottom_up/bottom_up_base_dataset.py b/mmpose/datasets/datasets/bottom_up/bottom_up_base_dataset.py new file mode 100644 index 0000000..6a2fea5 --- /dev/null +++ b/mmpose/datasets/datasets/bottom_up/bottom_up_base_dataset.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from torch.utils.data import Dataset + + +class BottomUpBaseDataset(Dataset): + """This class has been deprecated and replaced by + Kpt2dSviewRgbImgBottomUpDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'BottomUpBaseDataset has been replaced by ' + 'Kpt2dSviewRgbImgBottomUpDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/663 for details.') + ) diff --git a/mmpose/datasets/datasets/bottom_up/bottom_up_coco.py b/mmpose/datasets/datasets/bottom_up/bottom_up_coco.py new file mode 100644 index 0000000..fa2967f --- /dev/null +++ b/mmpose/datasets/datasets/bottom_up/bottom_up_coco.py @@ -0,0 +1,305 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning +from xtcocotools.cocoeval import COCOeval + +from mmpose.core.post_processing import oks_nms, soft_oks_nms +from mmpose.datasets.builder import DATASETS +from mmpose.datasets.datasets.base import Kpt2dSviewRgbImgBottomUpDataset + + +@DATASETS.register_module() +class BottomUpCocoDataset(Kpt2dSviewRgbImgBottomUpDataset): + """COCO dataset for bottom-up pose estimation. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + COCO keypoint indexes:: + + 0: 'nose', + 1: 'left_eye', + 2: 'right_eye', + 3: 'left_ear', + 4: 'right_ear', + 5: 'left_shoulder', + 6: 'right_shoulder', + 7: 'left_elbow', + 8: 'right_elbow', + 9: 'left_wrist', + 10: 'right_wrist', + 11: 'left_hip', + 12: 'right_hip', + 13: 'left_knee', + 14: 'right_knee', + 15: 'left_ankle', + 16: 'right_ankle' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. 
+ test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/coco.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + print(f'=> num_images: {self.num_images}') + + def _get_single(self, idx): + """Get anno for a single image. + + Args: + idx (int): image idx + + Returns: + dict: info for model training + """ + coco = self.coco + img_id = self.img_ids[idx] + ann_ids = coco.getAnnIds(imgIds=img_id) + anno = coco.loadAnns(ann_ids) + + mask = self._get_mask(anno, idx) + anno = [ + obj.copy() for obj in anno + if obj['iscrowd'] == 0 or obj['num_keypoints'] > 0 + ] + + joints = self._get_joints(anno) + mask_list = [mask.copy() for _ in range(self.ann_info['num_scales'])] + joints_list = [ + joints.copy() for _ in range(self.ann_info['num_scales']) + ] + + db_rec = {} + db_rec['dataset'] = self.dataset_name + db_rec['image_file'] = osp.join(self.img_prefix, self.id2name[img_id]) + db_rec['mask'] = mask_list + db_rec['joints'] = joints_list + + return db_rec + + def _get_joints(self, anno): + """Get joints for all people in an image.""" + num_people = len(anno) + + if self.ann_info['scale_aware_sigma']: + joints = np.zeros((num_people, self.ann_info['num_joints'], 4), + dtype=np.float32) + else: + joints = np.zeros((num_people, self.ann_info['num_joints'], 3), + dtype=np.float32) + + for i, obj in enumerate(anno): + joints[i, :, :3] = \ + np.array(obj['keypoints']).reshape([-1, 3]) + if self.ann_info['scale_aware_sigma']: + # get person box + box = obj['bbox'] + size = max(box[2], box[3]) + sigma = size / self.base_size * self.base_sigma + if self.int_sigma: + sigma = int(np.ceil(sigma)) + assert sigma > 0, sigma + joints[i, :, 3] = sigma + + return joints + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mAP', **kwargs): + """Evaluate coco keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - num_people: P + - num_keypoints: K + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (list[np.ndarray(P, K, 3+tag_num)]): \ + Pose predictions for all people in images. + - scores (list[P]): List of person scores. + - image_path (list[str]): For example, ['coco/images/\ + val2017/000000397133.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model outputs. + + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'mAP'. + + Returns: + dict: Evaluation results for evaluation metric. 
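+
+        Example (a minimal, hypothetical sketch of the ``results`` list this
+        method expects; the zero-valued arrays, the tag dimension and the
+        ``dataset`` instance are placeholders, not real model outputs)::
+
+            import numpy as np
+
+            num_joints, tag_dim = 17, 1
+            # one image containing a single detected person
+            results = [dict(
+                preds=np.zeros((1, num_joints, 3 + tag_dim), dtype=np.float32),
+                scores=[1.0],
+                image_paths=['coco/images/val2017/000000397133.jpg'],
+                heatmap=None,  # listed above; not read by this method
+            )]
+            # with a constructed dataset instance one would then call:
+            # name_value = dataset.evaluate(results, metric='mAP')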
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['mAP'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + preds = [] + scores = [] + image_paths = [] + + for result in results: + preds.append(result['preds']) + scores.append(result['scores']) + image_paths.append(result['image_paths'][0]) + + kpts = defaultdict(list) + # iterate over images + for idx, _preds in enumerate(preds): + str_image_path = image_paths[idx] + image_id = self.name2id[osp.basename(str_image_path)] + # iterate over people + for idx_person, kpt in enumerate(_preds): + # use bbox area + area = (np.max(kpt[:, 0]) - np.min(kpt[:, 0])) * ( + np.max(kpt[:, 1]) - np.min(kpt[:, 1])) + + kpts[image_id].append({ + 'keypoints': kpt[:, 0:3], + 'score': scores[idx][idx_person], + 'tags': kpt[:, 3], + 'image_id': image_id, + 'area': area, + }) + + valid_kpts = [] + for img in kpts.keys(): + img_kpts = kpts[img] + if self.use_nms: + nms = soft_oks_nms if self.soft_nms else oks_nms + keep = nms(img_kpts, self.oks_thr, sigmas=self.sigmas) + valid_kpts.append([img_kpts[_keep] for _keep in keep]) + else: + valid_kpts.append(img_kpts) + + self._write_coco_keypoint_results(valid_kpts, res_file) + + info_str = self._do_python_keypoint_eval(res_file) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + def _write_coco_keypoint_results(self, keypoints, res_file): + """Write results into a json file.""" + data_pack = [{ + 'cat_id': self._class_to_coco_ind[cls], + 'cls_ind': cls_ind, + 'cls': cls, + 'ann_type': 'keypoints', + 'keypoints': keypoints + } for cls_ind, cls in enumerate(self.classes) + if not cls == '__background__'] + + results = self._coco_keypoint_results_one_category_kernel(data_pack[0]) + + with open(res_file, 'w') as f: + json.dump(results, f, sort_keys=True, indent=4) + + def _coco_keypoint_results_one_category_kernel(self, data_pack): + """Get coco keypoint results.""" + cat_id = data_pack['cat_id'] + keypoints = data_pack['keypoints'] + cat_results = [] + + for img_kpts in keypoints: + if len(img_kpts) == 0: + continue + + _key_points = np.array( + [img_kpt['keypoints'] for img_kpt in img_kpts]) + key_points = _key_points.reshape(-1, + self.ann_info['num_joints'] * 3) + + for img_kpt, key_point in zip(img_kpts, key_points): + kpt = key_point.reshape((self.ann_info['num_joints'], 3)) + left_top = np.amin(kpt, axis=0) + right_bottom = np.amax(kpt, axis=0) + + w = right_bottom[0] - left_top[0] + h = right_bottom[1] - left_top[1] + + cat_results.append({ + 'image_id': img_kpt['image_id'], + 'category_id': cat_id, + 'keypoints': key_point.tolist(), + 'score': img_kpt['score'], + 'bbox': [left_top[0], left_top[1], w, h] + }) + + return cat_results + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + with open(res_file, 'r') as file: + res_json = json.load(file) + if not res_json: + info_str = list(zip(stats_names, [ + 0, + ] * len(stats_names))) + return info_str + + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval(self.coco, coco_det, 'keypoints', self.sigmas) + 
coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/bottom_up/bottom_up_coco_wholebody.py b/mmpose/datasets/datasets/bottom_up/bottom_up_coco_wholebody.py new file mode 100644 index 0000000..363d2ef --- /dev/null +++ b/mmpose/datasets/datasets/bottom_up/bottom_up_coco_wholebody.py @@ -0,0 +1,238 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np +from mmcv import Config +from xtcocotools.cocoeval import COCOeval + +from mmpose.datasets.builder import DATASETS +from .bottom_up_coco import BottomUpCocoDataset + + +@DATASETS.register_module() +class BottomUpCocoWholeBodyDataset(BottomUpCocoDataset): + """CocoWholeBodyDataset dataset for bottom-up pose estimation. + + `Whole-Body Human Pose Estimation in the Wild', ECCV'2020. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + In total, we have 133 keypoints for wholebody pose estimation. + + COCO-WholeBody keypoint indexes:: + + 0-16: 17 body keypoints, + 17-22: 6 foot keypoints, + 23-90: 68 face keypoints, + 91-132: 42 hand keypoints + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/coco_wholebody.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(BottomUpCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + + self.body_num = 17 + self.foot_num = 6 + self.face_num = 68 + self.left_hand_num = 21 + self.right_hand_num = 21 + + print(f'=> num_images: {self.num_images}') + + def _get_joints(self, anno): + """Get joints for all people in an image.""" + num_people = len(anno) + + if self.ann_info['scale_aware_sigma']: + joints = np.zeros((num_people, self.ann_info['num_joints'], 4), + dtype=np.float32) + else: + joints = np.zeros((num_people, self.ann_info['num_joints'], 3), + dtype=np.float32) + + for i, obj in enumerate(anno): + keypoints = np.array(obj['keypoints'] + obj['foot_kpts'] + + obj['face_kpts'] + obj['lefthand_kpts'] + + obj['righthand_kpts']).reshape(-1, 3) + + joints[i, :self.ann_info['num_joints'], :3] = keypoints + if self.ann_info['scale_aware_sigma']: + # get person box + box = obj['bbox'] + size = max(box[2], box[3]) + sigma = size / self.base_size * self.base_sigma + if self.int_sigma: + sigma = int(np.ceil(sigma)) + assert sigma > 0, sigma + joints[i, :, 3] = sigma + + return joints + + def _coco_keypoint_results_one_category_kernel(self, data_pack): + """Get coco keypoint results.""" + cat_id = data_pack['cat_id'] + keypoints = data_pack['keypoints'] + cat_results = [] + + for img_kpts in keypoints: + if len(img_kpts) == 0: + continue + + _key_points = np.array( + [img_kpt['keypoints'] for img_kpt in img_kpts]) + key_points = _key_points.reshape(-1, + self.ann_info['num_joints'] * 3) + + cuts = np.cumsum([ + 0, self.body_num, self.foot_num, self.face_num, + self.left_hand_num, self.right_hand_num + ]) * 3 + + for img_kpt, key_point in zip(img_kpts, key_points): + kpt = key_point.reshape((self.ann_info['num_joints'], 3)) + left_top = np.amin(kpt, axis=0) + right_bottom = np.amax(kpt, axis=0) + + w = right_bottom[0] - left_top[0] + h = right_bottom[1] - left_top[1] + + cat_results.append({ + 'image_id': + img_kpt['image_id'], + 'category_id': + cat_id, + 'keypoints': + key_point[cuts[0]:cuts[1]].tolist(), + 'foot_kpts': + key_point[cuts[1]:cuts[2]].tolist(), + 'face_kpts': + key_point[cuts[2]:cuts[3]].tolist(), + 'lefthand_kpts': + key_point[cuts[3]:cuts[4]].tolist(), + 'righthand_kpts': + key_point[cuts[4]:cuts[5]].tolist(), + 'score': + img_kpt['score'], + 'bbox': [left_top[0], left_top[1], w, h] + }) + + return cat_results + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + + cuts = np.cumsum([ + 0, self.body_num, self.foot_num, self.face_num, self.left_hand_num, + self.right_hand_num + ]) + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_body', + self.sigmas[cuts[0]:cuts[1]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_foot', + self.sigmas[cuts[1]:cuts[2]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_face', + self.sigmas[cuts[2]:cuts[3]], + use_area=True) + coco_eval.params.useSegm = 
None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_lefthand', + self.sigmas[cuts[3]:cuts[4]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_righthand', + self.sigmas[cuts[4]:cuts[5]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_wholebody', + self.sigmas, + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/bottom_up/bottom_up_crowdpose.py b/mmpose/datasets/datasets/bottom_up/bottom_up_crowdpose.py new file mode 100644 index 0000000..ebabf3e --- /dev/null +++ b/mmpose/datasets/datasets/bottom_up/bottom_up_crowdpose.py @@ -0,0 +1,109 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import json_tricks as json +from mmcv import Config +from xtcocotools.cocoeval import COCOeval + +from mmpose.datasets.builder import DATASETS +from .bottom_up_coco import BottomUpCocoDataset + + +@DATASETS.register_module() +class BottomUpCrowdPoseDataset(BottomUpCocoDataset): + """CrowdPose dataset for bottom-up pose estimation. + + "CrowdPose: Efficient Crowded Scenes Pose Estimation and + A New Benchmark", CVPR'2019. + More details can be found in the `paper + `__. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + CrowdPose keypoint indexes:: + + 0: 'left_shoulder', + 1: 'right_shoulder', + 2: 'left_elbow', + 3: 'right_elbow', + 4: 'left_wrist', + 5: 'right_wrist', + 6: 'left_hip', + 7: 'right_hip', + 8: 'left_knee', + 9: 'right_knee', + 10: 'left_ankle', + 11: 'right_ankle', + 12: 'top_head', + 13: 'neck' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/crowdpose.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(BottomUpCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + print(f'=> num_images: {self.num_images}') + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AR', 'AR .5', 'AR .75', 'AP(E)', 'AP(M)', + 'AP(H)' + ] + + with open(res_file, 'r') as file: + res_json = json.load(file) + if not res_json: + info_str = list(zip(stats_names, [ + 0, + ] * len(stats_names))) + return info_str + + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_crowd', + self.sigmas, + use_area=False) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/bottom_up/bottom_up_mhp.py b/mmpose/datasets/datasets/bottom_up/bottom_up_mhp.py new file mode 100644 index 0000000..1438123 --- /dev/null +++ b/mmpose/datasets/datasets/bottom_up/bottom_up_mhp.py @@ -0,0 +1,108 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import json_tricks as json +from mmcv import Config +from xtcocotools.cocoeval import COCOeval + +from mmpose.datasets.builder import DATASETS +from .bottom_up_coco import BottomUpCocoDataset + + +@DATASETS.register_module() +class BottomUpMhpDataset(BottomUpCocoDataset): + """MHPv2.0 dataset for top-down pose estimation. + + "Understanding Humans in Crowded Scenes: Deep Nested Adversarial + Learning and A New Benchmark for Multi-Human Parsing", ACM MM'2018. + More details can be found in the `paper + `__ + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + MHP keypoint indexes:: + + 0: "right ankle", + 1: "right knee", + 2: "right hip", + 3: "left hip", + 4: "left knee", + 5: "left ankle", + 6: "pelvis", + 7: "thorax", + 8: "upper neck", + 9: "head top", + 10: "right wrist", + 11: "right elbow", + 12: "right shoulder", + 13: "left shoulder", + 14: "left elbow", + 15: "left wrist", + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/mhp.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(BottomUpCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + print(f'=> num_images: {self.num_images}') + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + with open(res_file, 'r') as file: + res_json = json.load(file) + if not res_json: + info_str = list(zip(stats_names, [ + 0, + ] * len(stats_names))) + return info_str + + coco_det = self.coco.loadRes(res_file) + + coco_eval = COCOeval( + self.coco, coco_det, 'keypoints', self.sigmas, use_area=False) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/face/__init__.py b/mmpose/datasets/datasets/face/__init__.py new file mode 100644 index 0000000..1ba42d4 --- /dev/null +++ b/mmpose/datasets/datasets/face/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .face_300w_dataset import Face300WDataset +from .face_aflw_dataset import FaceAFLWDataset +from .face_coco_wholebody_dataset import FaceCocoWholeBodyDataset +from .face_cofw_dataset import FaceCOFWDataset +from .face_wflw_dataset import FaceWFLWDataset + +__all__ = [ + 'Face300WDataset', 'FaceAFLWDataset', 'FaceWFLWDataset', 'FaceCOFWDataset', + 'FaceCocoWholeBodyDataset' +] diff --git a/mmpose/datasets/datasets/face/face_300w_dataset.py b/mmpose/datasets/datasets/face/face_300w_dataset.py new file mode 100644 index 0000000..e5b602e --- /dev/null +++ b/mmpose/datasets/datasets/face/face_300w_dataset.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class Face300WDataset(Kpt2dSviewRgbImgTopDownDataset): + """Face300W dataset for top-down face keypoint localization. + + "300 faces In-the-wild challenge: Database and results", + Image and Vision Computing (IMAVIS) 2019. + + The dataset loads raw images and apply specified transforms + to return a dict containing the image tensors and other information. + + The landmark annotations follow the 68 points mark-up. The definition + can be found in `https://ibug.doc.ic.ac.uk/resources/300-W/`. + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/300w.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + if 'center' in obj and 'scale' in obj: + center = np.array(obj['center']) + scale = np.array([obj['scale'], obj['scale']]) * 1.25 + else: + center, scale = self._xywh2cs(*obj['bbox'][:4], 1.25) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + def _get_normalize_factor(self, gts, *args, **kwargs): + """Get inter-ocular distance as the normalize factor, measured as the + Euclidean distance between the outer corners of the eyes. + + Args: + gts (np.ndarray[N, K, 2]): Groundtruth keypoint location. + + Returns: + np.ndarray[N, 2]: normalized factor + """ + + interocular = np.linalg.norm( + gts[:, 36, :] - gts[:, 45, :], axis=1, keepdims=True) + return np.tile(interocular, [1, 2]) + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='NME', **kwargs): + """Evaluate freihand keypoint results. The pose prediction results will + be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[1,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[1,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_path (list[str]): For example, ['300W/ibug/\ + image_018.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'NME'. + + Returns: + dict: Evaluation results for evaluation metric. 
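+
+        Example (a minimal, hypothetical sketch of the ``results`` list this
+        method expects; the zero-valued arrays and the ``dataset`` instance
+        are placeholders, not real predictions)::
+
+            import numpy as np
+
+            num_joints = 68  # 300W follows the 68-point mark-up
+            results = [dict(
+                preds=np.zeros((1, num_joints, 3), dtype=np.float32),
+                boxes=np.zeros((1, 6), dtype=np.float32),
+                image_paths=['300W/ibug/image_018.jpg'],
+                bbox_ids=[0],
+            )]
+            # with a constructed dataset instance one would then call:
+            # name_value = dataset.evaluate(results, metric='NME')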
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/face/face_aflw_dataset.py b/mmpose/datasets/datasets/face/face_aflw_dataset.py new file mode 100644 index 0000000..292d9ee --- /dev/null +++ b/mmpose/datasets/datasets/face/face_aflw_dataset.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class FaceAFLWDataset(Kpt2dSviewRgbImgTopDownDataset): + """Face AFLW dataset for top-down face keypoint localization. + + "Annotated Facial Landmarks in the Wild: A Large-scale, + Real-world Database for Facial Landmark Localization". + In Proc. First IEEE International Workshop on Benchmarking + Facial Image Analysis Technologies, 2011. + + The dataset loads raw images and apply specified transforms + to return a dict containing the image tensors and other information. + + The landmark annotations follow the 19 points mark-up. The definition + can be found in `https://www.tugraz.at/institute/icg/research` + `/team-bischof/lrs/downloads/aflw/` + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/aflw.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if self.test_mode: + # 'box_size' is used as normalization factor + assert 'box_size' in obj + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + if 'center' in obj and 'scale' in obj: + center = np.array(obj['center']) + scale = np.array([obj['scale'], obj['scale']]) * 1.25 + else: + center, scale = self._xywh2cs(*obj['bbox'][:4], 1.25) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'box_size': obj['box_size'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + def _get_normalize_factor(self, box_sizes, *args, **kwargs): + """Get normalize factor for evaluation. + + Args: + box_sizes (np.ndarray[N, 1]): box size + + Returns: + np.ndarray[N, 2]: normalized factor + """ + + return np.tile(box_sizes, [1, 2]) + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='NME', **kwargs): + """Evaluate freihand keypoint results. The pose prediction results will + be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[1,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[1,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_path (list[str]): For example, ['aflw/images/flickr/ \ + 0/image00002.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'NME'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/face/face_base_dataset.py b/mmpose/datasets/datasets/face/face_base_dataset.py new file mode 100644 index 0000000..466fabb --- /dev/null +++ b/mmpose/datasets/datasets/face/face_base_dataset.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + +from torch.utils.data import Dataset + + +class FaceBaseDataset(Dataset, metaclass=ABCMeta): + """This class has been deprecated and replaced by + Kpt2dSviewRgbImgTopDownDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'FaceBaseDataset has been replaced by ' + 'Kpt2dSviewRgbImgTopDownDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/663 for details.') + ) diff --git a/mmpose/datasets/datasets/face/face_coco_wholebody_dataset.py b/mmpose/datasets/datasets/face/face_coco_wholebody_dataset.py new file mode 100644 index 0000000..ef5117a --- /dev/null +++ b/mmpose/datasets/datasets/face/face_coco_wholebody_dataset.py @@ -0,0 +1,198 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class FaceCocoWholeBodyDataset(Kpt2dSviewRgbImgTopDownDataset): + """CocoWholeBodyDataset for face keypoint localization. + + `Whole-Body Human Pose Estimation in the Wild', ECCV'2020. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + The face landmark annotations follow the 68 points mark-up. + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. 
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/' + 'coco_wholebody_face.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if obj['face_valid'] and max(obj['face_kpts']) > 0: + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), + dtype=np.float32) + + keypoints = np.array(obj['face_kpts']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['face_box'][:4], 1.25) + + image_file = osp.join(self.img_prefix, + self.id2name[img_id]) + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['face_box'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + def _get_normalize_factor(self, gts, *args, **kwargs): + """Get inter-ocular distance as the normalize factor, measured as the + Euclidean distance between the outer corners of the eyes. + + Args: + gts (np.ndarray[N, K, 2]): Groundtruth keypoint location. + + Returns: + np.ndarray[N, 2]: normalized factor + """ + + interocular = np.linalg.norm( + gts[:, 36, :] - gts[:, 45, :], axis=1, keepdims=True) + return np.tile(interocular, [1, 2]) + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='NME', **kwargs): + """Evaluate COCO-WholeBody Face keypoint results. The pose prediction + results will be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[1,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[1,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_path (list[str]): For example, ['coco/train2017/\ + 000000000009.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'NME'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/face/face_cofw_dataset.py b/mmpose/datasets/datasets/face/face_cofw_dataset.py new file mode 100644 index 0000000..456ea0e --- /dev/null +++ b/mmpose/datasets/datasets/face/face_cofw_dataset.py @@ -0,0 +1,198 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class FaceCOFWDataset(Kpt2dSviewRgbImgTopDownDataset): + """Face COFW dataset for top-down face keypoint localization. + + "Robust face landmark estimation under occlusion", ICCV'2013. + + The dataset loads raw images and apply specified transforms + to return a dict containing the image tensors and other information. + + The landmark annotations follow the 29 points mark-up. The definition + can be found in `http://www.vision.caltech.edu/xpburgos/ICCV13/`. + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/cofw.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + if 'center' in obj and 'scale' in obj: + center = np.array(obj['center']) + scale = np.array([obj['scale'], obj['scale']]) * 1.25 + else: + center, scale = self._xywh2cs(*obj['bbox'][:4], 1.25) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + def _get_normalize_factor(self, gts, *args, **kwargs): + """Get normalize factor for evaluation. + + Args: + gts (np.ndarray[N, K, 2]): Groundtruth keypoint location. + + Returns: + np.ndarray[N, 2]: normalized factor + """ + + interocular = np.linalg.norm( + gts[:, 8, :] - gts[:, 9, :], axis=1, keepdims=True) + return np.tile(interocular, [1, 2]) + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='NME', **kwargs): + """Evaluate freihand keypoint results. The pose prediction results will + be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[1,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[1,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_path (list[str]): For example, ['cofw/images/\ + 000001.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'NME'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/face/face_wflw_dataset.py b/mmpose/datasets/datasets/face/face_wflw_dataset.py new file mode 100644 index 0000000..e4611e1 --- /dev/null +++ b/mmpose/datasets/datasets/face/face_wflw_dataset.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class FaceWFLWDataset(Kpt2dSviewRgbImgTopDownDataset): + """Face WFLW dataset for top-down face keypoint localization. + + "Look at Boundary: A Boundary-Aware Face Alignment Algorithm", + CVPR'2018. + + The dataset loads raw images and apply specified transforms + to return a dict containing the image tensors and other information. + + The landmark annotations follow the 98 points mark-up. The definition + can be found in `https://wywu.github.io/projects/LAB/WFLW.html`. + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/wflw.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + if 'center' in obj and 'scale' in obj: + center = np.array(obj['center']) + scale = np.array([obj['scale'], obj['scale']]) * 1.25 + else: + center, scale = self._xywh2cs(*obj['bbox'][:4], 1.25) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + def _get_normalize_factor(self, gts, *args, **kwargs): + """Get normalize factor for evaluation. + + Args: + gts (np.ndarray[N, K, 2]): Groundtruth keypoint location. + + Returns: + np.ndarray[N, 2]: normalized factor + """ + + interocular = np.linalg.norm( + gts[:, 60, :] - gts[:, 72, :], axis=1, keepdims=True) + return np.tile(interocular, [1, 2]) + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='NME', **kwargs): + """Evaluate freihand keypoint results. The pose prediction results will + be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[1,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[1,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_path (list[str]): For example, ['wflw/images/\ + 0--Parade/0_Parade_marchingband_1_1015.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'NME'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/fashion/__init__.py b/mmpose/datasets/datasets/fashion/__init__.py new file mode 100644 index 0000000..575d6ed --- /dev/null +++ b/mmpose/datasets/datasets/fashion/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .deepfashion_dataset import DeepFashionDataset + +__all__ = ['DeepFashionDataset'] diff --git a/mmpose/datasets/datasets/fashion/deepfashion_dataset.py b/mmpose/datasets/datasets/fashion/deepfashion_dataset.py new file mode 100644 index 0000000..0fef655 --- /dev/null +++ b/mmpose/datasets/datasets/fashion/deepfashion_dataset.py @@ -0,0 +1,225 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class DeepFashionDataset(Kpt2dSviewRgbImgTopDownDataset): + """DeepFashion dataset (full-body clothes) for fashion landmark detection. + + "DeepFashion: Powering Robust Clothes Recognition + and Retrieval with Rich Annotations", CVPR'2016. + "Fashion Landmark Detection in the Wild", ECCV'2016. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + The dataset contains 3 categories for full-body, upper-body and lower-body. + + Fashion landmark indexes for upper-body clothes:: + + 0: 'left collar', + 1: 'right collar', + 2: 'left sleeve', + 3: 'right sleeve', + 4: 'left hem', + 5: 'right hem' + + Fashion landmark indexes for lower-body clothes:: + + 0: 'left waistline', + 1: 'right waistline', + 2: 'left hem', + 3: 'right hem' + + Fashion landmark indexes for full-body clothes:: + + 0: 'left collar', + 1: 'right collar', + 2: 'left sleeve', + 3: 'right sleeve', + 4: 'left waistline', + 5: 'right waistline', + 6: 'left hem', + 7: 'right hem' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. 
+ test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + subset='', + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + if subset != '': + warnings.warn( + 'subset is deprecated.' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + if subset == 'upper': + cfg = Config.fromfile( + 'configs/_base_/datasets/deepfashion_upper.py') + dataset_info = cfg._cfg_dict['dataset_info'] + elif subset == 'lower': + cfg = Config.fromfile( + 'configs/_base_/datasets/deepfashion_lower.py') + dataset_info = cfg._cfg_dict['dataset_info'] + elif subset == 'full': + cfg = Config.fromfile( + 'configs/_base_/datasets/deepfashion_full.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # use 1.25bbox as input + center, scale = self._xywh2cs(*obj['bbox'][:4], 1.25) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate freihand keypoint results. The pose prediction results will + be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['img_00000001.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
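# Minimal sketch of the `_xywh2cs(*obj['bbox'][:4], 1.25)` call used in
# `_get_db` above: a COCO-style (x, y, w, h) box becomes a center plus a scale
# expressed in units of pixel_std=200, padded by 1.25 and snapped to the
# network input aspect ratio.  This is an approximation under the usual
# top-down conventions; the real helper lives in the shared top-down base
# class and also jitters the center during training.
import numpy as np


def xywh_to_center_scale(x, y, w, h, padding=1.25,
                         aspect_ratio=192 / 256, pixel_std=200.0):
    center = np.array([x + 0.5 * w, y + 0.5 * h], dtype=np.float32)
    # keep the aspect ratio of the model input (e.g. 192x256)
    if w > aspect_ratio * h:
        h = w / aspect_ratio
    else:
        w = h * aspect_ratio
    scale = np.array([w, h], dtype=np.float32) / pixel_std * padding
    return center, scale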
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/fashion/fashion_base_dataset.py b/mmpose/datasets/datasets/fashion/fashion_base_dataset.py new file mode 100644 index 0000000..d4e5860 --- /dev/null +++ b/mmpose/datasets/datasets/fashion/fashion_base_dataset.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + +from torch.utils.data import Dataset + + +class FashionBaseDataset(Dataset, metaclass=ABCMeta): + """This class has been deprecated and replaced by + Kpt2dSviewRgbImgTopDownDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'FashionBaseDataset has been replaced by ' + 'Kpt2dSviewRgbImgTopDownDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/663 for details.') + ) diff --git a/mmpose/datasets/datasets/hand/__init__.py b/mmpose/datasets/datasets/hand/__init__.py new file mode 100644 index 0000000..49159af --- /dev/null +++ b/mmpose/datasets/datasets/hand/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .freihand_dataset import FreiHandDataset +from .hand_coco_wholebody_dataset import HandCocoWholeBodyDataset +from .interhand2d_dataset import InterHand2DDataset +from .interhand3d_dataset import InterHand3DDataset +from .onehand10k_dataset import OneHand10KDataset +from .panoptic_hand2d_dataset import PanopticDataset +from .rhd2d_dataset import Rhd2DDataset + +__all__ = [ + 'FreiHandDataset', 'InterHand2DDataset', 'InterHand3DDataset', + 'OneHand10KDataset', 'PanopticDataset', 'Rhd2DDataset', + 'HandCocoWholeBodyDataset' +] diff --git a/mmpose/datasets/datasets/hand/freihand_dataset.py b/mmpose/datasets/datasets/hand/freihand_dataset.py new file mode 100644 index 0000000..e9ceeff --- /dev/null +++ b/mmpose/datasets/datasets/hand/freihand_dataset.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class FreiHandDataset(Kpt2dSviewRgbImgTopDownDataset): + """FreiHand dataset for top-down hand pose estimation. 
+ + "FreiHAND: A Dataset for Markerless Capture of Hand Pose + and Shape from Single RGB Images", ICCV'2019. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + FreiHand keypoint indexes:: + + 0: 'wrist', + 1: 'thumb1', + 2: 'thumb2', + 3: 'thumb3', + 4: 'thumb4', + 5: 'forefinger1', + 6: 'forefinger2', + 7: 'forefinger3', + 8: 'forefinger4', + 9: 'middle_finger1', + 10: 'middle_finger2', + 11: 'middle_finger3', + 12: 'middle_finger4', + 13: 'ring_finger1', + 14: 'ring_finger2', + 15: 'ring_finger3', + 16: 'ring_finger4', + 17: 'pinky_finger1', + 18: 'pinky_finger2', + 19: 'pinky_finger3', + 20: 'pinky_finger4' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/freihand2d.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # the ori image is 224x224 + center, scale = self._xywh2cs(0, 0, 224, 224, 0.8) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate freihand keypoint results. The pose prediction results will + be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. 
+ - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['training/rgb/\ + 00031426.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/hand/hand_base_dataset.py b/mmpose/datasets/datasets/hand/hand_base_dataset.py new file mode 100644 index 0000000..fd20846 --- /dev/null +++ b/mmpose/datasets/datasets/hand/hand_base_dataset.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + +from torch.utils.data import Dataset + + +class HandBaseDataset(Dataset, metaclass=ABCMeta): + """This class has been deprecated and replaced by + Kpt2dSviewRgbImgTopDownDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'HandBaseDataset has been replaced by ' + 'Kpt2dSviewRgbImgTopDownDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/663 for details.') + ) diff --git a/mmpose/datasets/datasets/hand/hand_coco_wholebody_dataset.py b/mmpose/datasets/datasets/hand/hand_coco_wholebody_dataset.py new file mode 100644 index 0000000..7c95cc0 --- /dev/null +++ b/mmpose/datasets/datasets/hand/hand_coco_wholebody_dataset.py @@ -0,0 +1,211 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class HandCocoWholeBodyDataset(Kpt2dSviewRgbImgTopDownDataset): + """CocoWholeBodyDataset for top-down hand pose estimation. + + "Whole-Body Human Pose Estimation in the Wild", ECCV'2020. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. 
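# Illustrative sketch of the res_folder / temporary-folder handling shared by
# the evaluate() methods in these datasets: predictions are serialized to
# result_keypoints.json, scored, and any temporary directory is cleaned up.
# `score_fn` stands in for `_report_metric` and is an assumption.
import json
import os.path as osp
import tempfile
from collections import OrderedDict


def evaluate_sketch(kpts, score_fn, res_folder=None):
    if res_folder is not None:
        tmp_folder = None
        res_file = osp.join(res_folder, 'result_keypoints.json')
    else:
        tmp_folder = tempfile.TemporaryDirectory()
        res_file = osp.join(tmp_folder.name, 'result_keypoints.json')

    with open(res_file, 'w') as f:
        json.dump(kpts, f)

    name_value = OrderedDict(score_fn(res_file))

    if tmp_folder is not None:
        tmp_folder.cleanup()
    return name_value


# toy usage: the "metric" is simply the number of predictions written
print(evaluate_sketch([{'keypoints': [[0, 0, 1]]}],
                      lambda path: [('num_preds', 1)]))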
+ + COCO-WholeBody Hand keypoint indexes:: + + 0: 'wrist', + 1: 'thumb1', + 2: 'thumb2', + 3: 'thumb3', + 4: 'thumb4', + 5: 'forefinger1', + 6: 'forefinger2', + 7: 'forefinger3', + 8: 'forefinger4', + 9: 'middle_finger1', + 10: 'middle_finger2', + 11: 'middle_finger3', + 12: 'middle_finger4', + 13: 'ring_finger1', + 14: 'ring_finger2', + 15: 'ring_finger3', + 16: 'ring_finger4', + 17: 'pinky_finger1', + 18: 'pinky_finger2', + 19: 'pinky_finger3', + 20: 'pinky_finger4' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile( + 'configs/_base_/datasets/coco_wholebody_hand.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + for type in ['left', 'right']: + if obj[f'{type}hand_valid'] and max( + obj[f'{type}hand_kpts']) > 0: + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), + dtype=np.float32) + + keypoints = np.array(obj[f'{type}hand_kpts']).reshape( + -1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum( + 1, keypoints[:, 2:3]) + + # use 1.25 padded bbox as input + center, scale = self._xywh2cs( + *obj[f'{type}hand_box'][:4], 1.25) + + image_file = osp.join(self.img_prefix, + self.id2name[img_id]) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj[f'{type}hand_box'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate COCO-WholeBody Hand keypoint results. The pose prediction + results will be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. 
+ - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['Test/source/0.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/hand/interhand2d_dataset.py b/mmpose/datasets/datasets/hand/interhand2d_dataset.py new file mode 100644 index 0000000..fea17fa --- /dev/null +++ b/mmpose/datasets/datasets/hand/interhand2d_dataset.py @@ -0,0 +1,306 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class InterHand2DDataset(Kpt2dSviewRgbImgTopDownDataset): + """InterHand2.6M 2D dataset for top-down hand pose estimation. + + "InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose + Estimation from a Single RGB Image", ECCV'2020. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + InterHand2.6M keypoint indexes:: + + 0: 'thumb4', + 1: 'thumb3', + 2: 'thumb2', + 3: 'thumb1', + 4: 'forefinger4', + 5: 'forefinger3', + 6: 'forefinger2', + 7: 'forefinger1', + 8: 'middle_finger4', + 9: 'middle_finger3', + 10: 'middle_finger2', + 11: 'middle_finger1', + 12: 'ring_finger4', + 13: 'ring_finger3', + 14: 'ring_finger2', + 15: 'ring_finger1', + 16: 'pinky_finger4', + 17: 'pinky_finger3', + 18: 'pinky_finger2', + 19: 'pinky_finger1', + 20: 'wrist' + + Args: + ann_file (str): Path to the annotation file. + camera_file (str): Path to the camera file. + joint_file (str): Path to the joint file. + img_prefix (str): Path to a directory where images are held. + Default: None. 
+ data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (str): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + camera_file, + joint_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/interhand2d.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.camera_file = camera_file + self.joint_file = joint_file + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + @staticmethod + def _cam2pixel(cam_coord, f, c): + """Transform the joints from their camera coordinates to their pixel + coordinates. + + Note: + - N: number of joints + + Args: + cam_coord (ndarray[N, 3]): 3D joints coordinates + in the camera coordinate system + f (ndarray[2]): focal length of x and y axis + c (ndarray[2]): principal point of x and y axis + + Returns: + img_coord (ndarray[N, 3]): the coordinates (x, y, 0) + in the image plane. + """ + x = cam_coord[:, 0] / (cam_coord[:, 2] + 1e-8) * f[0] + c[0] + y = cam_coord[:, 1] / (cam_coord[:, 2] + 1e-8) * f[1] + c[1] + z = np.zeros_like(x) + img_coord = np.concatenate((x[:, None], y[:, None], z[:, None]), 1) + return img_coord + + @staticmethod + def _world2cam(world_coord, R, T): + """Transform the joints from their world coordinates to their camera + coordinates. + + Note: + - N: number of joints + + Args: + world_coord (ndarray[3, N]): 3D joints coordinates + in the world coordinate system + R (ndarray[3, 3]): camera rotation matrix + T (ndarray[3]): camera position (x, y, z) + + Returns: + cam_coord (ndarray[3, N]): 3D joints coordinates + in the camera coordinate system + """ + cam_coord = np.dot(R, world_coord - T) + return cam_coord + + def _get_db(self): + """Load dataset. + + Adapted from 'https://github.com/facebookresearch/InterHand2.6M/' + 'blob/master/data/InterHand2.6M/dataset.py' + Copyright (c) FaceBook Research, under CC-BY-NC 4.0 license. 
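# Worked sketch of the two camera transforms defined above: a world-space
# joint is moved into the camera frame with rotation R and position T, then
# projected with the pinhole model using focal lengths f and principal
# point c.  All numbers are made up for illustration.
import numpy as np

R = np.eye(3, dtype=np.float32)                 # camera rotation matrix
T = np.zeros((3, 1), dtype=np.float32)          # camera position (x, y, z)
f = np.array([1500., 1500.], dtype=np.float32)  # focal lengths (fx, fy)
c = np.array([256., 256.], dtype=np.float32)    # principal point (cx, cy)

joint_world = np.array([[100., -50., 900.]], dtype=np.float32)  # (N, 3)

# world -> camera, matching _world2cam: cam = R @ (world - T) on (3, N)
joint_cam = (R @ (joint_world.T - T)).T                          # (N, 3)

# camera -> pixel, matching _cam2pixel: x = X/Z * fx + cx, y = Y/Z * fy + cy
x = joint_cam[:, 0] / (joint_cam[:, 2] + 1e-8) * f[0] + c[0]
y = joint_cam[:, 1] / (joint_cam[:, 2] + 1e-8) * f[1] + c[1]
joint_img = np.stack([x, y], axis=1)                             # (N, 2)
print(joint_img)   # approx. [[422.7, 172.7]]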
+ """ + with open(self.camera_file, 'r') as f: + cameras = json.load(f) + with open(self.joint_file, 'r') as f: + joints = json.load(f) + gt_db = [] + bbox_id = 0 + for img_id in self.img_ids: + num_joints = self.ann_info['num_joints'] + + ann_id = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + ann = self.coco.loadAnns(ann_id)[0] + img = self.coco.loadImgs(img_id)[0] + + capture_id = str(img['capture']) + camera_name = img['camera'] + frame_idx = str(img['frame_idx']) + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + camera_pos, camera_rot = np.array( + cameras[capture_id]['campos'][camera_name], + dtype=np.float32), np.array( + cameras[capture_id]['camrot'][camera_name], + dtype=np.float32) + focal, principal_pt = np.array( + cameras[capture_id]['focal'][camera_name], + dtype=np.float32), np.array( + cameras[capture_id]['princpt'][camera_name], + dtype=np.float32) + joint_world = np.array( + joints[capture_id][frame_idx]['world_coord'], dtype=np.float32) + joint_cam = self._world2cam( + joint_world.transpose(1, 0), camera_rot, + camera_pos.reshape(3, 1)).transpose(1, 0) + joint_img = self._cam2pixel(joint_cam, focal, principal_pt)[:, :2] + joint_img = joint_img.reshape(2, -1, 2) + + joint_valid = np.array( + ann['joint_valid'], dtype=np.float32).reshape(2, -1) + # if root is not valid -> root-relative 3D pose is also not valid. + # Therefore, mark all joints as invalid + for hand in range(2): + joint_valid[hand, :] *= joint_valid[hand][-1] + + if np.sum(joint_valid[hand, :]) > 11: + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), + dtype=np.float32) + joints_3d[:, :2] = joint_img[hand, :, :] + joints_3d_visible[:, :2] = np.minimum( + 1, joint_valid[hand, :].reshape(-1, 1)) + + # use the tightest bbox enclosing all keypoints as bbox + bbox = [img['width'], img['height'], 0, 0] + for i in range(num_joints): + if joints_3d_visible[i][0]: + bbox[0] = min(bbox[0], joints_3d[i][0]) + bbox[1] = min(bbox[1], joints_3d[i][1]) + bbox[2] = max(bbox[2], joints_3d[i][0]) + bbox[3] = max(bbox[3], joints_3d[i][1]) + + bbox[2] -= bbox[0] + bbox[3] -= bbox[1] + + # use 1.5bbox as input + center, scale = self._xywh2cs(*bbox, 1.5) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': bbox, + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate interhand2d keypoint results. The pose prediction results + will be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['Capture12/\ + 0390_dh_touchROM/cam410209/image62434.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. 
+ metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/hand/interhand3d_dataset.py b/mmpose/datasets/datasets/hand/interhand3d_dataset.py new file mode 100644 index 0000000..318d73f --- /dev/null +++ b/mmpose/datasets/datasets/hand/interhand3d_dataset.py @@ -0,0 +1,505 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.core.evaluation.top_down_eval import keypoint_epe +from mmpose.datasets.builder import DATASETS +from ..base import Kpt3dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class InterHand3DDataset(Kpt3dSviewRgbImgTopDownDataset): + """InterHand2.6M 3D dataset for top-down hand pose estimation. + + "InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose + Estimation from a Single RGB Image", ECCV'2020. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + InterHand2.6M keypoint indexes:: + + 0: 'r_thumb4', + 1: 'r_thumb3', + 2: 'r_thumb2', + 3: 'r_thumb1', + 4: 'r_index4', + 5: 'r_index3', + 6: 'r_index2', + 7: 'r_index1', + 8: 'r_middle4', + 9: 'r_middle3', + 10: 'r_middle2', + 11: 'r_middle1', + 12: 'r_ring4', + 13: 'r_ring3', + 14: 'r_ring2', + 15: 'r_ring1', + 16: 'r_pinky4', + 17: 'r_pinky3', + 18: 'r_pinky2', + 19: 'r_pinky1', + 20: 'r_wrist', + 21: 'l_thumb4', + 22: 'l_thumb3', + 23: 'l_thumb2', + 24: 'l_thumb1', + 25: 'l_index4', + 26: 'l_index3', + 27: 'l_index2', + 28: 'l_index1', + 29: 'l_middle4', + 30: 'l_middle3', + 31: 'l_middle2', + 32: 'l_middle1', + 33: 'l_ring4', + 34: 'l_ring3', + 35: 'l_ring2', + 36: 'l_ring1', + 37: 'l_pinky4', + 38: 'l_pinky3', + 39: 'l_pinky2', + 40: 'l_pinky1', + 41: 'l_wrist' + + Args: + ann_file (str): Path to the annotation file. + camera_file (str): Path to the camera file. + joint_file (str): Path to the joint file. + img_prefix (str): Path to a directory where images are held. + Default: None. 
+ data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + use_gt_root_depth (bool): Using the ground truth depth of the wrist + or given depth from rootnet_result_file. + rootnet_result_file (str): Path to the wrist depth file. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (str): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + camera_file, + joint_file, + img_prefix, + data_cfg, + pipeline, + use_gt_root_depth=True, + rootnet_result_file=None, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/interhand3d.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['heatmap3d_depth_bound'] = data_cfg[ + 'heatmap3d_depth_bound'] + self.ann_info['heatmap_size_root'] = data_cfg['heatmap_size_root'] + self.ann_info['root_depth_bound'] = data_cfg['root_depth_bound'] + self.ann_info['use_different_joint_weights'] = False + + self.camera_file = camera_file + self.joint_file = joint_file + + self.use_gt_root_depth = use_gt_root_depth + if not self.use_gt_root_depth: + assert rootnet_result_file is not None + self.rootnet_result_file = rootnet_result_file + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + @staticmethod + def _encode_handtype(hand_type): + if hand_type == 'right': + return np.array([1, 0], dtype=np.float32) + elif hand_type == 'left': + return np.array([0, 1], dtype=np.float32) + elif hand_type == 'interacting': + return np.array([1, 1], dtype=np.float32) + else: + assert 0, f'Not support hand type: {hand_type}' + + def _get_db(self): + """Load dataset. + + Adapted from 'https://github.com/facebookresearch/InterHand2.6M/' + 'blob/master/data/InterHand2.6M/dataset.py' + Copyright (c) FaceBook Research, under CC-BY-NC 4.0 license. 
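# Illustrative sketch of the two-bit hand-type encoding implemented by
# `_encode_handtype` above ('right' -> [1, 0], 'left' -> [0, 1],
# 'interacting' -> [1, 1]).  Decoding a predicted score pair with a 0.5
# threshold is an assumption added here for illustration.
import numpy as np


def decode_handtype(scores, thr=0.5):
    bits = tuple(int(s > thr) for s in np.asarray(scores))
    return {(1, 0): 'right', (0, 1): 'left', (1, 1): 'interacting'}.get(
        bits, 'none')


print(decode_handtype([0.9, 0.2]))   # 'right'
print(decode_handtype([0.8, 0.7]))   # 'interacting'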
+ """ + with open(self.camera_file, 'r') as f: + cameras = json.load(f) + with open(self.joint_file, 'r') as f: + joints = json.load(f) + + if not self.use_gt_root_depth: + rootnet_result = {} + with open(self.rootnet_result_file, 'r') as f: + rootnet_annot = json.load(f) + for i in range(len(rootnet_annot)): + rootnet_result[str( + rootnet_annot[i]['annot_id'])] = rootnet_annot[i] + + gt_db = [] + bbox_id = 0 + for img_id in self.img_ids: + num_joints = self.ann_info['num_joints'] + + ann_id = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + ann = self.coco.loadAnns(ann_id)[0] + img = self.coco.loadImgs(img_id)[0] + + capture_id = str(img['capture']) + camera_name = img['camera'] + frame_idx = str(img['frame_idx']) + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + camera_pos = np.array( + cameras[capture_id]['campos'][camera_name], dtype=np.float32) + camera_rot = np.array( + cameras[capture_id]['camrot'][camera_name], dtype=np.float32) + focal = np.array( + cameras[capture_id]['focal'][camera_name], dtype=np.float32) + principal_pt = np.array( + cameras[capture_id]['princpt'][camera_name], dtype=np.float32) + joint_world = np.array( + joints[capture_id][frame_idx]['world_coord'], dtype=np.float32) + joint_cam = self._world2cam( + joint_world.transpose(1, 0), camera_rot, + camera_pos.reshape(3, 1)).transpose(1, 0) + joint_img = self._cam2pixel(joint_cam, focal, principal_pt)[:, :2] + + joint_valid = np.array( + ann['joint_valid'], dtype=np.float32).flatten() + hand_type = self._encode_handtype(ann['hand_type']) + hand_type_valid = ann['hand_type_valid'] + + if self.use_gt_root_depth: + bbox = np.array(ann['bbox'], dtype=np.float32) + # extend the bbox to include some context + center, scale = self._xywh2cs(*bbox, 1.25) + abs_depth = [joint_cam[20, 2], joint_cam[41, 2]] + else: + rootnet_ann_data = rootnet_result[str(ann_id[0])] + bbox = np.array(rootnet_ann_data['bbox'], dtype=np.float32) + # the bboxes have been extended + center, scale = self._xywh2cs(*bbox, 1.0) + abs_depth = rootnet_ann_data['abs_depth'] + # 41: 'l_wrist', left hand root + # 20: 'r_wrist', right hand root + rel_root_depth = joint_cam[41, 2] - joint_cam[20, 2] + # if root is not valid, root-relative 3D depth is also invalid. + rel_root_valid = joint_valid[20] * joint_valid[41] + + # if root is not valid -> root-relative 3D pose is also not valid. + # Therefore, mark all joints as invalid + joint_valid[:20] *= joint_valid[20] + joint_valid[21:] *= joint_valid[41] + + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d[:, :2] = joint_img + joints_3d[:21, 2] = joint_cam[:21, 2] - joint_cam[20, 2] + joints_3d[21:, 2] = joint_cam[21:, 2] - joint_cam[41, 2] + joints_3d_visible[...] 
= np.minimum(1, joint_valid.reshape(-1, 1)) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'hand_type': hand_type, + 'hand_type_valid': hand_type_valid, + 'rel_root_depth': rel_root_depth, + 'rel_root_valid': rel_root_valid, + 'abs_depth': abs_depth, + 'joints_cam': joint_cam, + 'focal': focal, + 'princpt': principal_pt, + 'dataset': self.dataset_name, + 'bbox': bbox, + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='MPJPE', **kwargs): + """Evaluate interhand2d keypoint results. The pose prediction results + will be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - hand_type (np.ndarray[N, 4]): The first two dimensions are \ + hand type, scores is the last two dimensions. + - rel_root_depth (np.ndarray[N]): The relative depth of left \ + wrist and right wrist. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['Capture6/\ + 0012_aokay_upright/cam410061/image4996.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'MRRPE', 'MPJPE', 'Handedness_acc'. + + Returns: + dict: Evaluation results for evaluation metric. 
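# Illustrative sketch of the root-relative MPJPE computed in _report_metric
# further below: camera-space joints of each hand are centred on their wrist
# (index 20 for the right hand, 41 for the left) before taking the mean
# per-joint Euclidean error over visible joints.  Shapes are assumptions.
import numpy as np


def root_relative_mpjpe(pred_cam, gt_cam, visible):
    """pred_cam, gt_cam: (N, 42, 3); visible: (N, 42) boolean mask."""
    def center_on_wrists(joints):
        out = joints.copy()
        out[:, :21] -= joints[:, 20:21]    # right hand, root at index 20
        out[:, 21:] -= joints[:, 41:42]    # left hand, root at index 41
        return out

    errors = np.linalg.norm(
        center_on_wrists(pred_cam) - center_on_wrists(gt_cam), axis=-1)
    return errors[visible].mean()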
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['MRRPE', 'MPJPE', 'Handedness_acc'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result.get('preds') + if preds is None and 'MPJPE' in metrics: + raise KeyError('metric MPJPE is not supported') + + hand_type = result.get('hand_type') + if hand_type is None and 'Handedness_acc' in metrics: + raise KeyError('metric Handedness_acc is not supported') + + rel_root_depth = result.get('rel_root_depth') + if rel_root_depth is None and 'MRRPE' in metrics: + raise KeyError('metric MRRPE is not supported') + + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpt = { + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + } + + if preds is not None: + kpt['keypoints'] = preds[i, :, :3].tolist() + if hand_type is not None: + kpt['hand_type'] = hand_type[i][0:2].tolist() + kpt['hand_type_score'] = hand_type[i][2:4].tolist() + if rel_root_depth is not None: + kpt['rel_root_depth'] = float(rel_root_depth[i]) + + kpts.append(kpt) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + @staticmethod + def _get_accuracy(outputs, gts, masks): + """Get accuracy of multi-label classification. + + Note: + - batch_size: N + - label_num: C + + Args: + outputs (np.array[N, C]): predicted multi-label. + gts (np.array[N, C]): Groundtruth muti-label. + masks (np.array[N, ]): masked outputs will be ignored for + accuracy calculation. + + Returns: + float: mean accuracy + """ + acc = (outputs == gts).all(axis=1) + return np.mean(acc[masks]) + + def _report_metric(self, res_file, metrics): + """Keypoint evaluation. + + Args: + res_file (str): Json file stored prediction results. + metrics (str | list[str]): Metric to be performed. + Options: 'MRRPE', 'MPJPE', 'Handedness_acc'. + + Returns: + list: Evaluation results for evaluation metric. 
+ """ + info_str = [] + + with open(res_file, 'r') as fin: + preds = json.load(fin) + assert len(preds) == len(self.db) + + gts_rel_root = [] + preds_rel_root = [] + rel_root_masks = [] + gts_joint_coord_cam = [] + preds_joint_coord_cam = [] + single_masks = [] + interacting_masks = [] + all_masks = [] + gts_hand_type = [] + preds_hand_type = [] + hand_type_masks = [] + + for pred, item in zip(preds, self.db): + # mrrpe + if 'MRRPE' in metrics: + if item['hand_type'].all() and item['joints_3d_visible'][ + 20, 0] and item['joints_3d_visible'][41, 0]: + rel_root_masks.append(True) + + pred_left_root_img = np.array( + pred['keypoints'][41], dtype=np.float32)[None, :] + pred_left_root_img[:, 2] += item['abs_depth'][0] + pred[ + 'rel_root_depth'] + pred_left_root_cam = self._pixel2cam( + pred_left_root_img, item['focal'], item['princpt']) + + pred_right_root_img = np.array( + pred['keypoints'][20], dtype=np.float32)[None, :] + pred_right_root_img[:, 2] += item['abs_depth'][0] + pred_right_root_cam = self._pixel2cam( + pred_right_root_img, item['focal'], item['princpt']) + + preds_rel_root.append(pred_left_root_cam - + pred_right_root_cam) + gts_rel_root.append( + [item['joints_cam'][41] - item['joints_cam'][20]]) + else: + rel_root_masks.append(False) + preds_rel_root.append([[0., 0., 0.]]) + gts_rel_root.append([[0., 0., 0.]]) + + if 'MPJPE' in metrics: + pred_joint_coord_img = np.array( + pred['keypoints'], dtype=np.float32) + gt_joint_coord_cam = item['joints_cam'].copy() + + pred_joint_coord_img[:21, 2] += item['abs_depth'][0] + pred_joint_coord_img[21:, 2] += item['abs_depth'][1] + pred_joint_coord_cam = self._pixel2cam(pred_joint_coord_img, + item['focal'], + item['princpt']) + + pred_joint_coord_cam[:21] -= pred_joint_coord_cam[20] + pred_joint_coord_cam[21:] -= pred_joint_coord_cam[41] + gt_joint_coord_cam[:21] -= gt_joint_coord_cam[20] + gt_joint_coord_cam[21:] -= gt_joint_coord_cam[41] + + preds_joint_coord_cam.append(pred_joint_coord_cam) + gts_joint_coord_cam.append(gt_joint_coord_cam) + + mask = (np.array(item['joints_3d_visible'])[:, 0]) > 0 + + if item['hand_type'].all(): + single_masks.append( + np.zeros(self.ann_info['num_joints'], dtype=bool)) + interacting_masks.append(mask) + all_masks.append(mask) + else: + single_masks.append(mask) + interacting_masks.append( + np.zeros(self.ann_info['num_joints'], dtype=bool)) + all_masks.append(mask) + + if 'Handedness_acc' in metrics: + pred_hand_type = np.array(pred['hand_type'], dtype=int) + preds_hand_type.append(pred_hand_type) + gts_hand_type.append(item['hand_type']) + hand_type_masks.append(item['hand_type_valid'] > 0) + + gts_rel_root = np.array(gts_rel_root, dtype=np.float32) + preds_rel_root = np.array(preds_rel_root, dtype=np.float32) + rel_root_masks = np.array(rel_root_masks, dtype=bool)[:, None] + gts_joint_coord_cam = np.array(gts_joint_coord_cam, dtype=np.float32) + preds_joint_coord_cam = np.array( + preds_joint_coord_cam, dtype=np.float32) + single_masks = np.array(single_masks, dtype=bool) + interacting_masks = np.array(interacting_masks, dtype=bool) + all_masks = np.array(all_masks, dtype=bool) + gts_hand_type = np.array(gts_hand_type, dtype=int) + preds_hand_type = np.array(preds_hand_type, dtype=int) + hand_type_masks = np.array(hand_type_masks, dtype=bool) + + if 'MRRPE' in metrics: + info_str.append(('MRRPE', + keypoint_epe(preds_rel_root, gts_rel_root, + rel_root_masks))) + + if 'MPJPE' in metrics: + info_str.append(('MPJPE_all', + keypoint_epe(preds_joint_coord_cam, + gts_joint_coord_cam, all_masks))) + 
info_str.append(('MPJPE_single', + keypoint_epe(preds_joint_coord_cam, + gts_joint_coord_cam, single_masks))) + info_str.append( + ('MPJPE_interacting', + keypoint_epe(preds_joint_coord_cam, gts_joint_coord_cam, + interacting_masks))) + + if 'Handedness_acc' in metrics: + info_str.append(('Handedness_acc', + self._get_accuracy(preds_hand_type, gts_hand_type, + hand_type_masks))) + + return info_str diff --git a/mmpose/datasets/datasets/hand/onehand10k_dataset.py b/mmpose/datasets/datasets/hand/onehand10k_dataset.py new file mode 100644 index 0000000..9783cab --- /dev/null +++ b/mmpose/datasets/datasets/hand/onehand10k_dataset.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class OneHand10KDataset(Kpt2dSviewRgbImgTopDownDataset): + """OneHand10K dataset for top-down hand pose estimation. + + "Mask-pose Cascaded CNN for 2D Hand Pose Estimation from + Single Color Images", TCSVT'2019. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + OneHand10K keypoint indexes:: + + 0: 'wrist', + 1: 'thumb1', + 2: 'thumb2', + 3: 'thumb3', + 4: 'thumb4', + 5: 'forefinger1', + 6: 'forefinger2', + 7: 'forefinger3', + 8: 'forefinger4', + 9: 'middle_finger1', + 10: 'middle_finger2', + 11: 'middle_finger3', + 12: 'middle_finger4', + 13: 'ring_finger1', + 14: 'ring_finger2', + 15: 'ring_finger3', + 16: 'ring_finger4', + 17: 'pinky_finger1', + 18: 'pinky_finger2', + 19: 'pinky_finger3', + 20: 'pinky_finger4' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/onehand10k.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # use 1.25 padded bbox as input + center, scale = self._xywh2cs(*obj['bbox'][:4], 1.25) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate onehand10k keypoint results. The pose prediction results + will be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['Test/source/0.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
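# Illustrative sketch of the PCK metric these hand datasets report: a
# keypoint counts as correct when its error, normalized by the bounding-box
# size, falls below a threshold.  The 0.2 threshold and max(w, h)
# normalization are common defaults assumed here, not read from this file.
import numpy as np


def pck(preds, gts, visible, bbox_sizes, thr=0.2):
    """preds, gts: (N, K, 2); visible: (N, K); bbox_sizes: (N,)."""
    dists = np.linalg.norm(preds - gts, axis=-1)        # (N, K)
    normed = dists / bbox_sizes[:, None]
    correct = (normed < thr) & visible
    return correct.sum() / max(visible.sum(), 1)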
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/hand/panoptic_hand2d_dataset.py b/mmpose/datasets/datasets/hand/panoptic_hand2d_dataset.py new file mode 100644 index 0000000..c1d7fc6 --- /dev/null +++ b/mmpose/datasets/datasets/hand/panoptic_hand2d_dataset.py @@ -0,0 +1,208 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class PanopticDataset(Kpt2dSviewRgbImgTopDownDataset): + """Panoptic dataset for top-down hand pose estimation. + + "Hand Keypoint Detection in Single Images using Multiview + Bootstrapping", CVPR'2017. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Panoptic keypoint indexes:: + + 0: 'wrist', + 1: 'thumb1', + 2: 'thumb2', + 3: 'thumb3', + 4: 'thumb4', + 5: 'forefinger1', + 6: 'forefinger2', + 7: 'forefinger3', + 8: 'forefinger4', + 9: 'middle_finger1', + 10: 'middle_finger2', + 11: 'middle_finger3', + 12: 'middle_finger4', + 13: 'ring_finger1', + 14: 'ring_finger2', + 15: 'ring_finger3', + 16: 'ring_finger4', + 17: 'pinky_finger1', + 18: 'pinky_finger2', + 19: 'pinky_finger3', + 20: 'pinky_finger4' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/panoptic_hand2d.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # The bbox is the tightest bbox enclosing keypoints. + # The paper uses 2.2 bbox as the input, while + # we use 1.76 (2.2 * 0.8) bbox as the input. + center, scale = self._xywh2cs(*obj['bbox'][:4], 1.76) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'head_size': obj['head_size'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCKh', **kwargs): + """Evaluate panoptic keypoint results. The pose prediction results will + be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['hand_labels/\ + manual_test/000648952_02_l.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCKh', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
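# Illustrative sketch of the PCKh variant used here: the same idea as PCK,
# but the error is normalized by the 'head_size' stored per sample in
# _get_db above.  The 0.7 threshold below is an assumption for illustration.
import numpy as np


def pckh(preds, gts, visible, head_sizes, thr=0.7):
    """preds, gts: (N, K, 2); visible: (N, K); head_sizes: (N,)."""
    dists = np.linalg.norm(preds - gts, axis=-1) / head_sizes[:, None]
    correct = (dists < thr) & visible
    return correct.sum() / max(visible.sum(), 1)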
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCKh', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/hand/rhd2d_dataset.py b/mmpose/datasets/datasets/hand/rhd2d_dataset.py new file mode 100644 index 0000000..3667f5f --- /dev/null +++ b/mmpose/datasets/datasets/hand/rhd2d_dataset.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class Rhd2DDataset(Kpt2dSviewRgbImgTopDownDataset): + """Rendered Handpose Dataset for top-down hand pose estimation. + + "Learning to Estimate 3D Hand Pose from Single RGB Images", + ICCV'2017. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Rhd keypoint indexes:: + + 0: 'wrist', + 1: 'thumb1', + 2: 'thumb2', + 3: 'thumb3', + 4: 'thumb4', + 5: 'forefinger1', + 6: 'forefinger2', + 7: 'forefinger3', + 8: 'forefinger4', + 9: 'middle_finger1', + 10: 'middle_finger2', + 11: 'middle_finger3', + 12: 'middle_finger4', + 13: 'ring_finger1', + 14: 'ring_finger2', + 15: 'ring_finger3', + 16: 'ring_finger4', + 17: 'pinky_finger1', + 18: 'pinky_finger2', + 19: 'pinky_finger3', + 20: 'pinky_finger4' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/rhd2d.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # the ori image is 224x224 + center, scale = self._xywh2cs(*obj['bbox'][:4], padding=1.25) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate rhd keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1], area, score] + - image_paths (list[str]): For example, + ['training/rgb/00031426.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/mesh/__init__.py b/mmpose/datasets/datasets/mesh/__init__.py new file mode 100644 index 0000000..14297c7 --- /dev/null +++ b/mmpose/datasets/datasets/mesh/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .mesh_adv_dataset import MeshAdversarialDataset +from .mesh_h36m_dataset import MeshH36MDataset +from .mesh_mix_dataset import MeshMixDataset +from .mosh_dataset import MoshDataset + +__all__ = [ + 'MeshH36MDataset', 'MoshDataset', 'MeshMixDataset', + 'MeshAdversarialDataset' +] diff --git a/mmpose/datasets/datasets/mesh/mesh_adv_dataset.py b/mmpose/datasets/datasets/mesh/mesh_adv_dataset.py new file mode 100644 index 0000000..cd9ba39 --- /dev/null +++ b/mmpose/datasets/datasets/mesh/mesh_adv_dataset.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from torch.utils.data import Dataset + +from mmpose.datasets.builder import DATASETS, build_dataset + + +@DATASETS.register_module() +class MeshAdversarialDataset(Dataset): + """Mix Dataset for the adversarial training in 3D human mesh estimation + task. + + The dataset combines data from two datasets and + return a dict containing data from two datasets. + + Args: + train_dataset (Dataset): Dataset for 3D human mesh estimation. + adversarial_dataset (Dataset): Dataset for adversarial learning, + provides real SMPL parameters. + """ + + def __init__(self, train_dataset, adversarial_dataset): + super().__init__() + self.train_dataset = build_dataset(train_dataset) + self.adversarial_dataset = build_dataset(adversarial_dataset) + self.length = len(self.train_dataset) + + def __len__(self): + """Get the size of the dataset.""" + return self.length + + def __getitem__(self, i): + """Given index, get the data from train dataset and randomly sample an + item from adversarial dataset. + + Return a dict containing data from train and adversarial dataset. 
+ """ + data = self.train_dataset[i] + ind_adv = np.random.randint( + low=0, high=len(self.adversarial_dataset), dtype=int) + data.update(self.adversarial_dataset[ind_adv % + len(self.adversarial_dataset)]) + return data diff --git a/mmpose/datasets/datasets/mesh/mesh_base_dataset.py b/mmpose/datasets/datasets/mesh/mesh_base_dataset.py new file mode 100644 index 0000000..79c8a8a --- /dev/null +++ b/mmpose/datasets/datasets/mesh/mesh_base_dataset.py @@ -0,0 +1,155 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp +import os +from abc import ABCMeta + +import numpy as np +from torch.utils.data import Dataset + +from mmpose.datasets.pipelines import Compose + + +class MeshBaseDataset(Dataset, metaclass=ABCMeta): + """Base dataset for 3D human mesh estimation task. In 3D humamesh + estimation task, all datasets share this BaseDataset for training and have + their own evaluate function. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + This dataset can only be used for training. + For evaluation, subclass should write an extra evaluate function. + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + test_mode=False): + + self.image_info = {} + self.ann_info = {} + + self.ann_file = ann_file + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['iuv_size'] = np.array(data_cfg['iuv_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + self.ann_info['flip_pairs'] = None + self.db = [] + self.pipeline = Compose(self.pipeline) + + # flip_pairs + # For all mesh dataset, we use 24 joints as CMR and SPIN. 
+ self.ann_info['flip_pairs'] = [[0, 5], [1, 4], [2, 3], [6, 11], + [7, 10], [8, 9], [20, 21], [22, 23]] + self.ann_info['use_different_joint_weights'] = False + assert self.ann_info['num_joints'] == 24 + self.ann_info['joint_weights'] = np.ones([24, 1], dtype=np.float32) + + self.ann_info['uv_type'] = data_cfg['uv_type'] + self.ann_info['use_IUV'] = data_cfg['use_IUV'] + uv_type = self.ann_info['uv_type'] + self.iuv_prefix = os.path.join(self.img_prefix, f'{uv_type}_IUV_gt') + self.db = self._get_db(ann_file) + + def _get_db(self, ann_file): + """Load dataset.""" + data = np.load(ann_file) + tmpl = dict( + image_file=None, + center=None, + scale=None, + rotation=0, + joints_2d=None, + joints_2d_visible=None, + joints_3d=None, + joints_3d_visible=None, + gender=None, + pose=None, + beta=None, + has_smpl=0, + iuv_file=None, + has_iuv=0) + gt_db = [] + + _imgnames = data['imgname'] + _scales = data['scale'].astype(np.float32) + _centers = data['center'].astype(np.float32) + dataset_len = len(_imgnames) + + # Get 2D keypoints + if 'part' in data.keys(): + _keypoints = data['part'].astype(np.float32) + else: + _keypoints = np.zeros((dataset_len, 24, 3), dtype=np.float32) + + # Get gt 3D joints, if available + if 'S' in data.keys(): + _joints_3d = data['S'].astype(np.float32) + else: + _joints_3d = np.zeros((dataset_len, 24, 4), dtype=np.float32) + + # Get gt SMPL parameters, if available + if 'pose' in data.keys() and 'shape' in data.keys(): + _poses = data['pose'].astype(np.float32) + _betas = data['shape'].astype(np.float32) + has_smpl = 1 + else: + _poses = np.zeros((dataset_len, 72), dtype=np.float32) + _betas = np.zeros((dataset_len, 10), dtype=np.float32) + has_smpl = 0 + + # Get gender data, if available + if 'gender' in data.keys(): + _genders = data['gender'] + _genders = np.array([str(g) != 'm' for g in _genders]).astype(int) + else: + _genders = -1 * np.ones(dataset_len).astype(int) + + # Get IUV image, if available + if 'iuv_names' in data.keys(): + _iuv_names = data['iuv_names'] + has_iuv = has_smpl + else: + _iuv_names = [''] * dataset_len + has_iuv = 0 + + for i in range(len(_imgnames)): + newitem = cp.deepcopy(tmpl) + newitem['image_file'] = os.path.join(self.img_prefix, _imgnames[i]) + newitem['scale'] = np.array([_scales[i], _scales[i]]) + newitem['center'] = _centers[i] + newitem['joints_2d'] = _keypoints[i, :, :2] + newitem['joints_2d_visible'] = _keypoints[i, :, -1][:, None] + newitem['joints_3d'] = _joints_3d[i, :, :3] + newitem['joints_3d_visible'] = _joints_3d[i, :, -1][:, None] + newitem['pose'] = _poses[i] + newitem['beta'] = _betas[i] + newitem['has_smpl'] = has_smpl + newitem['gender'] = _genders[i] + newitem['iuv_file'] = os.path.join(self.iuv_prefix, _iuv_names[i]) + newitem['has_iuv'] = has_iuv + gt_db.append(newitem) + return gt_db + + def __len__(self, ): + """Get the size of the dataset.""" + return len(self.db) + + def __getitem__(self, idx): + """Get the sample given index.""" + results = cp.deepcopy(self.db[idx]) + results['ann_info'] = self.ann_info + return self.pipeline(results) diff --git a/mmpose/datasets/datasets/mesh/mesh_h36m_dataset.py b/mmpose/datasets/datasets/mesh/mesh_h36m_dataset.py new file mode 100644 index 0000000..9ac9ead --- /dev/null +++ b/mmpose/datasets/datasets/mesh/mesh_h36m_dataset.py @@ -0,0 +1,101 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
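+# Note (illustrative): MeshH36MDataset below inherits its data loading from
+# MeshBaseDataset above, which reads a .npz annotation file. A minimal
+# synthetic file (hypothetical name and values) only needs 'imgname',
+# 'scale' and 'center'; the optional keys ('part', 'S', 'pose', 'shape',
+# 'gender', 'iuv_names') add 2D keypoints, 3D joints, SMPL parameters,
+# gender and IUV supervision. For example:
+#
+#     >>> import numpy as np
+#     >>> np.savez('toy_mesh_ann.npz',
+#     ...          imgname=np.array(['images/000001.jpg']),
+#     ...          scale=np.array([1.0], dtype=np.float32),
+#     ...          center=np.array([[112.0, 112.0]], dtype=np.float32))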
+import os +from collections import OrderedDict + +import json_tricks as json +import numpy as np + +from mmpose.core.evaluation import keypoint_mpjpe +from mmpose.datasets.builder import DATASETS +from .mesh_base_dataset import MeshBaseDataset + + +@DATASETS.register_module() +class MeshH36MDataset(MeshBaseDataset): + """Human3.6M Dataset for 3D human mesh estimation. It inherits all function + from MeshBaseDataset and has its own evaluate function. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def evaluate(self, outputs, res_folder, metric='joint_error', logger=None): + """Evaluate 3D keypoint results.""" + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['joint_error'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + res_file = os.path.join(res_folder, 'result_keypoints.json') + kpts = [] + for out in outputs: + for (keypoints, image_path) in zip(out['keypoints_3d'], + out['image_path']): + kpts.append({ + 'keypoints': keypoints.tolist(), + 'image': image_path, + }) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file) + name_value = OrderedDict(info_str) + return name_value + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def _report_metric(self, res_file): + """Keypoint evaluation. 
+ + Report mean per joint position error (MPJPE) and mean per joint + position error after rigid alignment (MPJPE-PA) + """ + + with open(res_file, 'r') as fin: + preds = json.load(fin) + assert len(preds) == len(self.db) + + pred_joints_3d = [pred['keypoints'] for pred in preds] + gt_joints_3d = [item['joints_3d'] for item in self.db] + gt_joints_visible = [item['joints_3d_visible'] for item in self.db] + + pred_joints_3d = np.array(pred_joints_3d) + gt_joints_3d = np.array(gt_joints_3d) + gt_joints_visible = np.array(gt_joints_visible) + + # we only evaluate on 14 lsp joints + joint_mapper = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18] + pred_joints_3d = pred_joints_3d[:, joint_mapper, :] + pred_pelvis = (pred_joints_3d[:, 2] + pred_joints_3d[:, 3]) / 2 + pred_joints_3d = pred_joints_3d - pred_pelvis[:, None, :] + + gt_joints_3d = gt_joints_3d[:, joint_mapper, :] + gt_pelvis = (gt_joints_3d[:, 2] + gt_joints_3d[:, 3]) / 2 + gt_joints_3d = gt_joints_3d - gt_pelvis[:, None, :] + gt_joints_visible = gt_joints_visible[:, joint_mapper, 0] > 0 + + mpjpe = keypoint_mpjpe(pred_joints_3d, gt_joints_3d, gt_joints_visible) + mpjpe_pa = keypoint_mpjpe( + pred_joints_3d, + gt_joints_3d, + gt_joints_visible, + alignment='procrustes') + + info_str = [] + info_str.append(('MPJPE', mpjpe * 1000)) + info_str.append(('MPJPE-PA', mpjpe_pa * 1000)) + return info_str diff --git a/mmpose/datasets/datasets/mesh/mesh_mix_dataset.py b/mmpose/datasets/datasets/mesh/mesh_mix_dataset.py new file mode 100644 index 0000000..244a7c3 --- /dev/null +++ b/mmpose/datasets/datasets/mesh/mesh_mix_dataset.py @@ -0,0 +1,73 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + +import numpy as np +from torch.utils.data import ConcatDataset, Dataset, WeightedRandomSampler + +from mmpose.datasets.builder import DATASETS +from .mesh_base_dataset import MeshBaseDataset + + +@DATASETS.register_module() +class MeshMixDataset(Dataset, metaclass=ABCMeta): + """Mix Dataset for 3D human mesh estimation. + + The dataset combines data from multiple datasets (MeshBaseDataset) and + sample the data from different datasets with the provided proportions. + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Args: + configs (list): List of configs for multiple datasets. + partition (list): Sample proportion of multiple datasets. The length + of partition should be same with that of configs. The elements + of it should be non-negative and is not necessary summing up to + one. 
+ + Example: + >>> from mmpose.datasets import MeshMixDataset + >>> data_cfg = dict( + >>> image_size=[256, 256], + >>> iuv_size=[64, 64], + >>> num_joints=24, + >>> use_IUV=True, + >>> uv_type='BF') + >>> + >>> mix_dataset = MeshMixDataset( + >>> configs=[ + >>> dict( + >>> ann_file='tests/data/h36m/test_h36m.npz', + >>> img_prefix='tests/data/h36m', + >>> data_cfg=data_cfg, + >>> pipeline=[]), + >>> dict( + >>> ann_file='tests/data/h36m/test_h36m.npz', + >>> img_prefix='tests/data/h36m', + >>> data_cfg=data_cfg, + >>> pipeline=[]), + >>> ], + >>> partition=[0.6, 0.4]) + """ + + def __init__(self, configs, partition): + """Load data from multiple datasets.""" + assert min(partition) >= 0 + datasets = [MeshBaseDataset(**cfg) for cfg in configs] + self.dataset = ConcatDataset(datasets) + self.length = max(len(ds) for ds in datasets) + weights = [ + np.ones(len(ds)) * p / len(ds) + for (p, ds) in zip(partition, datasets) + ] + weights = np.concatenate(weights, axis=0) + self.sampler = WeightedRandomSampler(weights, 1) + + def __len__(self): + """Get the size of the dataset.""" + return self.length + + def __getitem__(self, idx): + """Given index, sample the data from multiple datasets with the given + proportion.""" + idx_new = list(self.sampler)[0] + return self.dataset[idx_new] diff --git a/mmpose/datasets/datasets/mesh/mosh_dataset.py b/mmpose/datasets/datasets/mesh/mosh_dataset.py new file mode 100644 index 0000000..3185265 --- /dev/null +++ b/mmpose/datasets/datasets/mesh/mosh_dataset.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp +from abc import ABCMeta + +import numpy as np +from torch.utils.data import Dataset + +from mmpose.datasets.builder import DATASETS +from mmpose.datasets.pipelines import Compose + + +@DATASETS.register_module() +class MoshDataset(Dataset, metaclass=ABCMeta): + """Mosh Dataset for the adversarial training in 3D human mesh estimation + task. + + The dataset return a dict containing real-world SMPL parameters. + + Args: + ann_file (str): Path to the annotation file. + pipeline (list[dict | callable]): A sequence of data transforms. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, ann_file, pipeline, test_mode=False): + + self.ann_file = ann_file + self.pipeline = pipeline + self.test_mode = test_mode + + self.db = self._get_db(ann_file) + self.pipeline = Compose(self.pipeline) + + @staticmethod + def _get_db(ann_file): + """Load dataset.""" + data = np.load(ann_file) + _betas = data['shape'].astype(np.float32) + _poses = data['pose'].astype(np.float32) + tmpl = dict( + pose=None, + beta=None, + ) + gt_db = [] + dataset_len = len(_betas) + + for i in range(dataset_len): + newitem = cp.deepcopy(tmpl) + newitem['pose'] = _poses[i] + newitem['beta'] = _betas[i] + gt_db.append(newitem) + return gt_db + + def __len__(self, ): + """Get the size of the dataset.""" + return len(self.db) + + def __getitem__(self, idx): + """Get the sample given index.""" + item = cp.deepcopy(self.db[idx]) + trivial, pose, beta = \ + np.zeros(3, dtype=np.float32), item['pose'], item['beta'] + results = { + 'mosh_theta': + np.concatenate((trivial, pose, beta), axis=0).astype(np.float32) + } + return self.pipeline(results) diff --git a/mmpose/datasets/datasets/top_down/__init__.py b/mmpose/datasets/datasets/top_down/__init__.py new file mode 100644 index 0000000..cc5b46a --- /dev/null +++ b/mmpose/datasets/datasets/top_down/__init__.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +from .topdown_aic_dataset import TopDownAicDataset +from .topdown_coco_dataset import TopDownCocoDataset +from .topdown_coco_wholebody_dataset import TopDownCocoWholeBodyDataset +from .topdown_crowdpose_dataset import TopDownCrowdPoseDataset +from .topdown_h36m_dataset import TopDownH36MDataset +from .topdown_halpe_dataset import TopDownHalpeDataset +from .topdown_jhmdb_dataset import TopDownJhmdbDataset +from .topdown_mhp_dataset import TopDownMhpDataset +from .topdown_mpii_dataset import TopDownMpiiDataset +from .topdown_mpii_trb_dataset import TopDownMpiiTrbDataset +from .topdown_ochuman_dataset import TopDownOCHumanDataset +from .topdown_posetrack18_dataset import TopDownPoseTrack18Dataset +from .topdown_posetrack18_video_dataset import TopDownPoseTrack18VideoDataset + +__all__ = [ + 'TopDownAicDataset', + 'TopDownCocoDataset', + 'TopDownCocoWholeBodyDataset', + 'TopDownCrowdPoseDataset', + 'TopDownMpiiDataset', + 'TopDownMpiiTrbDataset', + 'TopDownOCHumanDataset', + 'TopDownPoseTrack18Dataset', + 'TopDownJhmdbDataset', + 'TopDownMhpDataset', + 'TopDownH36MDataset', + 'TopDownHalpeDataset', + 'TopDownPoseTrack18VideoDataset', +] diff --git a/mmpose/datasets/datasets/top_down/topdown_aic_dataset.py b/mmpose/datasets/datasets/top_down/topdown_aic_dataset.py new file mode 100644 index 0000000..13c41df --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_aic_dataset.py @@ -0,0 +1,112 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv import Config +from xtcocotools.cocoeval import COCOeval + +from ...builder import DATASETS +from .topdown_coco_dataset import TopDownCocoDataset + + +@DATASETS.register_module() +class TopDownAicDataset(TopDownCocoDataset): + """AicDataset dataset for top-down pose estimation. + + "AI Challenger : A Large-scale Dataset for Going Deeper + in Image Understanding", arXiv'2017. + More details can be found in the `paper + `__ + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + AIC keypoint indexes:: + + 0: "right_shoulder", + 1: "right_elbow", + 2: "right_wrist", + 3: "left_shoulder", + 4: "left_elbow", + 5: "left_wrist", + 6: "right_hip", + 7: "right_knee", + 8: "right_ankle", + 9: "left_hip", + 10: "left_knee", + 11: "left_ankle", + 12: "head_top", + 13: "neck" + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/aic.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(TopDownCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db = self._load_coco_keypoint_annotations() + return gt_db + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval( + self.coco, coco_det, 'keypoints', self.sigmas, use_area=False) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/top_down/topdown_base_dataset.py b/mmpose/datasets/datasets/top_down/topdown_base_dataset.py new file mode 100644 index 0000000..dc99576 --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_base_dataset.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + +from torch.utils.data import Dataset + + +class TopDownBaseDataset(Dataset, metaclass=ABCMeta): + """This class has been deprecated and replaced by + Kpt2dSviewRgbImgTopDownDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'TopDownBaseDataset has been replaced by ' + 'Kpt2dSviewRgbImgTopDownDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/663 for details.') + ) diff --git a/mmpose/datasets/datasets/top_down/topdown_coco_dataset.py b/mmpose/datasets/datasets/top_down/topdown_coco_dataset.py new file mode 100644 index 0000000..664c881 --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_coco_dataset.py @@ -0,0 +1,405 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning +from xtcocotools.cocoeval import COCOeval + +from ....core.post_processing import oks_nms, soft_oks_nms +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class TopDownCocoDataset(Kpt2dSviewRgbImgTopDownDataset): + """CocoDataset dataset for top-down pose estimation. + + "Microsoft COCO: Common Objects in Context", ECCV'2014. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. 
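+    Example (illustrative): the ``data_cfg`` keys read in this file, with
+    typical placeholder values; the base class additionally expects keys
+    such as ``image_size`` and ``num_joints``, and the paths below are
+    hypothetical::
+
+        >>> data_cfg = dict(
+        ...     use_gt_bbox=True,
+        ...     bbox_file='',
+        ...     det_bbox_thr=0.0,
+        ...     use_nms=True,
+        ...     soft_nms=False,
+        ...     nms_thr=1.0,
+        ...     oks_thr=0.9,
+        ...     vis_thr=0.2)
+        >>> # With annotations in place the dataset could then be built as
+        >>> # TopDownCocoDataset('data/coco/annotations/person_keypoints_val2017.json',
+        >>> #                    'data/coco/val2017/', data_cfg, pipeline=[])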
+ + COCO keypoint indexes:: + + 0: 'nose', + 1: 'left_eye', + 2: 'right_eye', + 3: 'left_ear', + 4: 'right_ear', + 5: 'left_shoulder', + 6: 'right_shoulder', + 7: 'left_elbow', + 8: 'right_elbow', + 9: 'left_wrist', + 10: 'right_wrist', + 11: 'left_hip', + 12: 'right_hip', + 13: 'left_knee', + 14: 'right_knee', + 15: 'left_ankle', + 16: 'right_ankle' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/coco.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + if (not self.test_mode) or self.use_gt_bbox: + # use ground truth bbox + gt_db = self._load_coco_keypoint_annotations() + else: + # use bbox from detection + gt_db = self._load_coco_person_detection_results() + return gt_db + + def _load_coco_keypoint_annotations(self): + """Ground truth bbox and keypoints.""" + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. 
+ + Note: + bbox:[x1, y1, w, h] + + Args: + img_id: coco image id + + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = self.ann_info['num_joints'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'bbox': obj['clean_bbox'][:4], + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + + return rec + + def _load_coco_person_detection_results(self): + """Load coco person detection results.""" + num_joints = self.ann_info['num_joints'] + all_boxes = None + with open(self.bbox_file, 'r') as f: + all_boxes = json.load(f) + + if not all_boxes: + raise ValueError('=> Load %s fail!' % self.bbox_file) + + print(f'=> Total boxes: {len(all_boxes)}') + + kpt_db = [] + bbox_id = 0 + for det_res in all_boxes: + if det_res['category_id'] != 1: + continue + + image_file = osp.join(self.img_prefix, + self.id2name[det_res['image_id']]) + box = det_res['bbox'] + score = det_res['score'] + + if score < self.det_bbox_thr: + continue + + center, scale = self._xywh2cs(*box[:4]) + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.ones((num_joints, 3), dtype=np.float32) + kpt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'bbox': box[:4], + 'bbox_score': score, + 'dataset': self.dataset_name, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + print(f'=> Total boxes after filter ' + f'low score@{self.det_bbox_thr}: {bbox_id}') + return kpt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mAP', **kwargs): + """Evaluate coco keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. 
+ - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['data/coco/val2017\ + /000000393226.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap + - bbox_id (list(int)). + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'mAP'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['mAP'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = defaultdict(list) + + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + kpts[image_id].append({ + 'keypoints': preds[i], + 'center': boxes[i][0:2], + 'scale': boxes[i][2:4], + 'area': boxes[i][4], + 'score': boxes[i][5], + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + # rescoring and oks nms + num_joints = self.ann_info['num_joints'] + vis_thr = self.vis_thr + oks_thr = self.oks_thr + valid_kpts = [] + for image_id in kpts.keys(): + img_kpts = kpts[image_id] + for n_p in img_kpts: + box_score = n_p['score'] + kpt_score = 0 + valid_num = 0 + for n_jt in range(0, num_joints): + t_s = n_p['keypoints'][n_jt][2] + if t_s > vis_thr: + kpt_score = kpt_score + t_s + valid_num = valid_num + 1 + if valid_num != 0: + kpt_score = kpt_score / valid_num + # rescoring + n_p['score'] = kpt_score * box_score + + if self.use_nms: + nms = soft_oks_nms if self.soft_nms else oks_nms + keep = nms(img_kpts, oks_thr, sigmas=self.sigmas) + valid_kpts.append([img_kpts[_keep] for _keep in keep]) + else: + valid_kpts.append(img_kpts) + + self._write_coco_keypoint_results(valid_kpts, res_file) + + info_str = self._do_python_keypoint_eval(res_file) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + def _write_coco_keypoint_results(self, keypoints, res_file): + """Write results into a json file.""" + data_pack = [{ + 'cat_id': self._class_to_coco_ind[cls], + 'cls_ind': cls_ind, + 'cls': cls, + 'ann_type': 'keypoints', + 'keypoints': keypoints + } for cls_ind, cls in enumerate(self.classes) + if not cls == '__background__'] + + results = self._coco_keypoint_results_one_category_kernel(data_pack[0]) + + with open(res_file, 'w') as f: + json.dump(results, f, sort_keys=True, indent=4) + + def _coco_keypoint_results_one_category_kernel(self, data_pack): + """Get coco keypoint results.""" + cat_id = data_pack['cat_id'] + keypoints = data_pack['keypoints'] + cat_results = [] + + for img_kpts in keypoints: + if len(img_kpts) == 0: + continue + + _key_points = np.array( + [img_kpt['keypoints'] for img_kpt in img_kpts]) + key_points = _key_points.reshape(-1, + self.ann_info['num_joints'] * 3) + + result = [{ + 'image_id': img_kpt['image_id'], + 'category_id': cat_id, + 'keypoints': key_point.tolist(), + 'score': 
float(img_kpt['score']), + 'center': img_kpt['center'].tolist(), + 'scale': img_kpt['scale'].tolist() + } for img_kpt, key_point in zip(img_kpts, key_points)] + + cat_results.extend(result) + + return cat_results + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval(self.coco, coco_det, 'keypoints', self.sigmas) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + for img_id, persons in kpts.items(): + num = len(persons) + kpts[img_id] = sorted(kpts[img_id], key=lambda x: x[key]) + for i in range(num - 1, 0, -1): + if kpts[img_id][i][key] == kpts[img_id][i - 1][key]: + del kpts[img_id][i] + + return kpts diff --git a/mmpose/datasets/datasets/top_down/topdown_coco_wholebody_dataset.py b/mmpose/datasets/datasets/top_down/topdown_coco_wholebody_dataset.py new file mode 100644 index 0000000..791a3c5 --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_coco_wholebody_dataset.py @@ -0,0 +1,274 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import warnings + +import numpy as np +from mmcv import Config +from xtcocotools.cocoeval import COCOeval + +from ...builder import DATASETS +from .topdown_coco_dataset import TopDownCocoDataset + + +@DATASETS.register_module() +class TopDownCocoWholeBodyDataset(TopDownCocoDataset): + """CocoWholeBodyDataset dataset for top-down pose estimation. + + "Whole-Body Human Pose Estimation in the Wild", ECCV'2020. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + COCO-WholeBody keypoint indexes:: + + 0-16: 17 body keypoints, + 17-22: 6 foot keypoints, + 23-90: 68 face keypoints, + 91-132: 42 hand keypoints + + In total, we have 133 keypoints for wholebody pose estimation. + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/coco_wholebody.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(TopDownCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.body_num = 17 + self.foot_num = 6 + self.face_num = 68 + self.left_hand_num = 21 + self.right_hand_num = 21 + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. + + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = self.ann_info['num_joints'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + rec = [] + bbox_id = 0 + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints'] + obj['foot_kpts'] + + obj['face_kpts'] + obj['lefthand_kpts'] + + obj['righthand_kpts']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3] > 0) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = os.path.join(self.img_prefix, self.id2name[img_id]) + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + + return rec + + def _coco_keypoint_results_one_category_kernel(self, data_pack): + """Get coco keypoint results.""" + cat_id = data_pack['cat_id'] + keypoints = data_pack['keypoints'] + cat_results = [] + + for img_kpts in keypoints: + if len(img_kpts) == 0: + continue + + _key_points = np.array( + [img_kpt['keypoints'] for img_kpt in img_kpts]) + key_points = _key_points.reshape(-1, + self.ann_info['num_joints'] * 3) + + cuts = np.cumsum([ + 0, self.body_num, self.foot_num, self.face_num, + self.left_hand_num, self.right_hand_num + ]) * 3 + + result = [{ + 'image_id': img_kpt['image_id'], + 'category_id': cat_id, + 'keypoints': key_point[cuts[0]:cuts[1]].tolist(), + 'foot_kpts': key_point[cuts[1]:cuts[2]].tolist(), + 'face_kpts': key_point[cuts[2]:cuts[3]].tolist(), + 'lefthand_kpts': key_point[cuts[3]:cuts[4]].tolist(), + 'righthand_kpts': 
key_point[cuts[4]:cuts[5]].tolist(), + 'score': float(img_kpt['score']), + 'center': img_kpt['center'].tolist(), + 'scale': img_kpt['scale'].tolist() + } for img_kpt, key_point in zip(img_kpts, key_points)] + + cat_results.extend(result) + + return cat_results + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + + cuts = np.cumsum([ + 0, self.body_num, self.foot_num, self.face_num, self.left_hand_num, + self.right_hand_num + ]) + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_body', + self.sigmas[cuts[0]:cuts[1]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_foot', + self.sigmas[cuts[1]:cuts[2]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_face', + self.sigmas[cuts[2]:cuts[3]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_lefthand', + self.sigmas[cuts[3]:cuts[4]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_righthand', + self.sigmas[cuts[4]:cuts[5]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_wholebody', + self.sigmas, + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/top_down/topdown_crowdpose_dataset.py b/mmpose/datasets/datasets/top_down/topdown_crowdpose_dataset.py new file mode 100644 index 0000000..b9b196f --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_crowdpose_dataset.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv import Config +from xtcocotools.cocoeval import COCOeval + +from ...builder import DATASETS +from .topdown_coco_dataset import TopDownCocoDataset + + +@DATASETS.register_module() +class TopDownCrowdPoseDataset(TopDownCocoDataset): + """CrowdPoseDataset dataset for top-down pose estimation. + + "CrowdPose: Efficient Crowded Scenes Pose Estimation and + A New Benchmark", CVPR'2019. + More details can be found in the `paper + `__. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + CrowdPose keypoint indexes:: + + 0: 'left_shoulder', + 1: 'right_shoulder', + 2: 'left_elbow', + 3: 'right_elbow', + 4: 'left_wrist', + 5: 'right_wrist', + 6: 'left_hip', + 7: 'right_hip', + 8: 'left_knee', + 9: 'right_knee', + 10: 'left_ankle', + 11: 'right_ankle', + 12: 'top_head', + 13: 'neck' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. 
+ dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/crowdpose.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(TopDownCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_crowd', + self.sigmas, + use_area=False) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AR', 'AR .5', 'AR .75', 'AP(E)', 'AP(M)', + 'AP(H)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/top_down/topdown_h36m_dataset.py b/mmpose/datasets/datasets/top_down/topdown_h36m_dataset.py new file mode 100644 index 0000000..6bc49e3 --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_h36m_dataset.py @@ -0,0 +1,206 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning + +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class TopDownH36MDataset(Kpt2dSviewRgbImgTopDownDataset): + """Human3.6M dataset for top-down 2D pose estimation. + + "Human3.6M: Large Scale Datasets and Predictive Methods for 3D Human + Sensing in Natural Environments", TPAMI`2014. + More details can be found in the `paper + `__. + + Human3.6M keypoint indexes:: + + 0: 'root (pelvis)', + 1: 'right_hip', + 2: 'right_knee', + 3: 'right_foot', + 4: 'left_hip', + 5: 'left_knee', + 6: 'left_foot', + 7: 'spine', + 8: 'thorax', + 9: 'neck_base', + 10: 'head', + 11: 'left_shoulder', + 12: 'left_elbow', + 13: 'left_wrist', + 14: 'right_shoulder', + 15: 'right_elbow', + 16: 'right_wrist' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/h36m.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # use 1.25 padded bbox as input + center, scale = self._xywh2cs(*obj['bbox'][:4]) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate human3.6m 2d keypoint results. The pose prediction results + will be saved in `${res_folder}/result_keypoints.json`. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], + scale[1],area, score] + - image_paths (list[str]): For example, ['data/coco/val2017 + /000000393226.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap + - bbox_id (list(int)). + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'PCK'. + + Returns: + dict: Evaluation results for evaluation metric. 
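+        Example (illustrative; the prefix and file name are hypothetical) of
+        how a prediction is matched back to an image id via ``name2id``::
+
+            >>> img_prefix = 'data/h36m/images/'
+            >>> image_path = 'data/h36m/images/S11_Walking.54138969_000001.jpg'
+            >>> image_path[len(img_prefix):]   # key used for the name2id lookup
+            'S11_Walking.54138969_000001.jpg'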
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) diff --git a/mmpose/datasets/datasets/top_down/topdown_halpe_dataset.py b/mmpose/datasets/datasets/top_down/topdown_halpe_dataset.py new file mode 100644 index 0000000..7042daa --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_halpe_dataset.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv import Config + +from ...builder import DATASETS +from .topdown_coco_dataset import TopDownCocoDataset + + +@DATASETS.register_module() +class TopDownHalpeDataset(TopDownCocoDataset): + """HalpeDataset for top-down pose estimation. + + 'https://github.com/Fang-Haoshu/Halpe-FullBody' + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Halpe keypoint indexes:: + + 0-19: 20 body keypoints, + 20-25: 6 foot keypoints, + 26-93: 68 face keypoints, + 94-135: 42 hand keypoints + + In total, we have 136 keypoints for wholebody pose estimation. + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/halpe.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(TopDownCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.ann_info['use_different_joint_weights'] = False + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') diff --git a/mmpose/datasets/datasets/top_down/topdown_jhmdb_dataset.py b/mmpose/datasets/datasets/top_down/topdown_jhmdb_dataset.py new file mode 100644 index 0000000..5204f04 --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_jhmdb_dataset.py @@ -0,0 +1,361 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.core.evaluation.top_down_eval import keypoint_pck_accuracy +from ...builder import DATASETS +from .topdown_coco_dataset import TopDownCocoDataset + + +@DATASETS.register_module() +class TopDownJhmdbDataset(TopDownCocoDataset): + """JhmdbDataset dataset for top-down pose estimation. + + "Towards understanding action recognition", ICCV'2013. + More details can be found in the `paper + `__ + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + sub-JHMDB keypoint indexes:: + + 0: "neck", + 1: "belly", + 2: "head", + 3: "right_shoulder", + 4: "left_shoulder", + 5: "right_hip", + 6: "left_hip", + 7: "right_elbow", + 8: "left_elbow", + 9: "right_knee", + 10: "left_knee", + 11: "right_wrist", + 12: "left_wrist", + 13: "right_ankle", + 14: "left_ankle" + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/jhmdb.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(TopDownCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db = self._load_coco_keypoint_annotations() + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. + + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = self.ann_info['num_joints'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + # JHMDB uses matlab format, index is 1-based, + # we should first convert to 0-based index + x -= 1 + y -= 1 + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + rec = [] + bbox_id = 0 + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + + # JHMDB uses matlab format, index is 1-based, + # we should first convert to 0-based index + joints_3d[:, :2] = keypoints[:, :2] - 1 + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'bbox': obj['clean_bbox'][:4], + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': f'{img_id}_{bbox_id:03}' + }) + bbox_id = bbox_id + 1 + + return rec + + def _write_keypoint_results(self, keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def _report_metric(self, res_file, metrics, pck_thr=0.2): + """Keypoint evaluation. + + Args: + res_file (str): Json file stored prediction results. + metrics (str | list[str]): Metric to be performed. + Options: 'PCK', 'PCKh', 'AUC', 'EPE'. + pck_thr (float): PCK threshold, default as 0.2. + pckh_thr (float): PCKh threshold, default as 0.7. + auc_nor (float): AUC normalization factor, default as 30 pixel. + + Returns: + List: Evaluation results for evaluation metric. 
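+        For ``metrics=['PCK']`` the returned list takes the following form
+        (scores are placeholders; with ``'tPCK'`` the same names end in
+        ``tPCK`` instead)::
+
+            [('Head PCK', 0.95), ('Sho PCK', 0.93), ('Elb PCK', 0.91),
+             ('Wri PCK', 0.88), ('Hip PCK', 0.94), ('Knee PCK', 0.92),
+             ('Ank PCK', 0.90), ('Mean PCK', 0.92)]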
+ """ + info_str = [] + + with open(res_file, 'r') as fin: + preds = json.load(fin) + assert len(preds) == len(self.db) + + outputs = [] + gts = [] + masks = [] + threshold_bbox = [] + threshold_torso = [] + + for pred, item in zip(preds, self.db): + outputs.append(np.array(pred['keypoints'])[:, :-1]) + gts.append(np.array(item['joints_3d'])[:, :-1]) + masks.append((np.array(item['joints_3d_visible'])[:, 0]) > 0) + if 'PCK' in metrics: + bbox = np.array(item['bbox']) + bbox_thr = np.max(bbox[2:]) + threshold_bbox.append(np.array([bbox_thr, bbox_thr])) + + if 'tPCK' in metrics: + torso_thr = np.linalg.norm(item['joints_3d'][4, :2] - + item['joints_3d'][5, :2]) + if torso_thr < 1: + torso_thr = np.linalg.norm( + np.array(pred['keypoints'])[4, :2] - + np.array(pred['keypoints'])[5, :2]) + warnings.warn('Torso Size < 1.') + threshold_torso.append(np.array([torso_thr, torso_thr])) + + outputs = np.array(outputs) + gts = np.array(gts) + masks = np.array(masks) + threshold_bbox = np.array(threshold_bbox) + threshold_torso = np.array(threshold_torso) + + if 'PCK' in metrics: + pck_p, pck, _ = keypoint_pck_accuracy(outputs, gts, masks, pck_thr, + threshold_bbox) + + stats_names = [ + 'Head PCK', 'Sho PCK', 'Elb PCK', 'Wri PCK', 'Hip PCK', + 'Knee PCK', 'Ank PCK', 'Mean PCK' + ] + + stats = [ + pck_p[2], 0.5 * pck_p[3] + 0.5 * pck_p[4], + 0.5 * pck_p[7] + 0.5 * pck_p[8], + 0.5 * pck_p[11] + 0.5 * pck_p[12], + 0.5 * pck_p[5] + 0.5 * pck_p[6], + 0.5 * pck_p[9] + 0.5 * pck_p[10], + 0.5 * pck_p[13] + 0.5 * pck_p[14], pck + ] + + info_str.extend(list(zip(stats_names, stats))) + + if 'tPCK' in metrics: + pck_p, pck, _ = keypoint_pck_accuracy(outputs, gts, masks, pck_thr, + threshold_torso) + + stats_names = [ + 'Head tPCK', 'Sho tPCK', 'Elb tPCK', 'Wri tPCK', 'Hip tPCK', + 'Knee tPCK', 'Ank tPCK', 'Mean tPCK' + ] + + stats = [ + pck_p[2], 0.5 * pck_p[3] + 0.5 * pck_p[4], + 0.5 * pck_p[7] + 0.5 * pck_p[8], + 0.5 * pck_p[11] + 0.5 * pck_p[12], + 0.5 * pck_p[5] + 0.5 * pck_p[6], + 0.5 * pck_p[9] + 0.5 * pck_p[10], + 0.5 * pck_p[13] + 0.5 * pck_p[14], pck + ] + + info_str.extend(list(zip(stats_names, stats))) + + return info_str + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate onehand10k keypoint results. The pose prediction results + will be saved in `${res_folder}/result_keypoints.json`. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_path (list[str]) + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'tPCK'. + PCK means normalized by the bounding boxes, while tPCK + means normalized by the torso size. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'tPCK'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + # convert 0-based index to 1-based index, + # and get the first two dimensions. + preds[..., :2] += 1.0 + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + kpts.append({ + 'keypoints': preds[i], + 'center': boxes[i][0:2], + 'scale': boxes[i][2:4], + 'area': boxes[i][4], + 'score': boxes[i][5], + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts diff --git a/mmpose/datasets/datasets/top_down/topdown_mhp_dataset.py b/mmpose/datasets/datasets/top_down/topdown_mhp_dataset.py new file mode 100644 index 0000000..050824a --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_mhp_dataset.py @@ -0,0 +1,125 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv import Config +from xtcocotools.cocoeval import COCOeval + +from ...builder import DATASETS +from .topdown_coco_dataset import TopDownCocoDataset + + +@DATASETS.register_module() +class TopDownMhpDataset(TopDownCocoDataset): + """MHPv2.0 dataset for top-down pose estimation. + + "Understanding Humans in Crowded Scenes: Deep Nested Adversarial + Learning and A New Benchmark for Multi-Human Parsing", ACM MM'2018. + More details can be found in the `paper + `__ + + Note that, the evaluation metric used here is mAP (adapted from COCO), + which may be different from the official evaluation codes. + 'https://github.com/ZhaoJ9014/Multi-Human-Parsing/tree/master/' + 'Evaluation/Multi-Human-Pose' + Please be cautious if you use the results in papers. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + MHP keypoint indexes:: + + 0: "right ankle", + 1: "right knee", + 2: "right hip", + 3: "left hip", + 4: "left knee", + 5: "left ankle", + 6: "pelvis", + 7: "thorax", + 8: "upper neck", + 9: "head top", + 10: "right wrist", + 11: "right elbow", + 12: "right shoulder", + 13: "left shoulder", + 14: "left elbow", + 15: "left wrist", + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. 
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/mhp.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(TopDownCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + if 'image_thr' in data_cfg: + warnings.warn( + 'image_thr is deprecated, ' + 'please use det_bbox_thr instead', DeprecationWarning) + self.det_bbox_thr = data_cfg['image_thr'] + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db = self._load_coco_keypoint_annotations() + return gt_db + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval( + self.coco, coco_det, 'keypoints', self.sigmas, use_area=False) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/top_down/topdown_mpii_dataset.py b/mmpose/datasets/datasets/top_down/topdown_mpii_dataset.py new file mode 100644 index 0000000..751046a --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_mpii_dataset.py @@ -0,0 +1,275 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path as osp +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning +from scipy.io import loadmat, savemat + +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class TopDownMpiiDataset(Kpt2dSviewRgbImgTopDownDataset): + """MPII Dataset for top-down pose estimation. + + "2D Human Pose Estimation: New Benchmark and State of the Art Analysis" + ,CVPR'2014. More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + MPII keypoint indexes:: + + 0: 'right_ankle' + 1: 'right_knee', + 2: 'right_hip', + 3: 'left_hip', + 4: 'left_knee', + 5: 'left_ankle', + 6: 'pelvis', + 7: 'thorax', + 8: 'upper_neck', + 9: 'head_top', + 10: 'right_wrist', + 11: 'right_elbow', + 12: 'right_shoulder', + 13: 'left_shoulder', + 14: 'left_elbow', + 15: 'left_wrist' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. 
+ test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/mpii.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + coco_style=False, + test_mode=test_mode) + + self.db = self._get_db() + self.image_set = set(x['image_file'] for x in self.db) + self.num_images = len(self.image_set) + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + # create train/val split + with open(self.ann_file) as anno_file: + anno = json.load(anno_file) + + gt_db = [] + bbox_id = 0 + for a in anno: + image_name = a['image'] + + center = np.array(a['center'], dtype=np.float32) + scale = np.array([a['scale'], a['scale']], dtype=np.float32) + + # Adjust center/scale slightly to avoid cropping limbs + if center[0] != -1: + center[1] = center[1] + 15 * scale[1] + # padding to include proper amount of context + scale = scale * 1.25 + + # MPII uses matlab format, index is 1-based, + # we should first convert to 0-based index + center = center - 1 + + joints_3d = np.zeros((self.ann_info['num_joints'], 3), + dtype=np.float32) + joints_3d_visible = np.zeros((self.ann_info['num_joints'], 3), + dtype=np.float32) + if not self.test_mode: + joints = np.array(a['joints']) + joints_vis = np.array(a['joints_vis']) + assert len(joints) == self.ann_info['num_joints'], \ + f'joint num diff: {len(joints)}' + \ + f' vs {self.ann_info["num_joints"]}' + + joints_3d[:, 0:2] = joints[:, 0:2] - 1 + joints_3d_visible[:, :2] = joints_vis[:, None] + image_file = osp.join(self.img_prefix, image_name) + gt_db.append({ + 'image_file': image_file, + 'bbox_id': bbox_id, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1 + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCKh', **kwargs): + """Evaluate PCKh for MPII dataset. Adapted from + https://github.com/leoxiaobin/deep-high-resolution-net.pytorch + Copyright (c) Microsoft, under the MIT License. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['/val2017/000000\ + 397133.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap. + res_folder (str, optional): The folder to save the testing + results. Default: None. + metric (str | list[str]): Metrics to be performed. + Defaults: 'PCKh'. 
+ + Returns: + dict: PCKh for each joint + """ + + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCKh'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + kpts = [] + for result in results: + preds = result['preds'] + bbox_ids = result['bbox_ids'] + batch_size = len(bbox_ids) + for i in range(batch_size): + kpts.append({'keypoints': preds[i], 'bbox_id': bbox_ids[i]}) + kpts = self._sort_and_unique_bboxes(kpts) + + preds = np.stack([kpt['keypoints'] for kpt in kpts]) + + # convert 0-based index to 1-based index, + # and get the first two dimensions. + preds = preds[..., :2] + 1.0 + + if res_folder: + pred_file = osp.join(res_folder, 'pred.mat') + savemat(pred_file, mdict={'preds': preds}) + + SC_BIAS = 0.6 + threshold = 0.5 + + gt_file = osp.join(osp.dirname(self.ann_file), 'mpii_gt_val.mat') + gt_dict = loadmat(gt_file) + dataset_joints = gt_dict['dataset_joints'] + jnt_missing = gt_dict['jnt_missing'] + pos_gt_src = gt_dict['pos_gt_src'] + headboxes_src = gt_dict['headboxes_src'] + + pos_pred_src = np.transpose(preds, [1, 2, 0]) + + head = np.where(dataset_joints == 'head')[1][0] + lsho = np.where(dataset_joints == 'lsho')[1][0] + lelb = np.where(dataset_joints == 'lelb')[1][0] + lwri = np.where(dataset_joints == 'lwri')[1][0] + lhip = np.where(dataset_joints == 'lhip')[1][0] + lkne = np.where(dataset_joints == 'lkne')[1][0] + lank = np.where(dataset_joints == 'lank')[1][0] + + rsho = np.where(dataset_joints == 'rsho')[1][0] + relb = np.where(dataset_joints == 'relb')[1][0] + rwri = np.where(dataset_joints == 'rwri')[1][0] + rkne = np.where(dataset_joints == 'rkne')[1][0] + rank = np.where(dataset_joints == 'rank')[1][0] + rhip = np.where(dataset_joints == 'rhip')[1][0] + + jnt_visible = 1 - jnt_missing + uv_error = pos_pred_src - pos_gt_src + uv_err = np.linalg.norm(uv_error, axis=1) + headsizes = headboxes_src[1, :, :] - headboxes_src[0, :, :] + headsizes = np.linalg.norm(headsizes, axis=0) + headsizes *= SC_BIAS + scale = headsizes * np.ones((len(uv_err), 1), dtype=np.float32) + scaled_uv_err = uv_err / scale + scaled_uv_err = scaled_uv_err * jnt_visible + jnt_count = np.sum(jnt_visible, axis=1) + less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible + PCKh = 100. * np.sum(less_than_threshold, axis=1) / jnt_count + + # save + rng = np.arange(0, 0.5 + 0.01, 0.01) + pckAll = np.zeros((len(rng), 16), dtype=np.float32) + + for r, threshold in enumerate(rng): + less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible + pckAll[r, :] = 100. 
* np.sum( + less_than_threshold, axis=1) / jnt_count + + PCKh = np.ma.array(PCKh, mask=False) + PCKh.mask[6:8] = True + + jnt_count = np.ma.array(jnt_count, mask=False) + jnt_count.mask[6:8] = True + jnt_ratio = jnt_count / np.sum(jnt_count).astype(np.float64) + + name_value = [('Head', PCKh[head]), + ('Shoulder', 0.5 * (PCKh[lsho] + PCKh[rsho])), + ('Elbow', 0.5 * (PCKh[lelb] + PCKh[relb])), + ('Wrist', 0.5 * (PCKh[lwri] + PCKh[rwri])), + ('Hip', 0.5 * (PCKh[lhip] + PCKh[rhip])), + ('Knee', 0.5 * (PCKh[lkne] + PCKh[rkne])), + ('Ankle', 0.5 * (PCKh[lank] + PCKh[rank])), + ('PCKh', np.sum(PCKh * jnt_ratio)), + ('PCKh@0.1', np.sum(pckAll[10, :] * jnt_ratio))] + name_value = OrderedDict(name_value) + + return name_value + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts diff --git a/mmpose/datasets/datasets/top_down/topdown_mpii_trb_dataset.py b/mmpose/datasets/datasets/top_down/topdown_mpii_trb_dataset.py new file mode 100644 index 0000000..a0da65b --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_mpii_trb_dataset.py @@ -0,0 +1,310 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class TopDownMpiiTrbDataset(Kpt2dSviewRgbImgTopDownDataset): + """MPII-TRB Dataset dataset for top-down pose estimation. + + "TRB: A Novel Triplet Representation for Understanding 2D Human Body", + ICCV'2019. More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + MPII-TRB keypoint indexes:: + + 0: 'left_shoulder' + 1: 'right_shoulder' + 2: 'left_elbow' + 3: 'right_elbow' + 4: 'left_wrist' + 5: 'right_wrist' + 6: 'left_hip' + 7: 'right_hip' + 8: 'left_knee' + 9: 'right_knee' + 10: 'left_ankle' + 11: 'right_ankle' + 12: 'head' + 13: 'neck' + + 14: 'right_neck' + 15: 'left_neck' + 16: 'medial_right_shoulder' + 17: 'lateral_right_shoulder' + 18: 'medial_right_bow' + 19: 'lateral_right_bow' + 20: 'medial_right_wrist' + 21: 'lateral_right_wrist' + 22: 'medial_left_shoulder' + 23: 'lateral_left_shoulder' + 24: 'medial_left_bow' + 25: 'lateral_left_bow' + 26: 'medial_left_wrist' + 27: 'lateral_left_wrist' + 28: 'medial_right_hip' + 29: 'lateral_right_hip' + 30: 'medial_right_knee' + 31: 'lateral_right_knee' + 32: 'medial_right_ankle' + 33: 'lateral_right_ankle' + 34: 'medial_left_hip' + 35: 'lateral_left_hip' + 36: 'medial_left_knee' + 37: 'lateral_left_knee' + 38: 'medial_left_ankle' + 39: 'lateral_left_ankle' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. 
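+
+    Example (an illustrative numpy sketch of the matching rule applied by
+        ``_evaluate_kernel`` below; the headbox and keypoint coordinates
+        are made-up values)::
+
+        >>> import numpy as np
+        >>> headbox = np.array([10., 20., 50., 70.])   # x1, y1, x2, y2
+        >>> threshold = np.linalg.norm(headbox[:2] - headbox[2:]) * 0.3
+        >>> pred_pt = np.array([31., 42.])
+        >>> gt_pt = np.array([30., 40.])
+        >>> hit = np.linalg.norm(pred_pt - gt_pt) < threshold   # True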
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/mpii_trb.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.db = self._get_db(ann_file) + self.image_set = set(x['image_file'] for x in self.db) + self.num_images = len(self.image_set) + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self, ann_file): + """Load dataset.""" + with open(ann_file, 'r') as f: + data = json.load(f) + tmpl = dict( + image_file=None, + bbox_id=None, + center=None, + scale=None, + rotation=0, + joints_3d=None, + joints_3d_visible=None, + dataset=self.dataset_name) + + imid2info = { + int(osp.splitext(x['file_name'])[0]): x + for x in data['images'] + } + + num_joints = self.ann_info['num_joints'] + gt_db = [] + + for anno in data['annotations']: + newitem = cp.deepcopy(tmpl) + image_id = anno['image_id'] + newitem['bbox_id'] = anno['id'] + newitem['image_file'] = osp.join(self.img_prefix, + imid2info[image_id]['file_name']) + + if max(anno['keypoints']) == 0: + continue + + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + for ipt in range(num_joints): + joints_3d[ipt, 0] = anno['keypoints'][ipt * 3 + 0] + joints_3d[ipt, 1] = anno['keypoints'][ipt * 3 + 1] + joints_3d[ipt, 2] = 0 + t_vis = min(anno['keypoints'][ipt * 3 + 2], 1) + joints_3d_visible[ipt, :] = (t_vis, t_vis, 0) + + center = np.array(anno['center'], dtype=np.float32) + scale = self.ann_info['image_size'] / anno['scale'] / 200.0 + newitem['center'] = center + newitem['scale'] = scale + newitem['joints_3d'] = joints_3d + newitem['joints_3d_visible'] = joints_3d_visible + if 'headbox' in anno: + newitem['headbox'] = anno['headbox'] + gt_db.append(newitem) + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + def _evaluate_kernel(self, pred, joints_3d, joints_3d_visible, headbox): + """Evaluate one example.""" + num_joints = self.ann_info['num_joints'] + headbox = np.array(headbox) + threshold = np.linalg.norm(headbox[:2] - headbox[2:]) * 0.3 + hit = np.zeros(num_joints, dtype=np.float32) + exist = np.zeros(num_joints, dtype=np.float32) + + for i in range(num_joints): + pred_pt = pred[i] + gt_pt = joints_3d[i] + vis = joints_3d_visible[i][0] + if vis: + exist[i] = 1 + else: + continue + distance = np.linalg.norm(pred_pt[:2] - gt_pt[:2]) + if distance < threshold: + hit[i] = 1 + return hit, exist + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCKh', **kwargs): + """Evaluate PCKh for MPII-TRB dataset. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['/val2017/\ + 000000397133.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap. 
+ - bbox_ids (list[str]): For example, ['27407']. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metrics to be performed. + Defaults: 'PCKh'. + + Returns: + dict: PCKh for each joint + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCKh'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + str_image_path = image_paths[i] + image_id = int(osp.basename(osp.splitext(str_image_path)[0])) + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def _report_metric(self, res_file): + """Keypoint evaluation. + + Report Mean Acc of skeleton, contour and all joints. + """ + num_joints = self.ann_info['num_joints'] + hit = np.zeros(num_joints, dtype=np.float32) + exist = np.zeros(num_joints, dtype=np.float32) + + with open(res_file, 'r') as fin: + preds = json.load(fin) + + assert len(preds) == len( + self.db), f'len(preds)={len(preds)}, len(self.db)={len(self.db)}' + for pred, item in zip(preds, self.db): + h, e = self._evaluate_kernel(pred['keypoints'], item['joints_3d'], + item['joints_3d_visible'], + item['headbox']) + hit += h + exist += e + skeleton = np.sum(hit[:14]) / np.sum(exist[:14]) + contour = np.sum(hit[14:]) / np.sum(exist[14:]) + mean = np.sum(hit) / np.sum(exist) + + info_str = [] + info_str.append(('Skeleton_acc', skeleton.item())) + info_str.append(('Contour_acc', contour.item())) + info_str.append(('PCKh', mean.item())) + return info_str + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts diff --git a/mmpose/datasets/datasets/top_down/topdown_ochuman_dataset.py b/mmpose/datasets/datasets/top_down/topdown_ochuman_dataset.py new file mode 100644 index 0000000..0ad6b81 --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_ochuman_dataset.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv import Config + +from ...builder import DATASETS +from .topdown_coco_dataset import TopDownCocoDataset + + +@DATASETS.register_module() +class TopDownOCHumanDataset(TopDownCocoDataset): + """OChuman dataset for top-down pose estimation. 
+ + "Pose2Seg: Detection Free Human Instance Segmentation", CVPR'2019. + More details can be found in the `paper + `__ . + + "Occluded Human (OCHuman)" dataset contains 8110 heavily occluded + human instances within 4731 images. OCHuman dataset is designed for + validation and testing. To evaluate on OCHuman, the model should be + trained on COCO training set, and then test the robustness of the + model to occlusion using OCHuman. + + OCHuman keypoint indexes (same as COCO):: + + 0: 'nose', + 1: 'left_eye', + 2: 'right_eye', + 3: 'left_ear', + 4: 'right_ear', + 5: 'left_shoulder', + 6: 'right_shoulder', + 7: 'left_elbow', + 8: 'right_elbow', + 9: 'left_wrist', + 10: 'right_wrist', + 11: 'left_hip', + 12: 'right_hip', + 13: 'left_knee', + 14: 'right_knee', + 15: 'left_ankle', + 16: 'right_ankle' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/ochuman.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(TopDownCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db = self._load_coco_keypoint_annotations() + return gt_db diff --git a/mmpose/datasets/datasets/top_down/topdown_posetrack18_dataset.py b/mmpose/datasets/datasets/top_down/topdown_posetrack18_dataset.py new file mode 100644 index 0000000..c690860 --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_posetrack18_dataset.py @@ -0,0 +1,312 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning + +from ....core.post_processing import oks_nms, soft_oks_nms +from ...builder import DATASETS +from .topdown_coco_dataset import TopDownCocoDataset + +try: + from poseval import eval_helpers + from poseval.evaluateAP import evaluateAP + has_poseval = True +except (ImportError, ModuleNotFoundError): + has_poseval = False + + +@DATASETS.register_module() +class TopDownPoseTrack18Dataset(TopDownCocoDataset): + """PoseTrack18 dataset for top-down pose estimation. + + "Posetrack: A benchmark for human pose estimation and tracking", CVPR'2018. + More details can be found in the `paper + `__ . 
+ + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + PoseTrack2018 keypoint indexes:: + + 0: 'nose', + 1: 'head_bottom', + 2: 'head_top', + 3: 'left_ear', + 4: 'right_ear', + 5: 'left_shoulder', + 6: 'right_shoulder', + 7: 'left_elbow', + 8: 'right_elbow', + 9: 'left_wrist', + 10: 'right_wrist', + 11: 'left_hip', + 12: 'right_hip', + 13: 'left_knee', + 14: 'right_knee', + 15: 'left_ankle', + 16: 'right_ankle' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/posetrack18.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(TopDownCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mAP', **kwargs): + """Evaluate posetrack keypoint results. The pose prediction results + will be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - num_keypoints: K + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['val/010016_mpii_test\ + /000024.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap. + - bbox_id (list(int)) + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'mAP'. + + Returns: + dict: Evaluation results for evaluation metric. 
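+
+        Example (an illustrative numpy sketch of the rescoring step
+            performed in this method before OKS-NMS; the confidences are
+            made-up values)::
+
+            >>> import numpy as np
+            >>> kpt_conf = np.array([0.9, 0.8, 0.1, 0.7])  # per keypoint
+            >>> box_score, vis_thr = 0.95, 0.2
+            >>> valid = kpt_conf > vis_thr
+            >>> kpt_score = kpt_conf[valid].mean() if valid.any() else 0.0
+            >>> final_score = kpt_score * box_score        # 0.8 * 0.95 = 0.76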
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['mAP'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + else: + tmp_folder = tempfile.TemporaryDirectory() + res_folder = tmp_folder.name + + gt_folder = osp.join( + osp.dirname(self.ann_file), + osp.splitext(self.ann_file.split('_')[-1])[0]) + + kpts = defaultdict(list) + + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + kpts[image_id].append({ + 'keypoints': preds[i], + 'center': boxes[i][0:2], + 'scale': boxes[i][2:4], + 'area': boxes[i][4], + 'score': boxes[i][5], + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + # rescoring and oks nms + num_joints = self.ann_info['num_joints'] + vis_thr = self.vis_thr + oks_thr = self.oks_thr + valid_kpts = defaultdict(list) + for image_id in kpts.keys(): + img_kpts = kpts[image_id] + for n_p in img_kpts: + box_score = n_p['score'] + kpt_score = 0 + valid_num = 0 + for n_jt in range(0, num_joints): + t_s = n_p['keypoints'][n_jt][2] + if t_s > vis_thr: + kpt_score = kpt_score + t_s + valid_num = valid_num + 1 + if valid_num != 0: + kpt_score = kpt_score / valid_num + # rescoring + n_p['score'] = kpt_score * box_score + + if self.use_nms: + nms = soft_oks_nms if self.soft_nms else oks_nms + keep = nms(img_kpts, oks_thr, sigmas=self.sigmas) + valid_kpts[image_id].append( + [img_kpts[_keep] for _keep in keep]) + else: + valid_kpts[image_id].append(img_kpts) + + self._write_posetrack18_keypoint_results(valid_kpts, gt_folder, + res_folder) + + info_str = self._do_python_keypoint_eval(gt_folder, res_folder) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + @staticmethod + def _write_posetrack18_keypoint_results(keypoint_results, gt_folder, + pred_folder): + """Write results into a json file. + + Args: + keypoint_results (dict): keypoint results organized by image_id. + gt_folder (str): Path of directory for official gt files. + pred_folder (str): Path of directory to save the results. 
+ """ + categories = [] + + cat = {} + cat['supercategory'] = 'person' + cat['id'] = 1 + cat['name'] = 'person' + cat['keypoints'] = [ + 'nose', 'head_bottom', 'head_top', 'left_ear', 'right_ear', + 'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow', + 'left_wrist', 'right_wrist', 'left_hip', 'right_hip', 'left_knee', + 'right_knee', 'left_ankle', 'right_ankle' + ] + cat['skeleton'] = [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], + [6, 12], [7, 13], [6, 7], [6, 8], [7, 9], [8, 10], + [9, 11], [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], + [4, 6], [5, 7]] + categories.append(cat) + + json_files = [ + pos for pos in os.listdir(gt_folder) if pos.endswith('.json') + ] + for json_file in json_files: + + with open(osp.join(gt_folder, json_file), 'r') as f: + gt = json.load(f) + + annotations = [] + images = [] + + for image in gt['images']: + im = {} + im['id'] = image['id'] + im['file_name'] = image['file_name'] + images.append(im) + + img_kpts = keypoint_results[im['id']] + + if len(img_kpts) == 0: + continue + for track_id, img_kpt in enumerate(img_kpts[0]): + ann = {} + ann['image_id'] = img_kpt['image_id'] + ann['keypoints'] = np.array( + img_kpt['keypoints']).reshape(-1).tolist() + ann['scores'] = np.array(ann['keypoints']).reshape( + [-1, 3])[:, 2].tolist() + ann['score'] = float(img_kpt['score']) + ann['track_id'] = track_id + annotations.append(ann) + + info = {} + info['images'] = images + info['categories'] = categories + info['annotations'] = annotations + + with open(osp.join(pred_folder, json_file), 'w') as f: + json.dump(info, f, sort_keys=True, indent=4) + + def _do_python_keypoint_eval(self, gt_folder, pred_folder): + """Keypoint evaluation using poseval.""" + + if not has_poseval: + raise ImportError('Please install poseval package for evaluation' + 'on PoseTrack dataset ' + '(see requirements/optional.txt)') + + argv = ['', gt_folder + '/', pred_folder + '/'] + + print('Loading data') + gtFramesAll, prFramesAll = eval_helpers.load_data_dir(argv) + + print('# gt frames :', len(gtFramesAll)) + print('# pred frames:', len(prFramesAll)) + + # evaluate per-frame multi-person pose estimation (AP) + # compute AP + print('Evaluation of per-frame multi-person pose estimation') + apAll, _, _ = evaluateAP(gtFramesAll, prFramesAll, None, False, False) + + # print AP + print('Average Precision (AP) metric:') + eval_helpers.printTable(apAll) + + stats = eval_helpers.getCum(apAll) + + stats_names = [ + 'Head AP', 'Shou AP', 'Elb AP', 'Wri AP', 'Hip AP', 'Knee AP', + 'Ankl AP', 'Total AP' + ] + + info_str = list(zip(stats_names, stats)) + + return info_str diff --git a/mmpose/datasets/datasets/top_down/topdown_posetrack18_video_dataset.py b/mmpose/datasets/datasets/top_down/topdown_posetrack18_video_dataset.py new file mode 100644 index 0000000..045148d --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_posetrack18_video_dataset.py @@ -0,0 +1,549 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import json_tricks as json +import numpy as np +from mmcv import deprecated_api_warning + +from ....core.post_processing import oks_nms, soft_oks_nms +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbVidTopDownDataset + +try: + from poseval import eval_helpers + from poseval.evaluateAP import evaluateAP + has_poseval = True +except (ImportError, ModuleNotFoundError): + has_poseval = False + + +@DATASETS.register_module() +class TopDownPoseTrack18VideoDataset(Kpt2dSviewRgbVidTopDownDataset): + """PoseTrack18 dataset for top-down pose estimation. + + "Posetrack: A benchmark for human pose estimation and tracking", CVPR'2018. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + PoseTrack2018 keypoint indexes:: + + 0: 'nose', + 1: 'head_bottom', + 2: 'head_top', + 3: 'left_ear', + 4: 'right_ear', + 5: 'left_shoulder', + 6: 'right_shoulder', + 7: 'left_elbow', + 8: 'right_elbow', + 9: 'left_wrist', + 10: 'right_wrist', + 11: 'left_hip', + 12: 'right_hip', + 13: 'left_knee', + 14: 'right_knee', + 15: 'left_ankle', + 16: 'right_ankle' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where videos/images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + ph_fill_len (int): The length of the placeholder to fill in the + image filenames, default: 6 in PoseTrack18. 
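+
+    Example (a minimal sketch of the extra temporal keys read in
+        ``__init__`` below, on top of the usual top-down ``data_cfg``
+        entries; the values shown are placeholders)::
+
+        >>> extra_cfg = dict(
+        ...     frame_weight_train=(0.0, 1.0),
+        ...     frame_weight_test=(0.0, 1.0),
+        ...     frame_index_rand=True,
+        ...     frame_index_range=[-2, 2],
+        ...     num_adj_frames=1,
+        ...     frame_indices_train=None,
+        ...     frame_indices_test=[-2, -1, 0, 1, 2])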
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False, + ph_fill_len=6): + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + self.frame_weight_train = data_cfg['frame_weight_train'] + self.frame_weight_test = data_cfg['frame_weight_test'] + self.frame_weight = self.frame_weight_test \ + if self.test_mode else self.frame_weight_train + + self.ph_fill_len = ph_fill_len + + # select the frame indices + self.frame_index_rand = data_cfg.get('frame_index_rand', True) + self.frame_index_range = data_cfg.get('frame_index_range', [-2, 2]) + self.num_adj_frames = data_cfg.get('num_adj_frames', 1) + self.frame_indices_train = data_cfg.get('frame_indices_train', None) + self.frame_indices_test = data_cfg.get('frame_indices_test', + [-2, -1, 0, 1, 2]) + + if self.frame_indices_train is not None: + self.frame_indices_train.sort() + self.frame_indices_test.sort() + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + if (not self.test_mode) or self.use_gt_bbox: + # use ground truth bbox + gt_db = self._load_coco_keypoint_annotations() + else: + # use bbox from detection + gt_db = self._load_posetrack_person_detection_results() + return gt_db + + def _load_coco_keypoint_annotations(self): + """Ground truth bbox and keypoints.""" + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. 
+ + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = self.ann_info['num_joints'] + + file_name = img_ann['file_name'] + nframes = int(img_ann['nframes']) + frame_id = int(img_ann['frame_id']) + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_files = [] + cur_image_file = osp.join(self.img_prefix, self.id2name[img_id]) + image_files.append(cur_image_file) + + # "images/val/012834_mpii_test/000000.jpg" -->> "000000.jpg" + cur_image_name = file_name.split('/')[-1] + ref_idx = int(cur_image_name.replace('.jpg', '')) + + # select the frame indices + if not self.test_mode and self.frame_indices_train is not None: + indices = self.frame_indices_train + elif not self.test_mode and self.frame_index_rand: + low, high = self.frame_index_range + indices = np.random.randint(low, high + 1, self.num_adj_frames) + else: + indices = self.frame_indices_test + + for index in indices: + if self.test_mode and index == 0: + continue + # the supporting frame index + support_idx = ref_idx + index + support_idx = np.clip(support_idx, 0, nframes - 1) + sup_image_file = cur_image_file.replace( + cur_image_name, + str(support_idx).zfill(self.ph_fill_len) + '.jpg') + + if osp.exists(sup_image_file): + image_files.append(sup_image_file) + else: + warnings.warn( + f'{sup_image_file} does not exist, ' + f'use {cur_image_file} instead.', UserWarning) + image_files.append(cur_image_file) + rec.append({ + 'image_file': image_files, + 'center': center, + 'scale': scale, + 'bbox': obj['clean_bbox'][:4], + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': bbox_id, + 'nframes': nframes, + 'frame_id': frame_id, + 'frame_weight': self.frame_weight + }) + bbox_id = bbox_id + 1 + + return rec + + def _load_posetrack_person_detection_results(self): + """Load Posetrack person detection results. + + Only in test mode. + """ + num_joints = self.ann_info['num_joints'] + all_boxes = None + with open(self.bbox_file, 'r') as f: + all_boxes = json.load(f) + + if not all_boxes: + raise ValueError('=> Load %s fail!' 
% self.bbox_file) + + print(f'=> Total boxes: {len(all_boxes)}') + + kpt_db = [] + bbox_id = 0 + for det_res in all_boxes: + if det_res['category_id'] != 1: + continue + + score = det_res['score'] + if score < self.det_bbox_thr: + continue + + box = det_res['bbox'] + + # deal with different bbox file formats + if 'nframes' in det_res and 'frame_id' in det_res: + nframes = int(det_res['nframes']) + frame_id = int(det_res['frame_id']) + elif 'image_name' in det_res: + img_id = self.name2id[det_res['image_name']] + img_ann = self.coco.loadImgs(img_id)[0] + nframes = int(img_ann['nframes']) + frame_id = int(img_ann['frame_id']) + else: + img_id = det_res['image_id'] + img_ann = self.coco.loadImgs(img_id)[0] + nframes = int(img_ann['nframes']) + frame_id = int(img_ann['frame_id']) + + image_files = [] + if 'image_name' in det_res: + file_name = det_res['image_name'] + else: + file_name = self.id2name[det_res['image_id']] + + cur_image_file = osp.join(self.img_prefix, file_name) + image_files.append(cur_image_file) + + # "images/val/012834_mpii_test/000000.jpg" -->> "000000.jpg" + cur_image_name = file_name.split('/')[-1] + ref_idx = int(cur_image_name.replace('.jpg', '')) + + indices = self.frame_indices_test + for index in indices: + if self.test_mode and index == 0: + continue + # the supporting frame index + support_idx = ref_idx + index + support_idx = np.clip(support_idx, 0, nframes - 1) + sup_image_file = cur_image_file.replace( + cur_image_name, + str(support_idx).zfill(self.ph_fill_len) + '.jpg') + + if osp.exists(sup_image_file): + image_files.append(sup_image_file) + else: + warnings.warn(f'{sup_image_file} does not exist, ' + f'use {cur_image_file} instead.') + image_files.append(cur_image_file) + + center, scale = self._xywh2cs(*box[:4]) + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.ones((num_joints, 3), dtype=np.float32) + kpt_db.append({ + 'image_file': image_files, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'bbox': box[:4], + 'bbox_score': score, + 'dataset': self.dataset_name, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'bbox_id': bbox_id, + 'nframes': nframes, + 'frame_id': frame_id, + 'frame_weight': self.frame_weight + }) + bbox_id = bbox_id + 1 + print(f'=> Total boxes after filter ' + f'low score@{self.det_bbox_thr}: {bbox_id}') + return kpt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mAP', **kwargs): + """Evaluate posetrack keypoint results. The pose prediction results + will be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - num_keypoints: K + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['val/010016_mpii_test\ + /000024.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap. + - bbox_id (list(int)) + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'mAP'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['mAP'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + else: + tmp_folder = tempfile.TemporaryDirectory() + res_folder = tmp_folder.name + + gt_folder = osp.join( + osp.dirname(self.ann_file), + osp.splitext(self.ann_file.split('_')[-1])[0]) + + kpts = defaultdict(list) + + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + if not isinstance(image_paths[i], list): + image_id = self.name2id[image_paths[i] + [len(self.img_prefix):]] + else: + image_id = self.name2id[image_paths[i][0] + [len(self.img_prefix):]] + + kpts[image_id].append({ + 'keypoints': preds[i], + 'center': boxes[i][0:2], + 'scale': boxes[i][2:4], + 'area': boxes[i][4], + 'score': boxes[i][5], + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + # rescoring and oks nms + num_joints = self.ann_info['num_joints'] + vis_thr = self.vis_thr + oks_thr = self.oks_thr + valid_kpts = defaultdict(list) + for image_id in kpts.keys(): + img_kpts = kpts[image_id] + for n_p in img_kpts: + box_score = n_p['score'] + kpt_score = 0 + valid_num = 0 + for n_jt in range(0, num_joints): + t_s = n_p['keypoints'][n_jt][2] + if t_s > vis_thr: + kpt_score = kpt_score + t_s + valid_num = valid_num + 1 + if valid_num != 0: + kpt_score = kpt_score / valid_num + # rescoring + n_p['score'] = kpt_score * box_score + + if self.use_nms: + nms = soft_oks_nms if self.soft_nms else oks_nms + keep = nms(img_kpts, oks_thr, sigmas=self.sigmas) + valid_kpts[image_id].append( + [img_kpts[_keep] for _keep in keep]) + else: + valid_kpts[image_id].append(img_kpts) + + self._write_keypoint_results(valid_kpts, gt_folder, res_folder) + + info_str = self._do_keypoint_eval(gt_folder, res_folder) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + @staticmethod + def _write_keypoint_results(keypoint_results, gt_folder, pred_folder): + """Write results into a json file. + + Args: + keypoint_results (dict): keypoint results organized by image_id. + gt_folder (str): Path of directory for official gt files. + pred_folder (str): Path of directory to save the results. 
+ """ + categories = [] + + cat = {} + cat['supercategory'] = 'person' + cat['id'] = 1 + cat['name'] = 'person' + cat['keypoints'] = [ + 'nose', 'head_bottom', 'head_top', 'left_ear', 'right_ear', + 'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow', + 'left_wrist', 'right_wrist', 'left_hip', 'right_hip', 'left_knee', + 'right_knee', 'left_ankle', 'right_ankle' + ] + cat['skeleton'] = [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], + [6, 12], [7, 13], [6, 7], [6, 8], [7, 9], [8, 10], + [9, 11], [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], + [4, 6], [5, 7]] + categories.append(cat) + + json_files = [ + pos for pos in os.listdir(gt_folder) if pos.endswith('.json') + ] + for json_file in json_files: + + with open(osp.join(gt_folder, json_file), 'r') as f: + gt = json.load(f) + + annotations = [] + images = [] + + for image in gt['images']: + im = {} + im['id'] = image['id'] + im['file_name'] = image['file_name'] + images.append(im) + + img_kpts = keypoint_results[im['id']] + + if len(img_kpts) == 0: + continue + for track_id, img_kpt in enumerate(img_kpts[0]): + ann = {} + ann['image_id'] = img_kpt['image_id'] + ann['keypoints'] = np.array( + img_kpt['keypoints']).reshape(-1).tolist() + ann['scores'] = np.array(ann['keypoints']).reshape( + [-1, 3])[:, 2].tolist() + ann['score'] = float(img_kpt['score']) + ann['track_id'] = track_id + annotations.append(ann) + + info = {} + info['images'] = images + info['categories'] = categories + info['annotations'] = annotations + + with open(osp.join(pred_folder, json_file), 'w') as f: + json.dump(info, f, sort_keys=True, indent=4) + + def _do_keypoint_eval(self, gt_folder, pred_folder): + """Keypoint evaluation using poseval.""" + + if not has_poseval: + raise ImportError('Please install poseval package for evaluation' + 'on PoseTrack dataset ' + '(see requirements/optional.txt)') + + argv = ['', gt_folder + '/', pred_folder + '/'] + + print('Loading data') + gtFramesAll, prFramesAll = eval_helpers.load_data_dir(argv) + + print('# gt frames :', len(gtFramesAll)) + print('# pred frames:', len(prFramesAll)) + + # evaluate per-frame multi-person pose estimation (AP) + # compute AP + print('Evaluation of per-frame multi-person pose estimation') + apAll, _, _ = evaluateAP(gtFramesAll, prFramesAll, None, False, False) + + # print AP + print('Average Precision (AP) metric:') + eval_helpers.printTable(apAll) + + stats = eval_helpers.getCum(apAll) + + stats_names = [ + 'Head AP', 'Shou AP', 'Elb AP', 'Wri AP', 'Hip AP', 'Knee AP', + 'Ankl AP', 'Total AP' + ] + + info_str = list(zip(stats_names, stats)) + + return info_str diff --git a/mmpose/datasets/pipelines/__init__.py b/mmpose/datasets/pipelines/__init__.py new file mode 100644 index 0000000..cf06db1 --- /dev/null +++ b/mmpose/datasets/pipelines/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bottom_up_transform import * # noqa +from .hand_transform import * # noqa +from .loading import LoadImageFromFile # noqa +from .mesh_transform import * # noqa +from .pose3d_transform import * # noqa +from .shared_transform import * # noqa +from .top_down_transform import * # noqa diff --git a/mmpose/datasets/pipelines/bottom_up_transform.py b/mmpose/datasets/pipelines/bottom_up_transform.py new file mode 100644 index 0000000..032ce45 --- /dev/null +++ b/mmpose/datasets/pipelines/bottom_up_transform.py @@ -0,0 +1,816 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import cv2 +import numpy as np + +from mmpose.core.post_processing import (get_affine_transform, get_warp_matrix, + warp_affine_joints) +from mmpose.datasets.builder import PIPELINES +from .shared_transform import Compose + + +def _ceil_to_multiples_of(x, base=64): + """Transform x to the integral multiple of the base.""" + return int(np.ceil(x / base)) * base + + +def _get_multi_scale_size(image, + input_size, + current_scale, + min_scale, + use_udp=False): + """Get the size for multi-scale training. + + Args: + image: Input image. + input_size (np.ndarray[2]): Size (w, h) of the image input. + current_scale (float): Scale factor. + min_scale (float): Minimal scale. + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + + Returns: + tuple: A tuple containing multi-scale sizes. + + - (w_resized, h_resized) (tuple(int)): resized width/height + - center (np.ndarray)image center + - scale (np.ndarray): scales wrt width/height + """ + assert len(input_size) == 2 + h, w, _ = image.shape + + # calculate the size for min_scale + min_input_w = _ceil_to_multiples_of(min_scale * input_size[0], 64) + min_input_h = _ceil_to_multiples_of(min_scale * input_size[1], 64) + if w < h: + w_resized = int(min_input_w * current_scale / min_scale) + h_resized = int( + _ceil_to_multiples_of(min_input_w / w * h, 64) * current_scale / + min_scale) + if use_udp: + scale_w = w - 1.0 + scale_h = (h_resized - 1.0) / (w_resized - 1.0) * (w - 1.0) + else: + scale_w = w / 200.0 + scale_h = h_resized / w_resized * w / 200.0 + else: + h_resized = int(min_input_h * current_scale / min_scale) + w_resized = int( + _ceil_to_multiples_of(min_input_h / h * w, 64) * current_scale / + min_scale) + if use_udp: + scale_h = h - 1.0 + scale_w = (w_resized - 1.0) / (h_resized - 1.0) * (h - 1.0) + else: + scale_h = h / 200.0 + scale_w = w_resized / h_resized * h / 200.0 + if use_udp: + center = (scale_w / 2.0, scale_h / 2.0) + else: + center = np.array([round(w / 2.0), round(h / 2.0)]) + return (w_resized, h_resized), center, np.array([scale_w, scale_h]) + + +def _resize_align_multi_scale(image, input_size, current_scale, min_scale): + """Resize the images for multi-scale training. + + Args: + image: Input image + input_size (np.ndarray[2]): Size (w, h) of the image input + current_scale (float): Current scale + min_scale (float): Minimal scale + + Returns: + tuple: A tuple containing image info. + + - image_resized (np.ndarray): resized image + - center (np.ndarray): center of image + - scale (np.ndarray): scale + """ + assert len(input_size) == 2 + size_resized, center, scale = _get_multi_scale_size( + image, input_size, current_scale, min_scale) + + trans = get_affine_transform(center, scale, 0, size_resized) + image_resized = cv2.warpAffine(image, trans, size_resized) + + return image_resized, center, scale + + +def _resize_align_multi_scale_udp(image, input_size, current_scale, min_scale): + """Resize the images for multi-scale training. + + Args: + image: Input image + input_size (np.ndarray[2]): Size (w, h) of the image input + current_scale (float): Current scale + min_scale (float): Minimal scale + + Returns: + tuple: A tuple containing image info. 
+ + - image_resized (np.ndarray): resized image + - center (np.ndarray): center of image + - scale (np.ndarray): scale + """ + assert len(input_size) == 2 + size_resized, _, _ = _get_multi_scale_size(image, input_size, + current_scale, min_scale, True) + + _, center, scale = _get_multi_scale_size(image, input_size, min_scale, + min_scale, True) + + trans = get_warp_matrix( + theta=0, + size_input=np.array(scale, dtype=np.float32), + size_dst=np.array(size_resized, dtype=np.float32) - 1.0, + size_target=np.array(scale, dtype=np.float32)) + image_resized = cv2.warpAffine( + image.copy(), trans, size_resized, flags=cv2.INTER_LINEAR) + + return image_resized, center, scale + + +class HeatmapGenerator: + """Generate heatmaps for bottom-up models. + + Args: + num_joints (int): Number of keypoints + output_size (np.ndarray): Size (w, h) of feature map + sigma (int): Sigma of the heatmaps. + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + """ + + def __init__(self, output_size, num_joints, sigma=-1, use_udp=False): + if not isinstance(output_size, np.ndarray): + output_size = np.array(output_size) + if output_size.size > 1: + assert len(output_size) == 2 + self.output_size = output_size + else: + self.output_size = np.array([output_size, output_size], + dtype=np.int) + self.num_joints = num_joints + if sigma < 0: + sigma = self.output_size.prod()**0.5 / 64 + self.sigma = sigma + size = 6 * sigma + 3 + self.use_udp = use_udp + if use_udp: + self.x = np.arange(0, size, 1, np.float32) + self.y = self.x[:, None] + else: + x = np.arange(0, size, 1, np.float32) + y = x[:, None] + x0, y0 = 3 * sigma + 1, 3 * sigma + 1 + self.g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)) + + def __call__(self, joints): + """Generate heatmaps.""" + hms = np.zeros( + (self.num_joints, self.output_size[1], self.output_size[0]), + dtype=np.float32) + + sigma = self.sigma + for p in joints: + for idx, pt in enumerate(p): + if pt[2] > 0: + x, y = int(pt[0]), int(pt[1]) + if x < 0 or y < 0 or \ + x >= self.output_size[0] or y >= self.output_size[1]: + continue + + if self.use_udp: + x0 = 3 * sigma + 1 + pt[0] - x + y0 = 3 * sigma + 1 + pt[1] - y + g = np.exp(-((self.x - x0)**2 + (self.y - y0)**2) / + (2 * sigma**2)) + else: + g = self.g + + ul = int(np.round(x - 3 * sigma - + 1)), int(np.round(y - 3 * sigma - 1)) + br = int(np.round(x + 3 * sigma + + 2)), int(np.round(y + 3 * sigma + 2)) + + c, d = max(0, + -ul[0]), min(br[0], self.output_size[0]) - ul[0] + a, b = max(0, + -ul[1]), min(br[1], self.output_size[1]) - ul[1] + + cc, dd = max(0, ul[0]), min(br[0], self.output_size[0]) + aa, bb = max(0, ul[1]), min(br[1], self.output_size[1]) + hms[idx, aa:bb, + cc:dd] = np.maximum(hms[idx, aa:bb, cc:dd], g[a:b, + c:d]) + return hms + + +class JointsEncoder: + """Encodes the visible joints into (coordinates, score); The coordinate of + one joint and its score are of `int` type. + + (idx * output_size**2 + y * output_size + x, 1) or (0, 0). + + Args: + max_num_people(int): Max number of people in an image + num_joints(int): Number of keypoints + output_size(np.ndarray): Size (w, h) of feature map + tag_per_joint(bool): Option to use one tag map per joint. 
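+
+ Example (values are illustrative)::
+
+ With ``output_size=(128, 128)`` and ``tag_per_joint=True``, a visible
+ joint with index ``idx=3`` located at feature-map pixel ``(x=10, y=20)``
+ is encoded as ``(3 * 128 * 128 + 20 * 128 + 10, 1) = (51722, 1)``.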
+ """ + + def __init__(self, max_num_people, num_joints, output_size, tag_per_joint): + self.max_num_people = max_num_people + self.num_joints = num_joints + if not isinstance(output_size, np.ndarray): + output_size = np.array(output_size) + if output_size.size > 1: + assert len(output_size) == 2 + self.output_size = output_size + else: + self.output_size = np.array([output_size, output_size], + dtype=np.int) + self.tag_per_joint = tag_per_joint + + def __call__(self, joints): + """ + Note: + - number of people in image: N + - number of keypoints: K + - max number of people in an image: M + + Args: + joints (np.ndarray[N,K,3]) + + Returns: + visible_kpts (np.ndarray[M,K,2]). + """ + visible_kpts = np.zeros((self.max_num_people, self.num_joints, 2), + dtype=np.float32) + for i in range(len(joints)): + tot = 0 + for idx, pt in enumerate(joints[i]): + x, y = int(pt[0]), int(pt[1]) + if (pt[2] > 0 and 0 <= y < self.output_size[1] + and 0 <= x < self.output_size[0]): + if self.tag_per_joint: + visible_kpts[i][tot] = \ + (idx * self.output_size.prod() + + y * self.output_size[0] + x, 1) + else: + visible_kpts[i][tot] = (y * self.output_size[0] + x, 1) + tot += 1 + return visible_kpts + + +class PAFGenerator: + """Generate part affinity fields. + + Args: + output_size (np.ndarray): Size (w, h) of feature map. + limb_width (int): Limb width of part affinity fields. + skeleton (list[list]): connections of joints. + """ + + def __init__(self, output_size, limb_width, skeleton): + if not isinstance(output_size, np.ndarray): + output_size = np.array(output_size) + if output_size.size > 1: + assert len(output_size) == 2 + self.output_size = output_size + else: + self.output_size = np.array([output_size, output_size], + dtype=np.int) + self.limb_width = limb_width + self.skeleton = skeleton + + def _accumulate_paf_map_(self, pafs, src, dst, count): + """Accumulate part affinity fields between two given joints. + + Args: + pafs (np.ndarray[2,H,W]): paf maps (2 dimensions:x axis and + y axis) for a certain limb connection. This argument will + be modified inplace. + src (np.ndarray[2,]): coordinates of the source joint. + dst (np.ndarray[2,]): coordinates of the destination joint. + count (np.ndarray[H,W]): count map that preserves the number + of non-zero vectors at each point. This argument will be + modified inplace. 
+ """ + limb_vec = dst - src + norm = np.linalg.norm(limb_vec) + if norm == 0: + unit_limb_vec = np.zeros(2) + else: + unit_limb_vec = limb_vec / norm + + min_x = max(np.floor(min(src[0], dst[0]) - self.limb_width), 0) + max_x = min( + np.ceil(max(src[0], dst[0]) + self.limb_width), + self.output_size[0] - 1) + min_y = max(np.floor(min(src[1], dst[1]) - self.limb_width), 0) + max_y = min( + np.ceil(max(src[1], dst[1]) + self.limb_width), + self.output_size[1] - 1) + + range_x = list(range(int(min_x), int(max_x + 1), 1)) + range_y = list(range(int(min_y), int(max_y + 1), 1)) + + mask = np.zeros_like(count, dtype=bool) + if len(range_x) > 0 and len(range_y) > 0: + xx, yy = np.meshgrid(range_x, range_y) + delta_x = xx - src[0] + delta_y = yy - src[1] + dist = np.abs(delta_x * unit_limb_vec[1] - + delta_y * unit_limb_vec[0]) + mask_local = (dist < self.limb_width) + mask[yy, xx] = mask_local + + pafs[0, mask] += unit_limb_vec[0] + pafs[1, mask] += unit_limb_vec[1] + count += mask + + return pafs, count + + def __call__(self, joints): + """Generate the target part affinity fields.""" + pafs = np.zeros( + (len(self.skeleton) * 2, self.output_size[1], self.output_size[0]), + dtype=np.float32) + + for idx, sk in enumerate(self.skeleton): + count = np.zeros((self.output_size[1], self.output_size[0]), + dtype=np.float32) + + for p in joints: + src = p[sk[0]] + dst = p[sk[1]] + if src[2] > 0 and dst[2] > 0: + self._accumulate_paf_map_(pafs[2 * idx:2 * idx + 2], + src[:2], dst[:2], count) + + pafs[2 * idx:2 * idx + 2] /= np.maximum(count, 1) + + return pafs + + +@PIPELINES.register_module() +class BottomUpRandomFlip: + """Data augmentation with random image flip for bottom-up. + + Args: + flip_prob (float): Probability of flip. + """ + + def __init__(self, flip_prob=0.5): + self.flip_prob = flip_prob + + def __call__(self, results): + """Perform data augmentation with random image flip.""" + image, mask, joints = results['img'], results['mask'], results[ + 'joints'] + self.flip_index = results['ann_info']['flip_index'] + self.output_size = results['ann_info']['heatmap_size'] + + assert isinstance(mask, list) + assert isinstance(joints, list) + assert len(mask) == len(joints) + assert len(mask) == len(self.output_size) + + if np.random.random() < self.flip_prob: + image = image[:, ::-1].copy() - np.zeros_like(image) + for i, _output_size in enumerate(self.output_size): + if not isinstance(_output_size, np.ndarray): + _output_size = np.array(_output_size) + if _output_size.size > 1: + assert len(_output_size) == 2 + else: + _output_size = np.array([_output_size, _output_size], + dtype=np.int) + mask[i] = mask[i][:, ::-1].copy() + joints[i] = joints[i][:, self.flip_index] + joints[i][:, :, 0] = _output_size[0] - joints[i][:, :, 0] - 1 + results['img'], results['mask'], results[ + 'joints'] = image, mask, joints + return results + + +@PIPELINES.register_module() +class BottomUpRandomAffine: + """Data augmentation with random scaling & rotating. + + Args: + rot_factor (int): Rotating to [-rotation_factor, rotation_factor] + scale_factor (float): Scaling to [1-scale_factor, 1+scale_factor] + scale_type: wrt ``long`` or ``short`` length of the image. + trans_factor: Translation factor. + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). 
+ """ + + def __init__(self, + rot_factor, + scale_factor, + scale_type, + trans_factor, + use_udp=False): + self.max_rotation = rot_factor + self.min_scale = scale_factor[0] + self.max_scale = scale_factor[1] + self.scale_type = scale_type + self.trans_factor = trans_factor + self.use_udp = use_udp + + def _get_scale(self, image_size, resized_size): + w, h = image_size + w_resized, h_resized = resized_size + if w / w_resized < h / h_resized: + if self.scale_type == 'long': + w_pad = h / h_resized * w_resized + h_pad = h + elif self.scale_type == 'short': + w_pad = w + h_pad = w / w_resized * h_resized + else: + raise ValueError(f'Unknown scale type: {self.scale_type}') + else: + if self.scale_type == 'long': + w_pad = w + h_pad = w / w_resized * h_resized + elif self.scale_type == 'short': + w_pad = h / h_resized * w_resized + h_pad = h + else: + raise ValueError(f'Unknown scale type: {self.scale_type}') + + scale = np.array([w_pad, h_pad], dtype=np.float32) + + return scale + + def __call__(self, results): + """Perform data augmentation with random scaling & rotating.""" + image, mask, joints = results['img'], results['mask'], results[ + 'joints'] + + self.input_size = results['ann_info']['image_size'] + if not isinstance(self.input_size, np.ndarray): + self.input_size = np.array(self.input_size) + if self.input_size.size > 1: + assert len(self.input_size) == 2 + else: + self.input_size = [self.input_size, self.input_size] + self.output_size = results['ann_info']['heatmap_size'] + + assert isinstance(mask, list) + assert isinstance(joints, list) + assert len(mask) == len(joints) + assert len(mask) == len(self.output_size), (len(mask), + len(self.output_size), + self.output_size) + + height, width = image.shape[:2] + if self.use_udp: + center = np.array(((width - 1.0) / 2, (height - 1.0) / 2)) + else: + center = np.array((width / 2, height / 2)) + + img_scale = np.array([width, height], dtype=np.float32) + aug_scale = np.random.random() * (self.max_scale - self.min_scale) \ + + self.min_scale + img_scale *= aug_scale + aug_rot = (np.random.random() * 2 - 1) * self.max_rotation + + if self.trans_factor > 0: + dx = np.random.randint(-self.trans_factor * img_scale[0] / 200.0, + self.trans_factor * img_scale[0] / 200.0) + dy = np.random.randint(-self.trans_factor * img_scale[1] / 200.0, + self.trans_factor * img_scale[1] / 200.0) + + center[0] += dx + center[1] += dy + if self.use_udp: + for i, _output_size in enumerate(self.output_size): + if not isinstance(_output_size, np.ndarray): + _output_size = np.array(_output_size) + if _output_size.size > 1: + assert len(_output_size) == 2 + else: + _output_size = [_output_size, _output_size] + + scale = self._get_scale(img_scale, _output_size) + + trans = get_warp_matrix( + theta=aug_rot, + size_input=center * 2.0, + size_dst=np.array( + (_output_size[0], _output_size[1]), dtype=np.float32) - + 1.0, + size_target=scale) + mask[i] = cv2.warpAffine( + (mask[i] * 255).astype(np.uint8), + trans, (int(_output_size[0]), int(_output_size[1])), + flags=cv2.INTER_LINEAR) / 255 + mask[i] = (mask[i] > 0.5).astype(np.float32) + joints[i][:, :, 0:2] = \ + warp_affine_joints(joints[i][:, :, 0:2].copy(), trans) + if results['ann_info']['scale_aware_sigma']: + joints[i][:, :, 3] = joints[i][:, :, 3] / aug_scale + scale = self._get_scale(img_scale, self.input_size) + mat_input = get_warp_matrix( + theta=aug_rot, + size_input=center * 2.0, + size_dst=np.array((self.input_size[0], self.input_size[1]), + dtype=np.float32) - 1.0, + size_target=scale) + image = 
cv2.warpAffine( + image, + mat_input, (int(self.input_size[0]), int(self.input_size[1])), + flags=cv2.INTER_LINEAR) + else: + for i, _output_size in enumerate(self.output_size): + if not isinstance(_output_size, np.ndarray): + _output_size = np.array(_output_size) + if _output_size.size > 1: + assert len(_output_size) == 2 + else: + _output_size = [_output_size, _output_size] + scale = self._get_scale(img_scale, _output_size) + mat_output = get_affine_transform( + center=center, + scale=scale / 200.0, + rot=aug_rot, + output_size=_output_size) + mask[i] = cv2.warpAffine( + (mask[i] * 255).astype(np.uint8), mat_output, + (int(_output_size[0]), int(_output_size[1]))) / 255 + mask[i] = (mask[i] > 0.5).astype(np.float32) + + joints[i][:, :, 0:2] = \ + warp_affine_joints(joints[i][:, :, 0:2], mat_output) + if results['ann_info']['scale_aware_sigma']: + joints[i][:, :, 3] = joints[i][:, :, 3] / aug_scale + + scale = self._get_scale(img_scale, self.input_size) + mat_input = get_affine_transform( + center=center, + scale=scale / 200.0, + rot=aug_rot, + output_size=self.input_size) + image = cv2.warpAffine(image, mat_input, (int( + self.input_size[0]), int(self.input_size[1]))) + + results['img'], results['mask'], results[ + 'joints'] = image, mask, joints + + return results + + +@PIPELINES.register_module() +class BottomUpGenerateHeatmapTarget: + """Generate multi-scale heatmap target for bottom-up. + + Args: + sigma (int): Sigma of heatmap Gaussian + max_num_people (int): Maximum number of people in an image + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + """ + + def __init__(self, sigma, use_udp=False): + self.sigma = sigma + self.use_udp = use_udp + + def _generate(self, num_joints, heatmap_size): + """Get heatmap generator.""" + heatmap_generator = [ + HeatmapGenerator(output_size, num_joints, self.sigma, self.use_udp) + for output_size in heatmap_size + ] + return heatmap_generator + + def __call__(self, results): + """Generate multi-scale heatmap target for bottom-up.""" + heatmap_generator = \ + self._generate(results['ann_info']['num_joints'], + results['ann_info']['heatmap_size']) + target_list = list() + joints_list = results['joints'] + + for scale_id in range(results['ann_info']['num_scales']): + heatmaps = heatmap_generator[scale_id](joints_list[scale_id]) + target_list.append(heatmaps.astype(np.float32)) + results['target'] = target_list + + return results + + +@PIPELINES.register_module() +class BottomUpGenerateTarget: + """Generate multi-scale heatmap target for associate embedding. + + Args: + sigma (int): Sigma of heatmap Gaussian + max_num_people (int): Maximum number of people in an image + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). 
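+
+ Required keys: 'joints', 'mask', 'ann_info'.
+ Modified keys: 'targets', 'masks', 'joints'.
+
+ Example (values are illustrative)::
+
+ dict(type='BottomUpGenerateTarget', sigma=2, max_num_people=30)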
+ """ + + def __init__(self, sigma, max_num_people, use_udp=False): + self.sigma = sigma + self.max_num_people = max_num_people + self.use_udp = use_udp + + def _generate(self, num_joints, heatmap_size): + """Get heatmap generator and joint encoder.""" + heatmap_generator = [ + HeatmapGenerator(output_size, num_joints, self.sigma, self.use_udp) + for output_size in heatmap_size + ] + joints_encoder = [ + JointsEncoder(self.max_num_people, num_joints, output_size, True) + for output_size in heatmap_size + ] + return heatmap_generator, joints_encoder + + def __call__(self, results): + """Generate multi-scale heatmap target for bottom-up.""" + heatmap_generator, joints_encoder = \ + self._generate(results['ann_info']['num_joints'], + results['ann_info']['heatmap_size']) + target_list = list() + mask_list, joints_list = results['mask'], results['joints'] + + for scale_id in range(results['ann_info']['num_scales']): + target_t = heatmap_generator[scale_id](joints_list[scale_id]) + joints_t = joints_encoder[scale_id](joints_list[scale_id]) + + target_list.append(target_t.astype(np.float32)) + mask_list[scale_id] = mask_list[scale_id].astype(np.float32) + joints_list[scale_id] = joints_t.astype(np.int32) + + results['masks'], results['joints'] = mask_list, joints_list + results['targets'] = target_list + + return results + + +@PIPELINES.register_module() +class BottomUpGeneratePAFTarget: + """Generate multi-scale heatmaps and part affinity fields (PAF) target for + bottom-up. Paper ref: Cao et al. Realtime Multi-Person 2D Human Pose + Estimation using Part Affinity Fields (CVPR 2017). + + Args: + limb_width (int): Limb width of part affinity fields + """ + + def __init__(self, limb_width, skeleton=None): + self.limb_width = limb_width + self.skeleton = skeleton + + def _generate(self, heatmap_size, skeleton): + """Get PAF generator.""" + paf_generator = [ + PAFGenerator(output_size, self.limb_width, skeleton) + for output_size in heatmap_size + ] + return paf_generator + + def __call__(self, results): + """Generate multi-scale part affinity fields for bottom-up.""" + if self.skeleton is None: + assert results['ann_info']['skeleton'] is not None + self.skeleton = results['ann_info']['skeleton'] + + paf_generator = \ + self._generate(results['ann_info']['heatmap_size'], + self.skeleton) + target_list = list() + joints_list = results['joints'] + + for scale_id in range(results['ann_info']['num_scales']): + pafs = paf_generator[scale_id](joints_list[scale_id]) + target_list.append(pafs.astype(np.float32)) + + results['target'] = target_list + + return results + + +@PIPELINES.register_module() +class BottomUpGetImgSize: + """Get multi-scale image sizes for bottom-up, including base_size and + test_scale_factor. Keep the ratio and the image is resized to + `results['ann_info']['image_size']×current_scale`. + + Args: + test_scale_factor (List[float]): Multi scale + current_scale (int): default 1 + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). 
+ """ + + def __init__(self, test_scale_factor, current_scale=1, use_udp=False): + self.test_scale_factor = test_scale_factor + self.min_scale = min(test_scale_factor) + self.current_scale = current_scale + self.use_udp = use_udp + + def __call__(self, results): + """Get multi-scale image sizes for bottom-up.""" + input_size = results['ann_info']['image_size'] + if not isinstance(input_size, np.ndarray): + input_size = np.array(input_size) + if input_size.size > 1: + assert len(input_size) == 2 + else: + input_size = np.array([input_size, input_size], dtype=np.int) + img = results['img'] + + h, w, _ = img.shape + + # calculate the size for min_scale + min_input_w = _ceil_to_multiples_of(self.min_scale * input_size[0], 64) + min_input_h = _ceil_to_multiples_of(self.min_scale * input_size[1], 64) + if w < h: + w_resized = int(min_input_w * self.current_scale / self.min_scale) + h_resized = int( + _ceil_to_multiples_of(min_input_w / w * h, 64) * + self.current_scale / self.min_scale) + if self.use_udp: + scale_w = w - 1.0 + scale_h = (h_resized - 1.0) / (w_resized - 1.0) * (w - 1.0) + else: + scale_w = w / 200.0 + scale_h = h_resized / w_resized * w / 200.0 + else: + h_resized = int(min_input_h * self.current_scale / self.min_scale) + w_resized = int( + _ceil_to_multiples_of(min_input_h / h * w, 64) * + self.current_scale / self.min_scale) + if self.use_udp: + scale_h = h - 1.0 + scale_w = (w_resized - 1.0) / (h_resized - 1.0) * (h - 1.0) + else: + scale_h = h / 200.0 + scale_w = w_resized / h_resized * h / 200.0 + if self.use_udp: + center = (scale_w / 2.0, scale_h / 2.0) + else: + center = np.array([round(w / 2.0), round(h / 2.0)]) + results['ann_info']['test_scale_factor'] = self.test_scale_factor + results['ann_info']['base_size'] = (w_resized, h_resized) + results['ann_info']['center'] = center + results['ann_info']['scale'] = np.array([scale_w, scale_h]) + + return results + + +@PIPELINES.register_module() +class BottomUpResizeAlign: + """Resize multi-scale size and align transform for bottom-up. + + Args: + transforms (List): ToTensor & Normalize + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). 
+ """ + + def __init__(self, transforms, use_udp=False): + self.transforms = Compose(transforms) + if use_udp: + self._resize_align_multi_scale = _resize_align_multi_scale_udp + else: + self._resize_align_multi_scale = _resize_align_multi_scale + + def __call__(self, results): + """Resize multi-scale size and align transform for bottom-up.""" + input_size = results['ann_info']['image_size'] + if not isinstance(input_size, np.ndarray): + input_size = np.array(input_size) + if input_size.size > 1: + assert len(input_size) == 2 + else: + input_size = np.array([input_size, input_size], dtype=np.int) + test_scale_factor = results['ann_info']['test_scale_factor'] + aug_data = [] + + for _, s in enumerate(sorted(test_scale_factor, reverse=True)): + _results = results.copy() + image_resized, _, _ = self._resize_align_multi_scale( + _results['img'], input_size, s, min(test_scale_factor)) + _results['img'] = image_resized + _results = self.transforms(_results) + transformed_img = _results['img'].unsqueeze(0) + aug_data.append(transformed_img) + + results['ann_info']['aug_data'] = aug_data + + return results diff --git a/mmpose/datasets/pipelines/hand_transform.py b/mmpose/datasets/pipelines/hand_transform.py new file mode 100644 index 0000000..b83e399 --- /dev/null +++ b/mmpose/datasets/pipelines/hand_transform.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + +from mmpose.datasets.builder import PIPELINES +from .top_down_transform import TopDownRandomFlip + + +@PIPELINES.register_module() +class HandRandomFlip(TopDownRandomFlip): + """Data augmentation with random image flip. A child class of + TopDownRandomFlip. + + Required keys: 'img', 'joints_3d', 'joints_3d_visible', 'center', + 'hand_type', 'rel_root_depth' and 'ann_info'. + + Modifies key: 'img', 'joints_3d', 'joints_3d_visible', 'center', + 'hand_type', 'rel_root_depth'. + + Args: + flip_prob (float): Probability of flip. + """ + + def __call__(self, results): + """Perform data augmentation with random image flip.""" + # base flip augmentation + super().__call__(results) + + # flip hand type and root depth + hand_type = results['hand_type'] + rel_root_depth = results['rel_root_depth'] + flipped = results['flipped'] + if flipped: + hand_type[0], hand_type[1] = hand_type[1], hand_type[0] + rel_root_depth = -rel_root_depth + results['hand_type'] = hand_type + results['rel_root_depth'] = rel_root_depth + return results + + +@PIPELINES.register_module() +class HandGenerateRelDepthTarget: + """Generate the target relative root depth. + + Required keys: 'rel_root_depth', 'rel_root_valid', 'ann_info'. + + Modified keys: 'target', 'target_weight'. + """ + + def __init__(self): + pass + + def __call__(self, results): + """Generate the target heatmap.""" + rel_root_depth = results['rel_root_depth'] + rel_root_valid = results['rel_root_valid'] + cfg = results['ann_info'] + D = cfg['heatmap_size_root'] + root_depth_bound = cfg['root_depth_bound'] + target = (rel_root_depth / root_depth_bound + 0.5) * D + target_weight = rel_root_valid * (target >= 0) * (target <= D) + results['target'] = target * np.ones(1, dtype=np.float32) + results['target_weight'] = target_weight * np.ones(1, dtype=np.float32) + return results diff --git a/mmpose/datasets/pipelines/loading.py b/mmpose/datasets/pipelines/loading.py new file mode 100644 index 0000000..6475005 --- /dev/null +++ b/mmpose/datasets/pipelines/loading.py @@ -0,0 +1,91 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import mmcv +import numpy as np + +from ..builder import PIPELINES + + +@PIPELINES.register_module() +class LoadImageFromFile: + """Loading image(s) from file. + + Required key: "image_file". + + Added key: "img". + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + color_type (str): Flags specifying the color type of a loaded image, + candidates are 'color', 'grayscale' and 'unchanged'. + channel_order (str): Order of channel, candidates are 'bgr' and 'rgb'. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. + """ + + def __init__(self, + to_float32=False, + color_type='color', + channel_order='rgb', + file_client_args=dict(backend='disk')): + self.to_float32 = to_float32 + self.color_type = color_type + self.channel_order = channel_order + self.file_client_args = file_client_args.copy() + self.file_client = None + + def _read_image(self, path): + img_bytes = self.file_client.get(path) + img = mmcv.imfrombytes( + img_bytes, flag=self.color_type, channel_order=self.channel_order) + if img is None: + raise ValueError(f'Fail to read {path}') + if self.to_float32: + img = img.astype(np.float32) + return img + + def __call__(self, results): + """Loading image(s) from file.""" + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + image_file = results.get('image_file', None) + + if isinstance(image_file, (list, tuple)): + # Load images from a list of paths + results['img'] = [self._read_image(path) for path in image_file] + elif image_file is not None: + # Load single image from path + results['img'] = self._read_image(image_file) + else: + if 'img' not in results: + # If `image_file`` is not in results, check the `img` exists + # and format the image. This for compatibility when the image + # is manually set outside the pipeline. + raise KeyError('Either `image_file` or `img` should exist in ' + 'results.') + assert isinstance(results['img'], np.ndarray) + if self.color_type == 'color' and self.channel_order == 'rgb': + # The original results['img'] is assumed to be image(s) in BGR + # order, so we convert the color according to the arguments. + if results['img'].ndim == 3: + results['img'] = mmcv.bgr2rgb(results['img']) + elif results['img'].ndim == 4: + results['img'] = np.concatenate( + [mmcv.bgr2rgb(img) for img in results['img']], axis=0) + else: + raise ValueError('results["img"] has invalid shape ' + f'{results["img"].shape}') + + results['image_file'] = None + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32}, ' + f"color_type='{self.color_type}', " + f'file_client_args={self.file_client_args})') + return repr_str diff --git a/mmpose/datasets/pipelines/mesh_transform.py b/mmpose/datasets/pipelines/mesh_transform.py new file mode 100644 index 0000000..e3f32fe --- /dev/null +++ b/mmpose/datasets/pipelines/mesh_transform.py @@ -0,0 +1,399 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import cv2 +import mmcv +import numpy as np +import torch + +from mmpose.core.post_processing import (affine_transform, fliplr_joints, + get_affine_transform) +from mmpose.datasets.builder import PIPELINES + + +def _flip_smpl_pose(pose): + """Flip SMPL pose parameters horizontally. 
+ + Args: + pose (np.ndarray([72])): SMPL pose parameters + + Returns: + pose_flipped + """ + + flippedParts = [ + 0, 1, 2, 6, 7, 8, 3, 4, 5, 9, 10, 11, 15, 16, 17, 12, 13, 14, 18, 19, + 20, 24, 25, 26, 21, 22, 23, 27, 28, 29, 33, 34, 35, 30, 31, 32, 36, 37, + 38, 42, 43, 44, 39, 40, 41, 45, 46, 47, 51, 52, 53, 48, 49, 50, 57, 58, + 59, 54, 55, 56, 63, 64, 65, 60, 61, 62, 69, 70, 71, 66, 67, 68 + ] + pose_flipped = pose[flippedParts] + # Negate the second and the third dimension of the axis-angle + pose_flipped[1::3] = -pose_flipped[1::3] + pose_flipped[2::3] = -pose_flipped[2::3] + return pose_flipped + + +def _flip_iuv(iuv, uv_type='BF'): + """Flip IUV image horizontally. + + Note: + IUV image height: H + IUV image width: W + + Args: + iuv np.ndarray([H, W, 3]): IUV image + uv_type (str): The type of the UV map. + Candidate values: + 'DP': The UV map used in DensePose project. + 'SMPL': The default UV map of SMPL model. + 'BF': The UV map used in DecoMR project. + Default: 'BF' + + Returns: + iuv_flipped np.ndarray([H, W, 3]): Flipped IUV image + """ + assert uv_type in ['DP', 'SMPL', 'BF'] + if uv_type == 'BF': + iuv_flipped = iuv[:, ::-1, :] + iuv_flipped[:, :, 1] = 255 - iuv_flipped[:, :, 1] + else: + # The flip of other UV map is complex, not finished yet. + raise NotImplementedError( + f'The flip of {uv_type} UV map is not implemented yet.') + + return iuv_flipped + + +def _construct_rotation_matrix(rot, size=3): + """Construct the in-plane rotation matrix. + + Args: + rot (float): Rotation angle (degree). + size (int): The size of the rotation matrix. + Candidate Values: 2, 3. Defaults to 3. + + Returns: + rot_mat (np.ndarray([size, size]): Rotation matrix. + """ + rot_mat = np.eye(size, dtype=np.float32) + if rot != 0: + rot_rad = np.deg2rad(rot) + sn, cs = np.sin(rot_rad), np.cos(rot_rad) + rot_mat[0, :2] = [cs, -sn] + rot_mat[1, :2] = [sn, cs] + + return rot_mat + + +def _rotate_joints_3d(joints_3d, rot): + """Rotate the 3D joints in the local coordinates. + + Note: + Joints number: K + + Args: + joints_3d (np.ndarray([K, 3])): Coordinates of keypoints. + rot (float): Rotation angle (degree). + + Returns: + joints_3d_rotated + """ + # in-plane rotation + # 3D joints are rotated counterclockwise, + # so the rot angle is inversed. + rot_mat = _construct_rotation_matrix(-rot, 3) + + joints_3d_rotated = np.einsum('ij,kj->ki', rot_mat, joints_3d) + joints_3d_rotated = joints_3d_rotated.astype('float32') + return joints_3d_rotated + + +def _rotate_smpl_pose(pose, rot): + """Rotate SMPL pose parameters. SMPL (https://smpl.is.tue.mpg.de/) is a 3D + human model. + + Args: + pose (np.ndarray([72])): SMPL pose parameters + rot (float): Rotation angle (degree). + + Returns: + pose_rotated + """ + pose_rotated = pose.copy() + if rot != 0: + rot_mat = _construct_rotation_matrix(-rot) + orient = pose[:3] + # find the rotation of the body in camera frame + per_rdg, _ = cv2.Rodrigues(orient) + # apply the global rotation to the global orientation + res_rot, _ = cv2.Rodrigues(np.dot(rot_mat, per_rdg)) + pose_rotated[:3] = (res_rot.T)[0] + + return pose_rotated + + +def _flip_joints_3d(joints_3d, joints_3d_visible, flip_pairs): + """Flip human joints in 3D space horizontally. + + Note: + num_keypoints: K + + Args: + joints_3d (np.ndarray([K, 3])): Coordinates of keypoints. + joints_3d_visible (np.ndarray([K, 1])): Visibility of keypoints. + flip_pairs (list[tuple()]): Pairs of keypoints which are mirrored + (for example, left ear -- right ear). 
+ + Returns: + joints_3d_flipped, joints_3d_visible_flipped + """ + + assert len(joints_3d) == len(joints_3d_visible) + + joints_3d_flipped = joints_3d.copy() + joints_3d_visible_flipped = joints_3d_visible.copy() + + # Swap left-right parts + for left, right in flip_pairs: + joints_3d_flipped[left, :] = joints_3d[right, :] + joints_3d_flipped[right, :] = joints_3d[left, :] + + joints_3d_visible_flipped[left, :] = joints_3d_visible[right, :] + joints_3d_visible_flipped[right, :] = joints_3d_visible[left, :] + + # Flip horizontally + joints_3d_flipped[:, 0] = -joints_3d_flipped[:, 0] + joints_3d_flipped = joints_3d_flipped * joints_3d_visible_flipped + + return joints_3d_flipped, joints_3d_visible_flipped + + +@PIPELINES.register_module() +class LoadIUVFromFile: + """Loading IUV image from file.""" + + def __init__(self, to_float32=False): + self.to_float32 = to_float32 + self.color_type = 'color' + # channel relations: iuv->bgr + self.channel_order = 'bgr' + + def __call__(self, results): + """Loading image from file.""" + has_iuv = results['has_iuv'] + use_iuv = results['ann_info']['use_IUV'] + if has_iuv and use_iuv: + iuv_file = results['iuv_file'] + iuv = mmcv.imread(iuv_file, self.color_type, self.channel_order) + if iuv is None: + raise ValueError(f'Fail to read {iuv_file}') + else: + has_iuv = 0 + iuv = None + + results['has_iuv'] = has_iuv + results['iuv'] = iuv + return results + + +@PIPELINES.register_module() +class IUVToTensor: + """Transform IUV image to part index mask and uv coordinates image. The 3 + channels of IUV image means: part index, u coordinates, v coordinates. + + Required key: 'iuv', 'ann_info'. + Modifies key: 'part_index', 'uv_coordinates'. + + Args: + results (dict): contain all information about training. + """ + + def __call__(self, results): + iuv = results['iuv'] + if iuv is None: + H, W = results['ann_info']['iuv_size'] + part_index = torch.zeros([1, H, W], dtype=torch.long) + uv_coordinates = torch.zeros([2, H, W], dtype=torch.float32) + else: + part_index = torch.LongTensor(iuv[:, :, 0])[None, :, :] + uv_coordinates = torch.FloatTensor(iuv[:, :, 1:]) / 255 + uv_coordinates = uv_coordinates.permute(2, 0, 1) + results['part_index'] = part_index + results['uv_coordinates'] = uv_coordinates + return results + + +@PIPELINES.register_module() +class MeshRandomChannelNoise: + """Data augmentation with random channel noise. + + Required keys: 'img' + Modifies key: 'img' + + Args: + noise_factor (float): Multiply each channel with + a factor between``[1-scale_factor, 1+scale_factor]`` + """ + + def __init__(self, noise_factor=0.4): + self.noise_factor = noise_factor + + def __call__(self, results): + """Perform data augmentation with random channel noise.""" + img = results['img'] + + # Each channel is multiplied with a number + # in the area [1-self.noise_factor, 1+self.noise_factor] + pn = np.random.uniform(1 - self.noise_factor, 1 + self.noise_factor, + (1, 3)) + img = cv2.multiply(img, pn) + + results['img'] = img + return results + + +@PIPELINES.register_module() +class MeshRandomFlip: + """Data augmentation with random image flip. + + Required keys: 'img', 'joints_2d','joints_2d_visible', 'joints_3d', + 'joints_3d_visible', 'center', 'pose', 'iuv' and 'ann_info'. + Modifies key: 'img', 'joints_2d','joints_2d_visible', 'joints_3d', + 'joints_3d_visible', 'center', 'pose', 'iuv'. + + Args: + flip_prob (float): Probability of flip. 
+ """ + + def __init__(self, flip_prob=0.5): + self.flip_prob = flip_prob + + def __call__(self, results): + """Perform data augmentation with random image flip.""" + if np.random.rand() > self.flip_prob: + return results + + img = results['img'] + joints_2d = results['joints_2d'] + joints_2d_visible = results['joints_2d_visible'] + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + pose = results['pose'] + center = results['center'] + + img = img[:, ::-1, :] + pose = _flip_smpl_pose(pose) + + joints_2d, joints_2d_visible = fliplr_joints( + joints_2d, joints_2d_visible, img.shape[1], + results['ann_info']['flip_pairs']) + + joints_3d, joints_3d_visible = _flip_joints_3d( + joints_3d, joints_3d_visible, results['ann_info']['flip_pairs']) + center[0] = img.shape[1] - center[0] - 1 + + if 'iuv' in results.keys(): + iuv = results['iuv'] + if iuv is not None: + iuv = _flip_iuv(iuv, results['ann_info']['uv_type']) + results['iuv'] = iuv + + results['img'] = img + results['joints_2d'] = joints_2d + results['joints_2d_visible'] = joints_2d_visible + results['joints_3d'] = joints_3d + results['joints_3d_visible'] = joints_3d_visible + results['pose'] = pose + results['center'] = center + return results + + +@PIPELINES.register_module() +class MeshGetRandomScaleRotation: + """Data augmentation with random scaling & rotating. + + Required key: 'scale'. Modifies key: 'scale' and 'rotation'. + + Args: + rot_factor (int): Rotating to ``[-2*rot_factor, 2*rot_factor]``. + scale_factor (float): Scaling to ``[1-scale_factor, 1+scale_factor]``. + rot_prob (float): Probability of random rotation. + """ + + def __init__(self, rot_factor=30, scale_factor=0.25, rot_prob=0.6): + self.rot_factor = rot_factor + self.scale_factor = scale_factor + self.rot_prob = rot_prob + + def __call__(self, results): + """Perform data augmentation with random scaling & rotating.""" + s = results['scale'] + + sf = self.scale_factor + rf = self.rot_factor + + s_factor = np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) + s = s * s_factor + + r_factor = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) + r = r_factor if np.random.rand() <= self.rot_prob else 0 + + results['scale'] = s + results['rotation'] = r + + return results + + +@PIPELINES.register_module() +class MeshAffine: + """Affine transform the image to get input image. Affine transform the 2D + keypoints, 3D kepoints and IUV image too. + + Required keys: 'img', 'joints_2d','joints_2d_visible', 'joints_3d', + 'joints_3d_visible', 'pose', 'iuv', 'ann_info','scale', 'rotation' and + 'center'. Modifies key: 'img', 'joints_2d','joints_2d_visible', + 'joints_3d', 'pose', 'iuv'. 
+ """ + + def __call__(self, results): + image_size = results['ann_info']['image_size'] + + img = results['img'] + joints_2d = results['joints_2d'] + joints_2d_visible = results['joints_2d_visible'] + joints_3d = results['joints_3d'] + pose = results['pose'] + + c = results['center'] + s = results['scale'] + r = results['rotation'] + trans = get_affine_transform(c, s, r, image_size) + + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + + for i in range(results['ann_info']['num_joints']): + if joints_2d_visible[i, 0] > 0.0: + joints_2d[i] = affine_transform(joints_2d[i], trans) + + joints_3d = _rotate_joints_3d(joints_3d, r) + pose = _rotate_smpl_pose(pose, r) + + results['img'] = img + results['joints_2d'] = joints_2d + results['joints_2d_visible'] = joints_2d_visible + results['joints_3d'] = joints_3d + results['pose'] = pose + + if 'iuv' in results.keys(): + iuv = results['iuv'] + if iuv is not None: + iuv_size = results['ann_info']['iuv_size'] + iuv = cv2.warpAffine( + iuv, + trans, (int(iuv_size[0]), int(iuv_size[1])), + flags=cv2.INTER_NEAREST) + results['iuv'] = iuv + + return results diff --git a/mmpose/datasets/pipelines/pose3d_transform.py b/mmpose/datasets/pipelines/pose3d_transform.py new file mode 100644 index 0000000..1249378 --- /dev/null +++ b/mmpose/datasets/pipelines/pose3d_transform.py @@ -0,0 +1,643 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import mmcv +import numpy as np +import torch +from mmcv.utils import build_from_cfg + +from mmpose.core.camera import CAMERAS +from mmpose.core.post_processing import fliplr_regression +from mmpose.datasets.builder import PIPELINES + + +@PIPELINES.register_module() +class GetRootCenteredPose: + """Zero-center the pose around a given root joint. Optionally, the root + joint can be removed from the original pose and stored as a separate item. + + Note that the root-centered joints may no longer align with some annotation + information (e.g. flip_pairs, num_joints, inference_channel, etc.) due to + the removal of the root joint. + + Args: + item (str): The name of the pose to apply root-centering. + root_index (int): Root joint index in the pose. + visible_item (str): The name of the visibility item. + remove_root (bool): If true, remove the root joint from the pose + root_name (str): Optional. If not none, it will be used as the key to + store the root position separated from the original pose. 
+ + Required keys: + item + + Modified keys: + item, visible_item, root_name + """ + + def __init__(self, + item, + root_index, + visible_item=None, + remove_root=False, + root_name=None): + self.item = item + self.root_index = root_index + self.remove_root = remove_root + self.root_name = root_name + self.visible_item = visible_item + + def __call__(self, results): + assert self.item in results + joints = results[self.item] + root_idx = self.root_index + + assert joints.ndim >= 2 and joints.shape[-2] > root_idx,\ + f'Got invalid joint shape {joints.shape}' + + root = joints[..., root_idx:root_idx + 1, :] + joints = joints - root + + results[self.item] = joints + if self.root_name is not None: + results[self.root_name] = root + + if self.remove_root: + results[self.item] = np.delete( + results[self.item], root_idx, axis=-2) + if self.visible_item is not None: + assert self.visible_item in results + results[self.visible_item] = np.delete( + results[self.visible_item], root_idx, axis=-2) + # Add a flag to avoid latter transforms that rely on the root + # joint or the original joint index + results[f'{self.item}_root_removed'] = True + + # Save the root index which is necessary to restore the global pose + if self.root_name is not None: + results[f'{self.root_name}_index'] = self.root_index + + return results + + +@PIPELINES.register_module() +class NormalizeJointCoordinate: + """Normalize the joint coordinate with given mean and std. + + Args: + item (str): The name of the pose to normalize. + mean (array): Mean values of joint coordinates in shape [K, C]. + std (array): Std values of joint coordinates in shape [K, C]. + norm_param_file (str): Optionally load a dict containing `mean` and + `std` from a file using `mmcv.load`. + + Required keys: + item + + Modified keys: + item + """ + + def __init__(self, item, mean=None, std=None, norm_param_file=None): + self.item = item + self.norm_param_file = norm_param_file + if norm_param_file is not None: + norm_param = mmcv.load(norm_param_file) + assert 'mean' in norm_param and 'std' in norm_param + mean = norm_param['mean'] + std = norm_param['std'] + else: + assert mean is not None + assert std is not None + + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + + def __call__(self, results): + assert self.item in results + results[self.item] = (results[self.item] - self.mean) / self.std + results[f'{self.item}_mean'] = self.mean.copy() + results[f'{self.item}_std'] = self.std.copy() + return results + + +@PIPELINES.register_module() +class ImageCoordinateNormalization: + """Normalize the 2D joint coordinate with image width and height. Range [0, + w] is mapped to [-1, 1], while preserving the aspect ratio. + + Args: + item (str|list[str]): The name of the pose to normalize. + norm_camera (bool): Whether to normalize camera intrinsics. + Default: False. + camera_param (dict|None): The camera parameter dict. See the camera + class definition for more details. If None is given, the camera + parameter will be obtained during processing of each data sample + with the key "camera_param". 
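+
+ Example (values are illustrative): for an image of width 1000 and
+ height 1000, a joint at pixel ``(1000, 500)`` is mapped to
+ ``(1.0, 0.0)``; both axes are divided by ``0.5 * image_width``, so the
+ aspect ratio is preserved.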
+ + Required keys: + item + + Modified keys: + item (, camera_param) + """ + + def __init__(self, item, norm_camera=False, camera_param=None): + self.item = item + if isinstance(self.item, str): + self.item = [self.item] + + self.norm_camera = norm_camera + + if camera_param is None: + self.static_camera = False + else: + self.static_camera = True + self.camera_param = camera_param + + def __call__(self, results): + center = np.array( + [0.5 * results['image_width'], 0.5 * results['image_height']], + dtype=np.float32) + scale = np.array(0.5 * results['image_width'], dtype=np.float32) + + for item in self.item: + results[item] = (results[item] - center) / scale + + if self.norm_camera: + if self.static_camera: + camera_param = copy.deepcopy(self.camera_param) + else: + assert 'camera_param' in results, \ + 'Camera parameters are missing.' + camera_param = results['camera_param'] + assert 'f' in camera_param and 'c' in camera_param + camera_param['f'] = camera_param['f'] / scale + camera_param['c'] = (camera_param['c'] - center[:, None]) / scale + if 'camera_param' not in results: + results['camera_param'] = dict() + results['camera_param'].update(camera_param) + + return results + + +@PIPELINES.register_module() +class CollectCameraIntrinsics: + """Store camera intrinsics in a 1-dim array, including f, c, k, p. + + Args: + camera_param (dict|None): The camera parameter dict. See the camera + class definition for more details. If None is given, the camera + parameter will be obtained during processing of each data sample + with the key "camera_param". + need_distortion (bool): Whether need distortion parameters k and p. + Default: True. + + Required keys: + camera_param (if camera parameters are not given in initialization) + + Modified keys: + intrinsics + """ + + def __init__(self, camera_param=None, need_distortion=True): + if camera_param is None: + self.static_camera = False + else: + self.static_camera = True + self.camera_param = camera_param + self.need_distortion = need_distortion + + def __call__(self, results): + if self.static_camera: + camera_param = copy.deepcopy(self.camera_param) + else: + assert 'camera_param' in results, 'Camera parameters are missing.' + camera_param = results['camera_param'] + assert 'f' in camera_param and 'c' in camera_param + intrinsics = np.concatenate( + [camera_param['f'].reshape(2), camera_param['c'].reshape(2)]) + if self.need_distortion: + assert 'k' in camera_param and 'p' in camera_param + intrinsics = np.concatenate([ + intrinsics, camera_param['k'].reshape(3), + camera_param['p'].reshape(2) + ]) + results['intrinsics'] = intrinsics + + return results + + +@PIPELINES.register_module() +class CameraProjection: + """Apply camera projection to joint coordinates. + + Args: + item (str): The name of the pose to apply camera projection. + mode (str): The type of camera projection, supported options are + + - world_to_camera + - world_to_pixel + - camera_to_world + - camera_to_pixel + output_name (str|None): The name of the projected pose. If None + (default) is given, the projected pose will be stored in place. + camera_type (str): The camera class name (should be registered in + CAMERA). + camera_param (dict|None): The camera parameter dict. See the camera + class definition for more details. If None is given, the camera + parameter will be obtained during processing of each data sample + with the key "camera_param". 
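+
+ Example (the item names are illustrative placeholders)::
+
+ dict(type='CameraProjection',
+ item='joints_gt',
+ mode='world_to_camera',
+ output_name='joints_gt_camera')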
+ + Required keys: + + - item + - camera_param (if camera parameters are not given in initialization) + + Modified keys: + output_name + """ + + def __init__(self, + item, + mode, + output_name=None, + camera_type='SimpleCamera', + camera_param=None): + self.item = item + self.mode = mode + self.output_name = output_name + self.camera_type = camera_type + allowed_mode = { + 'world_to_camera', + 'world_to_pixel', + 'camera_to_world', + 'camera_to_pixel', + } + if mode not in allowed_mode: + raise ValueError( + f'Got invalid mode: {mode}, allowed modes are {allowed_mode}') + + if camera_param is None: + self.static_camera = False + else: + self.static_camera = True + self.camera = self._build_camera(camera_param) + + def _build_camera(self, param): + cfgs = dict(type=self.camera_type, param=param) + return build_from_cfg(cfgs, CAMERAS) + + def __call__(self, results): + assert self.item in results + joints = results[self.item] + + if self.static_camera: + camera = self.camera + else: + assert 'camera_param' in results, 'Camera parameters are missing.' + camera = self._build_camera(results['camera_param']) + + if self.mode == 'world_to_camera': + output = camera.world_to_camera(joints) + elif self.mode == 'world_to_pixel': + output = camera.world_to_pixel(joints) + elif self.mode == 'camera_to_world': + output = camera.camera_to_world(joints) + elif self.mode == 'camera_to_pixel': + output = camera.camera_to_pixel(joints) + else: + raise NotImplementedError + + output_name = self.output_name + if output_name is None: + output_name = self.item + + results[output_name] = output + return results + + +@PIPELINES.register_module() +class RelativeJointRandomFlip: + """Data augmentation with random horizontal joint flip around a root joint. + + Args: + item (str|list[str]): The name of the pose to flip. + flip_cfg (dict|list[dict]): Configurations of the fliplr_regression + function. It should contain the following arguments: + + - ``center_mode``: The mode to set the center location on the \ + x-axis to flip around. + - ``center_x`` or ``center_index``: Set the x-axis location or \ + the root joint's index to define the flip center. + + Please refer to the docstring of the fliplr_regression function for + more details. + visible_item (str|list[str]): The name of the visibility item which + will be flipped accordingly along with the pose. + flip_prob (float): Probability of flip. + flip_camera (bool): Whether to flip horizontal distortion coefficients. + camera_param (dict|None): The camera parameter dict. See the camera + class definition for more details. If None is given, the camera + parameter will be obtained during processing of each data sample + with the key "camera_param". 
+ + Required keys: + item + + Modified keys: + item (, camera_param) + """ + + def __init__(self, + item, + flip_cfg, + visible_item=None, + flip_prob=0.5, + flip_camera=False, + camera_param=None): + self.item = item + self.flip_cfg = flip_cfg + self.vis_item = visible_item + self.flip_prob = flip_prob + self.flip_camera = flip_camera + if camera_param is None: + self.static_camera = False + else: + self.static_camera = True + self.camera_param = camera_param + + if isinstance(self.item, str): + self.item = [self.item] + if isinstance(self.flip_cfg, dict): + self.flip_cfg = [self.flip_cfg] * len(self.item) + assert len(self.item) == len(self.flip_cfg) + if isinstance(self.vis_item, str): + self.vis_item = [self.vis_item] + + def __call__(self, results): + + if results.get(f'{self.item}_root_removed', False): + raise RuntimeError('The transform RelativeJointRandomFlip should ' + f'not be applied to {self.item} whose root ' + 'joint has been removed and joint indices have ' + 'been changed') + + if np.random.rand() <= self.flip_prob: + + flip_pairs = results['ann_info']['flip_pairs'] + + # flip joint coordinates + for i, item in enumerate(self.item): + assert item in results + joints = results[item] + + joints_flipped = fliplr_regression(joints, flip_pairs, + **self.flip_cfg[i]) + + results[item] = joints_flipped + + # flip joint visibility + for vis_item in self.vis_item: + assert vis_item in results + visible = results[vis_item] + visible_flipped = visible.copy() + for left, right in flip_pairs: + visible_flipped[..., left, :] = visible[..., right, :] + visible_flipped[..., right, :] = visible[..., left, :] + results[vis_item] = visible_flipped + + # flip horizontal distortion coefficients + if self.flip_camera: + if self.static_camera: + camera_param = copy.deepcopy(self.camera_param) + else: + assert 'camera_param' in results, \ + 'Camera parameters are missing.' + camera_param = results['camera_param'] + assert 'c' in camera_param + camera_param['c'][0] *= -1 + + if 'p' in camera_param: + camera_param['p'][0] *= -1 + + if 'camera_param' not in results: + results['camera_param'] = dict() + results['camera_param'].update(camera_param) + + return results + + +@PIPELINES.register_module() +class PoseSequenceToTensor: + """Convert pose sequence from numpy array to Tensor. + + The original pose sequence should have a shape of [T,K,C] or [K,C], where + T is the sequence length, K and C are keypoint number and dimension. The + converted pose sequence will have a shape of [KxC, T]. + + Args: + item (str): The name of the pose sequence + + Required keys: + item + + Modified keys: + item + """ + + def __init__(self, item): + self.item = item + + def __call__(self, results): + assert self.item in results + seq = results[self.item] + + assert isinstance(seq, np.ndarray) + assert seq.ndim in {2, 3} + + if seq.ndim == 2: + seq = seq[None, ...] + + T = seq.shape[0] + seq = seq.transpose(1, 2, 0).reshape(-1, T) + results[self.item] = torch.from_numpy(seq) + + return results + + +@PIPELINES.register_module() +class Generate3DHeatmapTarget: + """Generate the target 3d heatmap. + + Required keys: 'joints_3d', 'joints_3d_visible', 'ann_info'. + Modified keys: 'target', and 'target_weight'. + + Args: + sigma: Sigma of heatmap gaussian. + joint_indices (list): Indices of joints used for heatmap generation. + If None (default) is given, all joints will be used. + max_bound (float): The maximal value of heatmap. 
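+
+ Note (shapes follow the implementation below): with
+ ``ann_info['heatmap_size'] = [64, 64, 64]`` given as (W, H, D), the
+ generated ``target`` has shape ``[num_joints, D, H, W]`` and
+ ``target_weight`` has shape ``[num_joints, 1]``.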
+ """ + + def __init__(self, sigma=2, joint_indices=None, max_bound=1.0): + self.sigma = sigma + self.joint_indices = joint_indices + self.max_bound = max_bound + + def __call__(self, results): + """Generate the target heatmap.""" + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + cfg = results['ann_info'] + image_size = cfg['image_size'] + W, H, D = cfg['heatmap_size'] + heatmap3d_depth_bound = cfg['heatmap3d_depth_bound'] + joint_weights = cfg['joint_weights'] + use_different_joint_weights = cfg['use_different_joint_weights'] + + # select the joints used for target generation + if self.joint_indices is not None: + joints_3d = joints_3d[self.joint_indices, ...] + joints_3d_visible = joints_3d_visible[self.joint_indices, ...] + joint_weights = joint_weights[self.joint_indices, ...] + num_joints = joints_3d.shape[0] + + # get the joint location in heatmap coordinates + mu_x = joints_3d[:, 0] * W / image_size[0] + mu_y = joints_3d[:, 1] * H / image_size[1] + mu_z = (joints_3d[:, 2] / heatmap3d_depth_bound + 0.5) * D + + target = np.zeros([num_joints, D, H, W], dtype=np.float32) + + target_weight = joints_3d_visible[:, 0].astype(np.float32) + target_weight = target_weight * (mu_z >= 0) * (mu_z < D) + if use_different_joint_weights: + target_weight = target_weight * joint_weights + target_weight = target_weight[:, None] + + # only compute the voxel value near the joints location + tmp_size = 3 * self.sigma + + # get neighboring voxels coordinates + x = y = z = np.arange(2 * tmp_size + 1, dtype=np.float32) - tmp_size + zz, yy, xx = np.meshgrid(z, y, x) + xx = xx[None, ...].astype(np.float32) + yy = yy[None, ...].astype(np.float32) + zz = zz[None, ...].astype(np.float32) + mu_x = mu_x[..., None, None, None] + mu_y = mu_y[..., None, None, None] + mu_z = mu_z[..., None, None, None] + xx, yy, zz = xx + mu_x, yy + mu_y, zz + mu_z + + # round the coordinates + xx = xx.round().clip(0, W - 1) + yy = yy.round().clip(0, H - 1) + zz = zz.round().clip(0, D - 1) + + # compute the target value near joints + local_target = \ + np.exp(-((xx - mu_x)**2 + (yy - mu_y)**2 + (zz - mu_z)**2) / + (2 * self.sigma**2)) + + # put the local target value to the full target heatmap + local_size = xx.shape[1] + idx_joints = np.tile( + np.arange(num_joints)[:, None, None, None], + [1, local_size, local_size, local_size]) + idx = np.stack([idx_joints, zz, yy, xx], + axis=-1).astype(int).reshape(-1, 4) + target[idx[:, 0], idx[:, 1], idx[:, 2], + idx[:, 3]] = local_target.reshape(-1) + target = target * self.max_bound + results['target'] = target + results['target_weight'] = target_weight + return results + + +@PIPELINES.register_module() +class GenerateVoxel3DHeatmapTarget: + """Generate the target 3d heatmap. + + Required keys: 'joints_3d', 'joints_3d_visible', 'ann_info_3d'. + Modified keys: 'target', and 'target_weight'. + + Args: + sigma: Sigma of heatmap gaussian (mm). + joint_indices (list): Indices of joints used for heatmap generation. + If None (default) is given, all joints will be used. 
+ """ + + def __init__(self, sigma=200.0, joint_indices=None): + self.sigma = sigma # mm + self.joint_indices = joint_indices + + def __call__(self, results): + """Generate the target heatmap.""" + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + cfg = results['ann_info'] + + num_people = len(joints_3d) + num_joints = joints_3d[0].shape[0] + + if self.joint_indices is not None: + num_joints = len(self.joint_indices) + joint_indices = self.joint_indices + else: + joint_indices = list(range(num_joints)) + + space_size = cfg['space_size'] + space_center = cfg['space_center'] + cube_size = cfg['cube_size'] + grids_x = np.linspace(-space_size[0] / 2, space_size[0] / 2, + cube_size[0]) + space_center[0] + grids_y = np.linspace(-space_size[1] / 2, space_size[1] / 2, + cube_size[1]) + space_center[1] + grids_z = np.linspace(-space_size[2] / 2, space_size[2] / 2, + cube_size[2]) + space_center[2] + + target = np.zeros( + (num_joints, cube_size[0], cube_size[1], cube_size[2]), + dtype=np.float32) + + for n in range(num_people): + for idx, joint_id in enumerate(joint_indices): + mu_x = joints_3d[n][joint_id][0] + mu_y = joints_3d[n][joint_id][1] + mu_z = joints_3d[n][joint_id][2] + vis = joints_3d_visible[n][joint_id][0] + if vis < 1: + continue + i_x = [ + np.searchsorted(grids_x, mu_x - 3 * self.sigma), + np.searchsorted(grids_x, mu_x + 3 * self.sigma, 'right') + ] + i_y = [ + np.searchsorted(grids_y, mu_y - 3 * self.sigma), + np.searchsorted(grids_y, mu_y + 3 * self.sigma, 'right') + ] + i_z = [ + np.searchsorted(grids_z, mu_z - 3 * self.sigma), + np.searchsorted(grids_z, mu_z + 3 * self.sigma, 'right') + ] + if i_x[0] >= i_x[1] or i_y[0] >= i_y[1] or i_z[0] >= i_z[1]: + continue + kernel_xs, kernel_ys, kernel_zs = np.meshgrid( + grids_x[i_x[0]:i_x[1]], + grids_y[i_y[0]:i_y[1]], + grids_z[i_z[0]:i_z[1]], + indexing='ij') + g = np.exp(-((kernel_xs - mu_x)**2 + (kernel_ys - mu_y)**2 + + (kernel_zs - mu_z)**2) / (2 * self.sigma**2)) + target[idx, i_x[0]:i_x[1], i_y[0]:i_y[1], i_z[0]:i_z[1]] \ + = np.maximum(target[idx, i_x[0]:i_x[1], + i_y[0]:i_y[1], i_z[0]:i_z[1]], g) + + target = np.clip(target, 0, 1) + if target.shape[0] == 1: + target = target[0] + + results['targets_3d'] = target + + return results diff --git a/mmpose/datasets/pipelines/shared_transform.py b/mmpose/datasets/pipelines/shared_transform.py new file mode 100644 index 0000000..e4fea80 --- /dev/null +++ b/mmpose/datasets/pipelines/shared_transform.py @@ -0,0 +1,527 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from collections.abc import Sequence + +import mmcv +import numpy as np +from mmcv.parallel import DataContainer as DC +from mmcv.utils import build_from_cfg +from numpy import random +from torchvision.transforms import functional as F + +from ..builder import PIPELINES + +try: + import albumentations +except ImportError: + albumentations = None + + +@PIPELINES.register_module() +class ToTensor: + """Transform image to Tensor. + + Required key: 'img'. Modifies key: 'img'. + + Args: + results (dict): contain all information about training. + """ + + def __call__(self, results): + if isinstance(results['img'], (list, tuple)): + results['img'] = [F.to_tensor(img) for img in results['img']] + else: + results['img'] = F.to_tensor(results['img']) + + return results + + +@PIPELINES.register_module() +class NormalizeTensor: + """Normalize the Tensor image (CxHxW), with mean and std. + + Required key: 'img'. Modifies key: 'img'. 
+ + Args: + mean (list[float]): Mean values of 3 channels. + std (list[float]): Std values of 3 channels. + """ + + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, results): + if isinstance(results['img'], (list, tuple)): + results['img'] = [ + F.normalize(img, mean=self.mean, std=self.std) + for img in results['img'] + ] + else: + results['img'] = F.normalize( + results['img'], mean=self.mean, std=self.std) + + return results + + +@PIPELINES.register_module() +class Compose: + """Compose a data pipeline with a sequence of transforms. + + Args: + transforms (list[dict | callable]): Either config + dicts of transforms or transform objects. + """ + + def __init__(self, transforms): + assert isinstance(transforms, Sequence) + self.transforms = [] + for transform in transforms: + if isinstance(transform, dict): + transform = build_from_cfg(transform, PIPELINES) + self.transforms.append(transform) + elif callable(transform): + self.transforms.append(transform) + else: + raise TypeError('transform must be callable or a dict, but got' + f' {type(transform)}') + + def __call__(self, data): + """Call function to apply transforms sequentially. + + Args: + data (dict): A result dict contains the data to transform. + + Returns: + dict: Transformed data. + """ + for t in self.transforms: + data = t(data) + if data is None: + return None + return data + + def __repr__(self): + """Compute the string representation.""" + format_string = self.__class__.__name__ + '(' + for t in self.transforms: + format_string += f'\n {t}' + format_string += '\n)' + return format_string + + +@PIPELINES.register_module() +class Collect: + """Collect data from the loader relevant to the specific task. + + This keeps the items in `keys` as it is, and collect items in `meta_keys` + into a meta item called `meta_name`.This is usually the last stage of the + data loader pipeline. + For example, when keys='imgs', meta_keys=('filename', 'label', + 'original_shape'), meta_name='img_metas', the results will be a dict with + keys 'imgs' and 'img_metas', where 'img_metas' is a DataContainer of + another dict with keys 'filename', 'label', 'original_shape'. + + Args: + keys (Sequence[str|tuple]): Required keys to be collected. If a tuple + (key, key_new) is given as an element, the item retrieved by key will + be renamed as key_new in collected data. + meta_name (str): The name of the key that contains meta information. + This key is always populated. Default: "img_metas". + meta_keys (Sequence[str|tuple]): Keys that are collected under + meta_name. The contents of the `meta_name` dictionary depends + on `meta_keys`. + """ + + def __init__(self, keys, meta_keys, meta_name='img_metas'): + self.keys = keys + self.meta_keys = meta_keys + self.meta_name = meta_name + + def __call__(self, results): + """Performs the Collect formatting. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
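A hedged sketch of how these shared transforms are typically chained at the end of a top-down pipeline config. The normalization statistics are the usual ImageNet values and the meta key names are the conventional ones; both are assumptions for illustration, not requirements of this file.

train_pipeline_tail = [
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=['image_file', 'center', 'scale', 'rotation', 'flip_pairs']),
]

# Building the pipeline goes through the Compose class defined below:
# from mmpose.datasets.pipelines import Compose
# pipeline = Compose(train_pipeline_tail)
# data = pipeline(results)   # `results` is the dict produced by the dataset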
+ """ + if 'ann_info' in results: + results.update(results['ann_info']) + + data = {} + for key in self.keys: + if isinstance(key, tuple): + assert len(key) == 2 + key_src, key_tgt = key[:2] + else: + key_src = key_tgt = key + data[key_tgt] = results[key_src] + + meta = {} + if len(self.meta_keys) != 0: + for key in self.meta_keys: + if isinstance(key, tuple): + assert len(key) == 2 + key_src, key_tgt = key[:2] + else: + key_src = key_tgt = key + meta[key_tgt] = results[key_src] + if 'bbox_id' in results: + meta['bbox_id'] = results['bbox_id'] + data[self.meta_name] = DC(meta, cpu_only=True) + + return data + + def __repr__(self): + """Compute the string representation.""" + return (f'{self.__class__.__name__}(' + f'keys={self.keys}, meta_keys={self.meta_keys})') + + +@PIPELINES.register_module() +class Albumentation: + """Albumentation augmentation (pixel-level transforms only). Adds custom + pixel-level transformations from Albumentations library. Please visit + `https://albumentations.readthedocs.io` to get more information. + + Note: we only support pixel-level transforms. + Please visit `https://github.com/albumentations-team/` + `albumentations#pixel-level-transforms` + to get more information about pixel-level transforms. + + An example of ``transforms`` is as followed: + + .. code-block:: python + + [ + dict( + type='RandomBrightnessContrast', + brightness_limit=[0.1, 0.3], + contrast_limit=[0.1, 0.3], + p=0.2), + dict(type='ChannelShuffle', p=0.1), + dict( + type='OneOf', + transforms=[ + dict(type='Blur', blur_limit=3, p=1.0), + dict(type='MedianBlur', blur_limit=3, p=1.0) + ], + p=0.1), + ] + + Args: + transforms (list[dict]): A list of Albumentation transformations + keymap (dict): Contains {'input key':'albumentation-style key'}, + e.g., {'img': 'image'}. + """ + + def __init__(self, transforms, keymap=None): + if albumentations is None: + raise RuntimeError('albumentations is not installed') + + self.transforms = transforms + self.filter_lost_elements = False + + self.aug = albumentations.Compose( + [self.albu_builder(t) for t in self.transforms]) + + if not keymap: + self.keymap_to_albu = { + 'img': 'image', + } + else: + self.keymap_to_albu = keymap + self.keymap_back = {v: k for k, v in self.keymap_to_albu.items()} + + def albu_builder(self, cfg): + """Import a module from albumentations. + + It resembles some of :func:`build_from_cfg` logic. + + Args: + cfg (dict): Config dict. It should at least contain the key "type". + + Returns: + obj: The constructed object. + """ + + assert isinstance(cfg, dict) and 'type' in cfg + args = cfg.copy() + + obj_type = args.pop('type') + if mmcv.is_str(obj_type): + if albumentations is None: + raise RuntimeError('albumentations is not installed') + if not hasattr(albumentations.augmentations.transforms, obj_type): + warnings.warn('{obj_type} is not pixel-level transformations. ' + 'Please use with caution.') + obj_cls = getattr(albumentations, obj_type) + else: + raise TypeError(f'type must be a str, but got {type(obj_type)}') + + if 'transforms' in args: + args['transforms'] = [ + self.albu_builder(transform) + for transform in args['transforms'] + ] + + return obj_cls(**args) + + @staticmethod + def mapper(d, keymap): + """Dictionary mapper. + + Renames keys according to keymap provided. + + Args: + d (dict): old dict + keymap (dict): {'old_key':'new_key'} + + Returns: + dict: new dict. 
+ """ + + updated_dict = {keymap.get(k, k): v for k, v in d.items()} + return updated_dict + + def __call__(self, results): + # dict to albumentations format + results = self.mapper(results, self.keymap_to_albu) + + results = self.aug(**results) + # back to the original format + results = self.mapper(results, self.keymap_back) + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + f'(transforms={self.transforms})' + return repr_str + + +@PIPELINES.register_module() +class PhotometricDistortion: + """Apply photometric distortion to image sequentially, every transformation + is applied with a probability of 0.5. The position of random contrast is in + second or second to last. + + 1. random brightness + 2. random contrast (mode 0) + 3. convert color from BGR to HSV + 4. random saturation + 5. random hue + 6. convert color from HSV to BGR + 7. random contrast (mode 1) + 8. randomly swap channels + + Args: + brightness_delta (int): delta of brightness. + contrast_range (tuple): range of contrast. + saturation_range (tuple): range of saturation. + hue_delta (int): delta of hue. + """ + + def __init__(self, + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18): + self.brightness_delta = brightness_delta + self.contrast_lower, self.contrast_upper = contrast_range + self.saturation_lower, self.saturation_upper = saturation_range + self.hue_delta = hue_delta + + def convert(self, img, alpha=1, beta=0): + """Multiple with alpha and add beta with clip.""" + img = img.astype(np.float32) * alpha + beta + img = np.clip(img, 0, 255) + return img.astype(np.uint8) + + def brightness(self, img): + """Brightness distortion.""" + if random.randint(2): + return self.convert( + img, + beta=random.uniform(-self.brightness_delta, + self.brightness_delta)) + return img + + def contrast(self, img): + """Contrast distortion.""" + if random.randint(2): + return self.convert( + img, + alpha=random.uniform(self.contrast_lower, self.contrast_upper)) + return img + + def saturation(self, img): + # Apply saturation distortion to hsv-formatted img + img[:, :, 1] = self.convert( + img[:, :, 1], + alpha=random.uniform(self.saturation_lower, self.saturation_upper)) + return img + + def hue(self, img): + # Apply hue distortion to hsv-formatted img + img[:, :, 0] = (img[:, :, 0].astype(int) + + random.randint(-self.hue_delta, self.hue_delta)) % 180 + return img + + def swap_channels(self, img): + # Apply channel swap + if random.randint(2): + img = img[..., random.permutation(3)] + return img + + def __call__(self, results): + """Call function to perform photometric distortion on images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images distorted. 
+ """ + + img = results['img'] + # random brightness + img = self.brightness(img) + + # mode == 0 --> do random contrast first + # mode == 1 --> do random contrast last + mode = random.randint(2) + if mode == 1: + img = self.contrast(img) + + hsv_mode = random.randint(4) + if hsv_mode: + # random saturation/hue distortion + img = mmcv.bgr2hsv(img) + if hsv_mode == 1 or hsv_mode == 3: + img = self.saturation(img) + if hsv_mode == 2 or hsv_mode == 3: + img = self.hue(img) + img = mmcv.hsv2bgr(img) + + # random contrast + if mode == 0: + img = self.contrast(img) + + # randomly swap channels + self.swap_channels(img) + + results['img'] = img + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += (f'(brightness_delta={self.brightness_delta}, ' + f'contrast_range=({self.contrast_lower}, ' + f'{self.contrast_upper}), ' + f'saturation_range=({self.saturation_lower}, ' + f'{self.saturation_upper}), ' + f'hue_delta={self.hue_delta})') + return repr_str + + +@PIPELINES.register_module() +class MultiItemProcess: + """Process each item and merge multi-item results to lists. + + Args: + pipeline (dict): Dictionary to construct pipeline for a single item. + """ + + def __init__(self, pipeline): + self.pipeline = Compose(pipeline) + + def __call__(self, results): + results_ = {} + for idx, result in results.items(): + single_result = self.pipeline(result) + for k, v in single_result.items(): + if k in results_: + results_[k].append(v) + else: + results_[k] = [v] + + return results_ + + +@PIPELINES.register_module() +class DiscardDuplicatedItems: + + def __init__(self, keys_list): + """Discard duplicated single-item results. + + Args: + keys_list (list): List of keys that need to be deduplicate. + """ + self.keys_list = keys_list + + def __call__(self, results): + for k, v in results.items(): + if k in self.keys_list: + assert isinstance(v, Sequence) + results[k] = v[0] + + return results + + +@PIPELINES.register_module() +class MultitaskGatherTarget: + """Gather the targets for multitask heads. + + Args: + pipeline_list (list[list]): List of pipelines for all heads. + pipeline_indices (list[int]): Pipeline index of each head. + """ + + def __init__(self, + pipeline_list, + pipeline_indices=None, + keys=('target', 'target_weight')): + self.keys = keys + self.pipelines = [] + for pipeline in pipeline_list: + self.pipelines.append(Compose(pipeline)) + if pipeline_indices is None: + self.pipeline_indices = list(range(len(pipeline_list))) + else: + self.pipeline_indices = pipeline_indices + + def __call__(self, results): + # generate target and target weights using all pipelines + pipeline_outputs = [] + for pipeline in self.pipelines: + pipeline_output = pipeline(results) + pipeline_outputs.append(pipeline_output.copy()) + + for key in self.keys: + result_key = [] + for ind in self.pipeline_indices: + result_key.append(pipeline_outputs[ind].get(key, None)) + results[key] = result_key + return results + + +@PIPELINES.register_module() +class RenameKeys: + """Rename the keys. + + Args: + key_pairs (Sequence[tuple]): Required keys to be renamed. + If a tuple (key_src, key_tgt) is given as an element, + the item retrieved by key_src will be renamed as key_tgt. 
+ """ + + def __init__(self, key_pairs): + self.key_pairs = key_pairs + + def __call__(self, results): + """Rename keys.""" + for key_pair in self.key_pairs: + assert len(key_pair) == 2 + key_src, key_tgt = key_pair + results[key_tgt] = results.pop(key_src) + return results diff --git a/mmpose/datasets/pipelines/top_down_transform.py b/mmpose/datasets/pipelines/top_down_transform.py new file mode 100644 index 0000000..1af1ea9 --- /dev/null +++ b/mmpose/datasets/pipelines/top_down_transform.py @@ -0,0 +1,736 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import cv2 +import numpy as np + +from mmpose.core.post_processing import (affine_transform, fliplr_joints, + get_affine_transform, get_warp_matrix, + warp_affine_joints) +from mmpose.datasets.builder import PIPELINES + + +@PIPELINES.register_module() +class TopDownRandomFlip: + """Data augmentation with random image flip. + + Required keys: 'img', 'joints_3d', 'joints_3d_visible', 'center' and + 'ann_info'. + + Modifies key: 'img', 'joints_3d', 'joints_3d_visible', 'center' and + 'flipped'. + + Args: + flip (bool): Option to perform random flip. + flip_prob (float): Probability of flip. + """ + + def __init__(self, flip_prob=0.5): + self.flip_prob = flip_prob + + def __call__(self, results): + """Perform data augmentation with random image flip.""" + img = results['img'] + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + center = results['center'] + + # A flag indicating whether the image is flipped, + # which can be used by child class. + flipped = False + if np.random.rand() <= self.flip_prob: + flipped = True + if not isinstance(img, list): + img = img[:, ::-1, :] + else: + img = [i[:, ::-1, :] for i in img] + if not isinstance(img, list): + joints_3d, joints_3d_visible = fliplr_joints( + joints_3d, joints_3d_visible, img.shape[1], + results['ann_info']['flip_pairs']) + center[0] = img.shape[1] - center[0] - 1 + else: + joints_3d, joints_3d_visible = fliplr_joints( + joints_3d, joints_3d_visible, img[0].shape[1], + results['ann_info']['flip_pairs']) + center[0] = img[0].shape[1] - center[0] - 1 + + results['img'] = img + results['joints_3d'] = joints_3d + results['joints_3d_visible'] = joints_3d_visible + results['center'] = center + results['flipped'] = flipped + + return results + + +@PIPELINES.register_module() +class TopDownHalfBodyTransform: + """Data augmentation with half-body transform. Keep only the upper body or + the lower body at random. + + Required keys: 'joints_3d', 'joints_3d_visible', and 'ann_info'. + + Modifies key: 'scale' and 'center'. + + Args: + num_joints_half_body (int): Threshold of performing + half-body transform. If the body has fewer number + of joints (< num_joints_half_body), ignore this step. + prob_half_body (float): Probability of half-body transform. 
+ """ + + def __init__(self, num_joints_half_body=8, prob_half_body=0.3): + self.num_joints_half_body = num_joints_half_body + self.prob_half_body = prob_half_body + + @staticmethod + def half_body_transform(cfg, joints_3d, joints_3d_visible): + """Get center&scale for half-body transform.""" + upper_joints = [] + lower_joints = [] + for joint_id in range(cfg['num_joints']): + if joints_3d_visible[joint_id][0] > 0: + if joint_id in cfg['upper_body_ids']: + upper_joints.append(joints_3d[joint_id]) + else: + lower_joints.append(joints_3d[joint_id]) + + if np.random.randn() < 0.5 and len(upper_joints) > 2: + selected_joints = upper_joints + elif len(lower_joints) > 2: + selected_joints = lower_joints + else: + selected_joints = upper_joints + + if len(selected_joints) < 2: + return None, None + + selected_joints = np.array(selected_joints, dtype=np.float32) + center = selected_joints.mean(axis=0)[:2] + + left_top = np.amin(selected_joints, axis=0) + + right_bottom = np.amax(selected_joints, axis=0) + + w = right_bottom[0] - left_top[0] + h = right_bottom[1] - left_top[1] + + aspect_ratio = cfg['image_size'][0] / cfg['image_size'][1] + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + scale = scale * 1.5 + return center, scale + + def __call__(self, results): + """Perform data augmentation with half-body transform.""" + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + + if (np.sum(joints_3d_visible[:, 0]) > self.num_joints_half_body + and np.random.rand() < self.prob_half_body): + + c_half_body, s_half_body = self.half_body_transform( + results['ann_info'], joints_3d, joints_3d_visible) + + if c_half_body is not None and s_half_body is not None: + results['center'] = c_half_body + results['scale'] = s_half_body + + return results + + +@PIPELINES.register_module() +class TopDownGetRandomScaleRotation: + """Data augmentation with random scaling & rotating. + + Required key: 'scale'. + + Modifies key: 'scale' and 'rotation'. + + Args: + rot_factor (int): Rotating to ``[-2*rot_factor, 2*rot_factor]``. + scale_factor (float): Scaling to ``[1-scale_factor, 1+scale_factor]``. + rot_prob (float): Probability of random rotation. + """ + + def __init__(self, rot_factor=40, scale_factor=0.5, rot_prob=0.6): + self.rot_factor = rot_factor + self.scale_factor = scale_factor + self.rot_prob = rot_prob + + def __call__(self, results): + """Perform data augmentation with random scaling & rotating.""" + s = results['scale'] + + sf = self.scale_factor + rf = self.rot_factor + + s_factor = np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) + s = s * s_factor + + r_factor = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) + r = r_factor if np.random.rand() <= self.rot_prob else 0 + + results['scale'] = s + results['rotation'] = r + + return results + + +@PIPELINES.register_module() +class TopDownAffine: + """Affine transform the image to make input. + + Required keys:'img', 'joints_3d', 'joints_3d_visible', 'ann_info','scale', + 'rotation' and 'center'. + + Modified keys:'img', 'joints_3d', and 'joints_3d_visible'. + + Args: + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). 
+ """ + + def __init__(self, use_udp=False): + self.use_udp = use_udp + + def __call__(self, results): + image_size = results['ann_info']['image_size'] + + img = results['img'] + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + c = results['center'] + s = results['scale'] + r = results['rotation'] + + if self.use_udp: + trans = get_warp_matrix(r, c * 2.0, image_size - 1.0, s * 200.0) + if not isinstance(img, list): + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + else: + img = [ + cv2.warpAffine( + i, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) for i in img + ] + + joints_3d[:, 0:2] = \ + warp_affine_joints(joints_3d[:, 0:2].copy(), trans) + + else: + trans = get_affine_transform(c, s, r, image_size) + if not isinstance(img, list): + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + else: + img = [ + cv2.warpAffine( + i, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) for i in img + ] + for i in range(results['ann_info']['num_joints']): + if joints_3d_visible[i, 0] > 0.0: + joints_3d[i, + 0:2] = affine_transform(joints_3d[i, 0:2], trans) + + results['img'] = img + results['joints_3d'] = joints_3d + results['joints_3d_visible'] = joints_3d_visible + + return results + + +@PIPELINES.register_module() +class TopDownGenerateTarget: + """Generate the target heatmap. + + Required keys: 'joints_3d', 'joints_3d_visible', 'ann_info'. + + Modified keys: 'target', and 'target_weight'. + + Args: + sigma: Sigma of heatmap gaussian for 'MSRA' approach. + kernel: Kernel of heatmap gaussian for 'Megvii' approach. + encoding (str): Approach to generate target heatmaps. + Currently supported approaches: 'MSRA', 'Megvii', 'UDP'. + Default:'MSRA' + unbiased_encoding (bool): Option to use unbiased + encoding methods. + Paper ref: Zhang et al. Distribution-Aware Coordinate + Representation for Human Pose Estimation (CVPR 2020). + keypoint_pose_distance: Keypoint pose distance for UDP. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + target_type (str): supported targets: 'GaussianHeatmap', + 'CombinedTarget'. Default:'GaussianHeatmap' + CombinedTarget: The combination of classification target + (response map) and regression target (offset map). + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + """ + + def __init__(self, + sigma=2, + kernel=(11, 11), + valid_radius_factor=0.0546875, + target_type='GaussianHeatmap', + encoding='MSRA', + unbiased_encoding=False): + self.sigma = sigma + self.unbiased_encoding = unbiased_encoding + self.kernel = kernel + self.valid_radius_factor = valid_radius_factor + self.target_type = target_type + self.encoding = encoding + + def _msra_generate_target(self, cfg, joints_3d, joints_3d_visible, sigma): + """Generate the target heatmap via "MSRA" approach. + + Args: + cfg (dict): data config + joints_3d: np.ndarray ([num_joints, 3]) + joints_3d_visible: np.ndarray ([num_joints, 3]) + sigma: Sigma of heatmap gaussian + Returns: + tuple: A tuple containing targets. + + - target: Target heatmaps. 
+ - target_weight: (1: visible, 0: invisible) + """ + num_joints = cfg['num_joints'] + image_size = cfg['image_size'] + W, H = cfg['heatmap_size'] + joint_weights = cfg['joint_weights'] + use_different_joint_weights = cfg['use_different_joint_weights'] + + target_weight = np.zeros((num_joints, 1), dtype=np.float32) + target = np.zeros((num_joints, H, W), dtype=np.float32) + + # 3-sigma rule + tmp_size = sigma * 3 + + if self.unbiased_encoding: + for joint_id in range(num_joints): + target_weight[joint_id] = joints_3d_visible[joint_id, 0] + + feat_stride = image_size / [W, H] + mu_x = joints_3d[joint_id][0] / feat_stride[0] + mu_y = joints_3d[joint_id][1] / feat_stride[1] + # Check that any part of the gaussian is in-bounds + ul = [mu_x - tmp_size, mu_y - tmp_size] + br = [mu_x + tmp_size + 1, mu_y + tmp_size + 1] + if ul[0] >= W or ul[1] >= H or br[0] < 0 or br[1] < 0: + target_weight[joint_id] = 0 + + if target_weight[joint_id] == 0: + continue + + x = np.arange(0, W, 1, np.float32) + y = np.arange(0, H, 1, np.float32) + y = y[:, None] + + if target_weight[joint_id] > 0.5: + target[joint_id] = np.exp(-((x - mu_x)**2 + + (y - mu_y)**2) / + (2 * sigma**2)) + else: + for joint_id in range(num_joints): + target_weight[joint_id] = joints_3d_visible[joint_id, 0] + + feat_stride = image_size / [W, H] + mu_x = int(joints_3d[joint_id][0] / feat_stride[0] + 0.5) + mu_y = int(joints_3d[joint_id][1] / feat_stride[1] + 0.5) + # Check that any part of the gaussian is in-bounds + ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] + br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] + if ul[0] >= W or ul[1] >= H or br[0] < 0 or br[1] < 0: + target_weight[joint_id] = 0 + + if target_weight[joint_id] > 0.5: + size = 2 * tmp_size + 1 + x = np.arange(0, size, 1, np.float32) + y = x[:, None] + x0 = y0 = size // 2 + # The gaussian is not normalized, + # we want the center value to equal 1 + g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)) + + # Usable gaussian range + g_x = max(0, -ul[0]), min(br[0], W) - ul[0] + g_y = max(0, -ul[1]), min(br[1], H) - ul[1] + # Image range + img_x = max(0, ul[0]), min(br[0], W) + img_y = max(0, ul[1]), min(br[1], H) + + target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \ + g[g_y[0]:g_y[1], g_x[0]:g_x[1]] + + if use_different_joint_weights: + target_weight = np.multiply(target_weight, joint_weights) + + return target, target_weight + + def _megvii_generate_target(self, cfg, joints_3d, joints_3d_visible, + kernel): + """Generate the target heatmap via "Megvii" approach. + + Args: + cfg (dict): data config + joints_3d: np.ndarray ([num_joints, 3]) + joints_3d_visible: np.ndarray ([num_joints, 3]) + kernel: Kernel of heatmap gaussian + + Returns: + tuple: A tuple containing targets. + + - target: Target heatmaps. 
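A standalone NumPy sketch of the (biased) MSRA branch above for a single joint: the joint is quantised to the heatmap grid, then an unnormalised Gaussian patch whose centre value is 1 is pasted in, cropped at the heatmap borders.

import numpy as np

W, H, sigma = 48, 64, 2
image_size = np.array([192, 256])
joint = np.array([100.0, 120.0])

feat_stride = image_size / [W, H]
mu_x = int(joint[0] / feat_stride[0] + 0.5)
mu_y = int(joint[1] / feat_stride[1] + 0.5)

tmp_size = sigma * 3
size = 2 * tmp_size + 1
x = np.arange(size, dtype=np.float32)
y = x[:, None]
x0 = y0 = size // 2
g = np.exp(-((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2))

heatmap = np.zeros((H, W), dtype=np.float32)
ul = [mu_x - tmp_size, mu_y - tmp_size]
br = [mu_x + tmp_size + 1, mu_y + tmp_size + 1]
g_x = max(0, -ul[0]), min(br[0], W) - ul[0]
g_y = max(0, -ul[1]), min(br[1], H) - ul[1]
img_x = max(0, ul[0]), min(br[0], W)
img_y = max(0, ul[1]), min(br[1], H)
heatmap[img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[0]:g_y[1], g_x[0]:g_x[1]]
print(heatmap.max(), heatmap.argmax() // W, heatmap.argmax() % W)  # 1.0 at (30, 25)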
+ - target_weight: (1: visible, 0: invisible) + """ + + num_joints = cfg['num_joints'] + image_size = cfg['image_size'] + W, H = cfg['heatmap_size'] + heatmaps = np.zeros((num_joints, H, W), dtype='float32') + target_weight = np.zeros((num_joints, 1), dtype=np.float32) + + for i in range(num_joints): + target_weight[i] = joints_3d_visible[i, 0] + + if target_weight[i] < 1: + continue + + target_y = int(joints_3d[i, 1] * H / image_size[1]) + target_x = int(joints_3d[i, 0] * W / image_size[0]) + + if (target_x >= W or target_x < 0) \ + or (target_y >= H or target_y < 0): + target_weight[i] = 0 + continue + + heatmaps[i, target_y, target_x] = 1 + heatmaps[i] = cv2.GaussianBlur(heatmaps[i], kernel, 0) + maxi = heatmaps[i, target_y, target_x] + + heatmaps[i] /= maxi / 255 + + return heatmaps, target_weight + + def _udp_generate_target(self, cfg, joints_3d, joints_3d_visible, factor, + target_type): + """Generate the target heatmap via 'UDP' approach. Paper ref: Huang et + al. The Devil is in the Details: Delving into Unbiased Data Processing + for Human Pose Estimation (CVPR 2020). + + Note: + - num keypoints: K + - heatmap height: H + - heatmap width: W + - num target channels: C + - C = K if target_type=='GaussianHeatmap' + - C = 3*K if target_type=='CombinedTarget' + + Args: + cfg (dict): data config + joints_3d (np.ndarray[K, 3]): Annotated keypoints. + joints_3d_visible (np.ndarray[K, 3]): Visibility of keypoints. + factor (float): kernel factor for GaussianHeatmap target or + valid radius factor for CombinedTarget. + target_type (str): 'GaussianHeatmap' or 'CombinedTarget'. + GaussianHeatmap: Heatmap target with gaussian distribution. + CombinedTarget: The combination of classification target + (response map) and regression target (offset map). + + Returns: + tuple: A tuple containing targets. + + - target (np.ndarray[C, H, W]): Target heatmaps. 
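A short NumPy/OpenCV sketch of the Megvii branch above: a single peak is placed at the quantised keypoint, blurred, and rescaled so the keypoint pixel equals 255 (so these heatmaps are not in [0, 1], unlike the MSRA/UDP encodings).

import cv2
import numpy as np

H, W, kernel = 64, 48, (11, 11)
target_x, target_y = 25, 30

heatmap = np.zeros((H, W), dtype='float32')
heatmap[target_y, target_x] = 1
heatmap = cv2.GaussianBlur(heatmap, kernel, 0)
heatmap /= heatmap[target_y, target_x] / 255
print(heatmap[target_y, target_x])   # 255.0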
+ - target_weight (np.ndarray[K, 1]): (1: visible, 0: invisible) + """ + num_joints = cfg['num_joints'] + image_size = cfg['image_size'] + heatmap_size = cfg['heatmap_size'] + joint_weights = cfg['joint_weights'] + use_different_joint_weights = cfg['use_different_joint_weights'] + + target_weight = np.ones((num_joints, 1), dtype=np.float32) + target_weight[:, 0] = joints_3d_visible[:, 0] + + if target_type.lower() == 'GaussianHeatmap'.lower(): + target = np.zeros((num_joints, heatmap_size[1], heatmap_size[0]), + dtype=np.float32) + + tmp_size = factor * 3 + + # prepare for gaussian + size = 2 * tmp_size + 1 + x = np.arange(0, size, 1, np.float32) + y = x[:, None] + + for joint_id in range(num_joints): + feat_stride = (image_size - 1.0) / (heatmap_size - 1.0) + mu_x = int(joints_3d[joint_id][0] / feat_stride[0] + 0.5) + mu_y = int(joints_3d[joint_id][1] / feat_stride[1] + 0.5) + # Check that any part of the gaussian is in-bounds + ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] + br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] + if ul[0] >= heatmap_size[0] or ul[1] >= heatmap_size[1] \ + or br[0] < 0 or br[1] < 0: + # If not, just return the image as is + target_weight[joint_id] = 0 + continue + + # # Generate gaussian + mu_x_ac = joints_3d[joint_id][0] / feat_stride[0] + mu_y_ac = joints_3d[joint_id][1] / feat_stride[1] + x0 = y0 = size // 2 + x0 += mu_x_ac - mu_x + y0 += mu_y_ac - mu_y + g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * factor**2)) + + # Usable gaussian range + g_x = max(0, -ul[0]), min(br[0], heatmap_size[0]) - ul[0] + g_y = max(0, -ul[1]), min(br[1], heatmap_size[1]) - ul[1] + # Image range + img_x = max(0, ul[0]), min(br[0], heatmap_size[0]) + img_y = max(0, ul[1]), min(br[1], heatmap_size[1]) + + v = target_weight[joint_id] + if v > 0.5: + target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \ + g[g_y[0]:g_y[1], g_x[0]:g_x[1]] + + elif target_type.lower() == 'CombinedTarget'.lower(): + target = np.zeros( + (num_joints, 3, heatmap_size[1] * heatmap_size[0]), + dtype=np.float32) + feat_width = heatmap_size[0] + feat_height = heatmap_size[1] + feat_x_int = np.arange(0, feat_width) + feat_y_int = np.arange(0, feat_height) + feat_x_int, feat_y_int = np.meshgrid(feat_x_int, feat_y_int) + feat_x_int = feat_x_int.flatten() + feat_y_int = feat_y_int.flatten() + # Calculate the radius of the positive area in classification + # heatmap. 
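A quick numeric illustration of why the UDP branch above uses (image_size - 1) / (heatmap_size - 1) as the stride: with a 192x256 input and a 48x64 heatmap, the unbiased stride maps the last input pixel exactly onto the last heatmap cell, while the naive stride leaves a sub-pixel offset.

import numpy as np

image_size = np.array([192.0, 256.0])
heatmap_size = np.array([48.0, 64.0])

naive_stride = image_size / heatmap_size                 # [4., 4.]
udp_stride = (image_size - 1.0) / (heatmap_size - 1.0)   # [~4.064, ~4.048]

last_pixel = image_size - 1.0                            # [191., 255.]
print(last_pixel / naive_stride)   # [47.75 63.75] -> misses the last cell
print(last_pixel / udp_stride)     # [47.   63.  ] -> lands on the last cell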
+ valid_radius = factor * heatmap_size[1] + feat_stride = (image_size - 1.0) / (heatmap_size - 1.0) + for joint_id in range(num_joints): + mu_x = joints_3d[joint_id][0] / feat_stride[0] + mu_y = joints_3d[joint_id][1] / feat_stride[1] + x_offset = (mu_x - feat_x_int) / valid_radius + y_offset = (mu_y - feat_y_int) / valid_radius + dis = x_offset**2 + y_offset**2 + keep_pos = np.where(dis <= 1)[0] + v = target_weight[joint_id] + if v > 0.5: + target[joint_id, 0, keep_pos] = 1 + target[joint_id, 1, keep_pos] = x_offset[keep_pos] + target[joint_id, 2, keep_pos] = y_offset[keep_pos] + target = target.reshape(num_joints * 3, heatmap_size[1], + heatmap_size[0]) + else: + raise ValueError('target_type should be either ' + "'GaussianHeatmap' or 'CombinedTarget'") + + if use_different_joint_weights: + target_weight = np.multiply(target_weight, joint_weights) + + return target, target_weight + + def __call__(self, results): + """Generate the target heatmap.""" + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + + assert self.encoding in ['MSRA', 'Megvii', 'UDP'] + + if self.encoding == 'MSRA': + if isinstance(self.sigma, list): + num_sigmas = len(self.sigma) + cfg = results['ann_info'] + num_joints = cfg['num_joints'] + heatmap_size = cfg['heatmap_size'] + + target = np.empty( + (0, num_joints, heatmap_size[1], heatmap_size[0]), + dtype=np.float32) + target_weight = np.empty((0, num_joints, 1), dtype=np.float32) + for i in range(num_sigmas): + target_i, target_weight_i = self._msra_generate_target( + cfg, joints_3d, joints_3d_visible, self.sigma[i]) + target = np.concatenate([target, target_i[None]], axis=0) + target_weight = np.concatenate( + [target_weight, target_weight_i[None]], axis=0) + else: + target, target_weight = self._msra_generate_target( + results['ann_info'], joints_3d, joints_3d_visible, + self.sigma) + + elif self.encoding == 'Megvii': + if isinstance(self.kernel, list): + num_kernels = len(self.kernel) + cfg = results['ann_info'] + num_joints = cfg['num_joints'] + W, H = cfg['heatmap_size'] + + target = np.empty((0, num_joints, H, W), dtype=np.float32) + target_weight = np.empty((0, num_joints, 1), dtype=np.float32) + for i in range(num_kernels): + target_i, target_weight_i = self._megvii_generate_target( + cfg, joints_3d, joints_3d_visible, self.kernel[i]) + target = np.concatenate([target, target_i[None]], axis=0) + target_weight = np.concatenate( + [target_weight, target_weight_i[None]], axis=0) + else: + target, target_weight = self._megvii_generate_target( + results['ann_info'], joints_3d, joints_3d_visible, + self.kernel) + + elif self.encoding == 'UDP': + if self.target_type.lower() == 'CombinedTarget'.lower(): + factors = self.valid_radius_factor + channel_factor = 3 + elif self.target_type.lower() == 'GaussianHeatmap'.lower(): + factors = self.sigma + channel_factor = 1 + else: + raise ValueError('target_type should be either ' + "'GaussianHeatmap' or 'CombinedTarget'") + if isinstance(factors, list): + num_factors = len(factors) + cfg = results['ann_info'] + num_joints = cfg['num_joints'] + W, H = cfg['heatmap_size'] + + target = np.empty((0, channel_factor * num_joints, H, W), + dtype=np.float32) + target_weight = np.empty((0, num_joints, 1), dtype=np.float32) + for i in range(num_factors): + target_i, target_weight_i = self._udp_generate_target( + cfg, joints_3d, joints_3d_visible, factors[i], + self.target_type) + target = np.concatenate([target, target_i[None]], axis=0) + target_weight = np.concatenate( + [target_weight, 
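A hedged config sketch of the multi-scale behaviour handled in __call__ below: when sigma is a list, the MSRA branch stacks one heatmap set per sigma, so the target gains a leading dimension of size len(sigma). Joint count and heatmap size here are the usual COCO-style values, assumed for illustration.

target_generator = dict(
    type='TopDownGenerateTarget',
    sigma=[2, 3],          # two scales -> leading dimension of size 2
    encoding='MSRA')

# With 17 joints and a 48x64 heatmap this yields
#   results['target'].shape        == (2, 17, 64, 48)
#   results['target_weight'].shape == (2, 17, 1)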
target_weight_i[None]], axis=0) + else: + target, target_weight = self._udp_generate_target( + results['ann_info'], joints_3d, joints_3d_visible, factors, + self.target_type) + else: + raise ValueError( + f'Encoding approach {self.encoding} is not supported!') + + if results['ann_info'].get('max_num_joints', None) is not None: + W, H = results['ann_info']['heatmap_size'] + padded_length = int(results['ann_info'].get('max_num_joints') - results['ann_info'].get('num_joints')) + target_weight = np.concatenate([target_weight, np.zeros((padded_length, 1), dtype=np.float32)], 0) + target = np.concatenate([target, np.zeros((padded_length, H, W), dtype=np.float32)], 0) + + results['target'] = target + results['target_weight'] = target_weight + + results['dataset_idx'] = results['ann_info'].get('dataset_idx', 0) + + return results + + +@PIPELINES.register_module() +class TopDownGenerateTargetRegression: + """Generate the target regression vector (coordinates). + + Required keys: 'joints_3d', 'joints_3d_visible', 'ann_info'. Modified keys: + 'target', and 'target_weight'. + """ + + def __init__(self): + pass + + def _generate_target(self, cfg, joints_3d, joints_3d_visible): + """Generate the target regression vector. + + Args: + cfg (dict): data config + joints_3d: np.ndarray([num_joints, 3]) + joints_3d_visible: np.ndarray([num_joints, 3]) + + Returns: + target, target_weight(1: visible, 0: invisible) + """ + image_size = cfg['image_size'] + joint_weights = cfg['joint_weights'] + use_different_joint_weights = cfg['use_different_joint_weights'] + + mask = (joints_3d[:, 0] >= 0) * ( + joints_3d[:, 0] <= image_size[0] - 1) * (joints_3d[:, 1] >= 0) * ( + joints_3d[:, 1] <= image_size[1] - 1) + + target = joints_3d[:, :2] / image_size + + target = target.astype(np.float32) + target_weight = joints_3d_visible[:, :2] * mask[:, None] + + if use_different_joint_weights: + target_weight = np.multiply(target_weight, joint_weights) + + return target, target_weight + + def __call__(self, results): + """Generate the target heatmap.""" + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + + target, target_weight = self._generate_target(results['ann_info'], + joints_3d, + joints_3d_visible) + + results['target'] = target + results['target_weight'] = target_weight + + return results + + +@PIPELINES.register_module() +class TopDownRandomTranslation: + """Data augmentation with random translation. + + Required key: 'scale' and 'center'. + + Modifies key: 'center'. + + Note: + - bbox height: H + - bbox width: W + + Args: + trans_factor (float): Translating center to + ``[-trans_factor, trans_factor] * [W, H] + center``. + trans_prob (float): Probability of random translation. + """ + + def __init__(self, trans_factor=0.15, trans_prob=1.0): + self.trans_factor = trans_factor + self.trans_prob = trans_prob + + def __call__(self, results): + """Perform data augmentation with random translation.""" + center = results['center'] + scale = results['scale'] + if np.random.rand() <= self.trans_prob: + # reference bbox size is [200, 200] pixels + center += self.trans_factor * np.random.uniform( + -1, 1, size=2) * scale * 200 + results['center'] = center + return results diff --git a/mmpose/datasets/registry.py b/mmpose/datasets/registry.py new file mode 100644 index 0000000..ba3cc49 --- /dev/null +++ b/mmpose/datasets/registry.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
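A standalone check of the arithmetic in TopDownGenerateTargetRegression above: coordinates are divided by the input size, and joints falling outside the image get zero weight via the mask.

import numpy as np

image_size = np.array([192, 256])
joints = np.array([[96.0, 128.0, 0.0], [300.0, 10.0, 0.0]])   # 2nd is out of bounds
visible = np.array([[1, 1, 0], [1, 1, 0]], dtype=np.float32)

mask = (joints[:, 0] >= 0) * (joints[:, 0] <= image_size[0] - 1) * \
       (joints[:, 1] >= 0) * (joints[:, 1] <= image_size[1] - 1)
target = (joints[:, :2] / image_size).astype(np.float32)
target_weight = visible[:, :2] * mask[:, None]
print(target)          # [[0.5 0.5], [1.5625 0.0390625]]
print(target_weight)   # [[1. 1.], [0. 0.]]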
+import warnings + +from .builder import DATASETS, PIPELINES + +__all__ = ['DATASETS', 'PIPELINES'] + +warnings.simplefilter('once', DeprecationWarning) +warnings.warn( + 'Registries (DATASETS, PIPELINES) have been moved to ' + 'mmpose.datasets.builder. Importing from ' + 'mmpose.models.registry will be deprecated in the future.', + DeprecationWarning) diff --git a/mmpose/datasets/samplers/__init__.py b/mmpose/datasets/samplers/__init__.py new file mode 100644 index 0000000..da09eff --- /dev/null +++ b/mmpose/datasets/samplers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .distributed_sampler import DistributedSampler + +__all__ = ['DistributedSampler'] diff --git a/mmpose/datasets/samplers/distributed_sampler.py b/mmpose/datasets/samplers/distributed_sampler.py new file mode 100644 index 0000000..bcb5f52 --- /dev/null +++ b/mmpose/datasets/samplers/distributed_sampler.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch.utils.data import DistributedSampler as _DistributedSampler + + +class DistributedSampler(_DistributedSampler): + """DistributedSampler inheriting from + `torch.utils.data.DistributedSampler`. + + In pytorch of lower versions, there is no `shuffle` argument. This child + class will port one to DistributedSampler. + """ + + def __init__(self, + dataset, + num_replicas=None, + rank=None, + shuffle=True, + seed=0): + super().__init__( + dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) + # for the compatibility from PyTorch 1.3+ + self.seed = seed if seed is not None else 0 + + def __iter__(self): + """Deterministically shuffle based on epoch.""" + if self.shuffle: + g = torch.Generator() + g.manual_seed(self.epoch + self.seed) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank:self.total_size:self.num_replicas] + assert len(indices) == self.num_samples + return iter(indices) diff --git a/mmpose/deprecated.py b/mmpose/deprecated.py new file mode 100644 index 0000000..b930901 --- /dev/null +++ b/mmpose/deprecated.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
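A usage sketch for the DistributedSampler defined above, assuming mmpose is installed; num_replicas and rank are passed explicitly so no process group is required for this illustration, and the dataset is a stand-in.

import torch
from torch.utils.data import DataLoader, TensorDataset
from mmpose.datasets.samplers import DistributedSampler

dataset = TensorDataset(torch.arange(100))
sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True, seed=42)
loader = DataLoader(dataset, batch_size=8, sampler=sampler)

for epoch in range(2):
    sampler.set_epoch(epoch)   # re-seeds the shuffle so epochs differ
    for (batch,) in loader:
        pass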
+import warnings + +from .datasets.builder import DATASETS +from .datasets.datasets.base import Kpt2dSviewRgbImgTopDownDataset +from .models.builder import HEADS, POSENETS +from .models.detectors import AssociativeEmbedding +from .models.heads import (AEHigherResolutionHead, AESimpleHead, + DeepposeRegressionHead, HMRMeshHead, + TopdownHeatmapMSMUHead, + TopdownHeatmapMultiStageHead, + TopdownHeatmapSimpleHead) + + +@DATASETS.register_module() +class TopDownFreiHandDataset(Kpt2dSviewRgbImgTopDownDataset): + """Deprecated TopDownFreiHandDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'TopDownFreiHandDataset has been renamed into FreiHandDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/202 for details.') + ) + + def _get_db(self): + return [] + + def evaluate(self, cfg, preds, output_dir, *args, **kwargs): + return None + + +@DATASETS.register_module() +class TopDownOneHand10KDataset(Kpt2dSviewRgbImgTopDownDataset): + """Deprecated TopDownOneHand10KDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'TopDownOneHand10KDataset has been renamed into OneHand10KDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/202 for details.') + ) + + def _get_db(self): + return [] + + def evaluate(self, cfg, preds, output_dir, *args, **kwargs): + return None + + +@DATASETS.register_module() +class TopDownPanopticDataset(Kpt2dSviewRgbImgTopDownDataset): + """Deprecated TopDownPanopticDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'TopDownPanopticDataset has been renamed into PanopticDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/202 for details.') + ) + + def _get_db(self): + return [] + + def evaluate(self, cfg, preds, output_dir, *args, **kwargs): + return None + + +@HEADS.register_module() +class BottomUpHigherResolutionHead(AEHigherResolutionHead): + """Bottom-up head for Higher Resolution. + + BottomUpHigherResolutionHead has been renamed into AEHigherResolutionHead, + check https://github.com/open- mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'BottomUpHigherResolutionHead has been renamed into ' + 'AEHigherResolutionHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@HEADS.register_module() +class BottomUpSimpleHead(AESimpleHead): + """Bottom-up simple head. + + BottomUpSimpleHead has been renamed into AESimpleHead, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'BottomUpHigherResolutionHead has been renamed into ' + 'AEHigherResolutionHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details', + DeprecationWarning) + + +@HEADS.register_module() +class TopDownSimpleHead(TopdownHeatmapSimpleHead): + """Top-down heatmap simple head. + + TopDownSimpleHead has been renamed into TopdownHeatmapSimpleHead, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'TopDownSimpleHead has been renamed into ' + 'TopdownHeatmapSimpleHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@HEADS.register_module() +class TopDownMultiStageHead(TopdownHeatmapMultiStageHead): + """Top-down heatmap multi-stage head. 
+ + TopDownMultiStageHead has been renamed into TopdownHeatmapMultiStageHead, + check https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'TopDownMultiStageHead has been renamed into ' + 'TopdownHeatmapMultiStageHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@HEADS.register_module() +class TopDownMSMUHead(TopdownHeatmapMSMUHead): + """Heads for multi-stage multi-unit heads. + + TopDownMSMUHead has been renamed into TopdownHeatmapMSMUHead, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'TopDownMSMUHead has been renamed into ' + 'TopdownHeatmapMSMUHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@HEADS.register_module() +class MeshHMRHead(HMRMeshHead): + """SMPL parameters regressor head. + + MeshHMRHead has been renamed into HMRMeshHead, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'MeshHMRHead has been renamed into ' + 'HMRMeshHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@HEADS.register_module() +class FcHead(DeepposeRegressionHead): + """FcHead (deprecated). + + FcHead has been renamed into DeepposeRegressionHead, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'FcHead has been renamed into ' + 'DeepposeRegressionHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@POSENETS.register_module() +class BottomUp(AssociativeEmbedding): + """Associative Embedding. + + BottomUp has been renamed into AssociativeEmbedding, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'BottomUp has been renamed into ' + 'AssociativeEmbedding, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) diff --git a/mmpose/models/__init__.py b/mmpose/models/__init__.py new file mode 100644 index 0000000..dbec55e --- /dev/null +++ b/mmpose/models/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .backbones import * # noqa +from .builder import (BACKBONES, HEADS, LOSSES, MESH_MODELS, NECKS, POSENETS, + build_backbone, build_head, build_loss, build_mesh_model, + build_neck, build_posenet) +from .detectors import * # noqa +from .heads import * # noqa +from .losses import * # noqa +from .necks import * # noqa +from .utils import * # noqa + +__all__ = [ + 'BACKBONES', 'HEADS', 'NECKS', 'LOSSES', 'POSENETS', 'MESH_MODELS', + 'build_backbone', 'build_head', 'build_loss', 'build_posenet', + 'build_neck', 'build_mesh_model' +] diff --git a/mmpose/models/backbones/__init__.py b/mmpose/models/backbones/__init__.py new file mode 100644 index 0000000..2b8efcf --- /dev/null +++ b/mmpose/models/backbones/__init__.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .alexnet import AlexNet +from .cpm import CPM +from .hourglass import HourglassNet +from .hourglass_ae import HourglassAENet +from .hrformer import HRFormer +from .hrnet import HRNet +from .litehrnet import LiteHRNet +from .mobilenet_v2 import MobileNetV2 +from .mobilenet_v3 import MobileNetV3 +from .mspn import MSPN +from .regnet import RegNet +from .resnest import ResNeSt +from .resnet import ResNet, ResNetV1d +from .resnext import ResNeXt +from .rsn import RSN +from .scnet import SCNet +from .seresnet import SEResNet +from .seresnext import SEResNeXt +from .shufflenet_v1 import ShuffleNetV1 +from .shufflenet_v2 import ShuffleNetV2 +from .tcn import TCN +from .v2v_net import V2VNet +from .vgg import VGG +from .vipnas_mbv3 import ViPNAS_MobileNetV3 +from .vipnas_resnet import ViPNAS_ResNet +from .vit import ViT +from .vit_moe import ViTMoE + +__all__ = [ + 'AlexNet', 'HourglassNet', 'HourglassAENet', 'HRNet', 'MobileNetV2', + 'MobileNetV3', 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SCNet', + 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN', + 'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3', + 'LiteHRNet', 'V2VNet', 'HRFormer', 'ViT', 'ViTMoE' +] diff --git a/mmpose/models/backbones/alexnet.py b/mmpose/models/backbones/alexnet.py new file mode 100644 index 0000000..a8efd74 --- /dev/null +++ b/mmpose/models/backbones/alexnet.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +@BACKBONES.register_module() +class AlexNet(BaseBackbone): + """`AlexNet `__ backbone. + + The input for AlexNet is a 224x224 RGB image. + + Args: + num_classes (int): number of classes for classification. + The default value is -1, which uses the backbone as + a feature extractor without the top classifier. + """ + + def __init__(self, num_classes=-1): + super().__init__() + self.num_classes = num_classes + self.features = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(64, 192, kernel_size=5, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(192, 384, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(384, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + ) + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Dropout(), + nn.Linear(256 * 6 * 6, 4096), + nn.ReLU(inplace=True), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(inplace=True), + nn.Linear(4096, num_classes), + ) + + def forward(self, x): + + x = self.features(x) + if self.num_classes > 0: + x = x.view(x.size(0), 256 * 6 * 6) + x = self.classifier(x) + + return x diff --git a/mmpose/models/backbones/base_backbone.py b/mmpose/models/backbones/base_backbone.py new file mode 100644 index 0000000..d64dca1 --- /dev/null +++ b/mmpose/models/backbones/base_backbone.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +from abc import ABCMeta, abstractmethod + +import torch.nn as nn + +# from .utils import load_checkpoint +from mmcv_custom.checkpoint import load_checkpoint + +class BaseBackbone(nn.Module, metaclass=ABCMeta): + """Base backbone. + + This class defines the basic functions of a backbone. 
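A quick shape check for the AlexNet backbone above when used as a feature extractor (num_classes=-1, the default, so only the convolutional features are applied); assumes mmpose is importable.

import torch
from mmpose.models import AlexNet

model = AlexNet(num_classes=-1)
model.eval()
with torch.no_grad():
    out = model(torch.rand(1, 3, 224, 224))
print(out.shape)   # torch.Size([1, 256, 6, 6])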
Any backbone that + inherits this class should at least define its own `forward` function. + """ + + def init_weights(self, pretrained=None, patch_padding='pad', part_features=None): + """Init backbone weights. + + Args: + pretrained (str | None): If pretrained is a string, then it + initializes backbone weights by loading the pretrained + checkpoint. If pretrained is None, then it follows default + initializer or customized initializer in subclasses. + """ + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger, patch_padding=patch_padding, part_features=part_features) + elif pretrained is None: + # use default initializer or customized initializer in subclasses + pass + else: + raise TypeError('pretrained must be a str or None.' + f' But received {type(pretrained)}.') + + @abstractmethod + def forward(self, x): + """Forward function. + + Args: + x (Tensor | tuple[Tensor]): x could be a torch.Tensor or a tuple of + torch.Tensor, containing input data for forward computation. + """ diff --git a/mmpose/models/backbones/cpm.py b/mmpose/models/backbones/cpm.py new file mode 100644 index 0000000..458245d --- /dev/null +++ b/mmpose/models/backbones/cpm.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, constant_init, normal_init +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import load_checkpoint + + +class CpmBlock(nn.Module): + """CpmBlock for Convolutional Pose Machine. + + Args: + in_channels (int): Input channels of this block. + channels (list): Output channels of each conv module. + kernels (list): Kernel sizes of each conv module. + """ + + def __init__(self, + in_channels, + channels=(128, 128, 128), + kernels=(11, 11, 11), + norm_cfg=None): + super().__init__() + + assert len(channels) == len(kernels) + layers = [] + for i in range(len(channels)): + if i == 0: + input_channels = in_channels + else: + input_channels = channels[i - 1] + layers.append( + ConvModule( + input_channels, + channels[i], + kernels[i], + padding=(kernels[i] - 1) // 2, + norm_cfg=norm_cfg)) + self.model = nn.Sequential(*layers) + + def forward(self, x): + """Model forward function.""" + out = self.model(x) + return out + + +@BACKBONES.register_module() +class CPM(BaseBackbone): + """CPM backbone. + + Convolutional Pose Machines. + More details can be found in the `paper + `__ . + + Args: + in_channels (int): The input channels of the CPM. + out_channels (int): The output channels of the CPM. + feat_channels (int): Feature channel of each CPM stage. + middle_channels (int): Feature channel of conv after the middle stage. + num_stages (int): Number of stages. + norm_cfg (dict): Dictionary to construct and config norm layer. + + Example: + >>> from mmpose.models import CPM + >>> import torch + >>> self = CPM(3, 17) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 368, 368) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... 
print(tuple(level_output.shape)) + (1, 17, 46, 46) + (1, 17, 46, 46) + (1, 17, 46, 46) + (1, 17, 46, 46) + (1, 17, 46, 46) + (1, 17, 46, 46) + """ + + def __init__(self, + in_channels, + out_channels, + feat_channels=128, + middle_channels=32, + num_stages=6, + norm_cfg=dict(type='BN', requires_grad=True)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + assert in_channels == 3 + + self.num_stages = num_stages + assert self.num_stages >= 1 + + self.stem = nn.Sequential( + ConvModule(in_channels, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + ConvModule(128, 32, 5, padding=2, norm_cfg=norm_cfg), + ConvModule(32, 512, 9, padding=4, norm_cfg=norm_cfg), + ConvModule(512, 512, 1, padding=0, norm_cfg=norm_cfg), + ConvModule(512, out_channels, 1, padding=0, act_cfg=None)) + + self.middle = nn.Sequential( + ConvModule(in_channels, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) + + self.cpm_stages = nn.ModuleList([ + CpmBlock( + middle_channels + out_channels, + channels=[feat_channels, feat_channels, feat_channels], + kernels=[11, 11, 11], + norm_cfg=norm_cfg) for _ in range(num_stages - 1) + ]) + + self.middle_conv = nn.ModuleList([ + nn.Sequential( + ConvModule( + 128, middle_channels, 5, padding=2, norm_cfg=norm_cfg)) + for _ in range(num_stages - 1) + ]) + + self.out_convs = nn.ModuleList([ + nn.Sequential( + ConvModule( + feat_channels, + feat_channels, + 1, + padding=0, + norm_cfg=norm_cfg), + ConvModule(feat_channels, out_channels, 1, act_cfg=None)) + for _ in range(num_stages - 1) + ]) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Model forward function.""" + stage1_out = self.stem(x) + middle_out = self.middle(x) + out_feats = [] + + out_feats.append(stage1_out) + + for ind in range(self.num_stages - 1): + single_stage = self.cpm_stages[ind] + out_conv = self.out_convs[ind] + + inp_feat = torch.cat( + [out_feats[-1], self.middle_conv[ind](middle_out)], 1) + cpm_feat = single_stage(inp_feat) + out_feat = out_conv(cpm_feat) + out_feats.append(out_feat) + + return out_feats diff --git a/mmpose/models/backbones/hourglass.py b/mmpose/models/backbones/hourglass.py new file mode 100644 index 0000000..bf75fad --- /dev/null +++ b/mmpose/models/backbones/hourglass.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
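A hedged sketch of how the CPM's stage-wise outputs are typically consumed: one heatmap tensor per stage, all at 1/8 resolution, with a loss applied to every stage. Assumes mmpose is importable; the MSE criterion is illustrative, not mandated by this file.

import torch
from mmpose.models import CPM

model = CPM(in_channels=3, out_channels=17, num_stages=6)
model.eval()
with torch.no_grad():
    stage_outputs = model(torch.rand(1, 3, 368, 368))

target = torch.rand(1, 17, 46, 46)
criterion = torch.nn.MSELoss()
loss = sum(criterion(out, target) for out in stage_outputs)
print(len(stage_outputs), stage_outputs[-1].shape, float(loss))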
+import copy + +import torch.nn as nn +from mmcv.cnn import ConvModule, constant_init, normal_init +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .resnet import BasicBlock, ResLayer +from .utils import load_checkpoint + + +class HourglassModule(nn.Module): + """Hourglass Module for HourglassNet backbone. + + Generate module recursively and use BasicBlock as the base unit. + + Args: + depth (int): Depth of current HourglassModule. + stage_channels (list[int]): Feature channels of sub-modules in current + and follow-up HourglassModule. + stage_blocks (list[int]): Number of sub-modules stacked in current and + follow-up HourglassModule. + norm_cfg (dict): Dictionary to construct and config norm layer. + """ + + def __init__(self, + depth, + stage_channels, + stage_blocks, + norm_cfg=dict(type='BN', requires_grad=True)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + self.depth = depth + + cur_block = stage_blocks[0] + next_block = stage_blocks[1] + + cur_channel = stage_channels[0] + next_channel = stage_channels[1] + + self.up1 = ResLayer( + BasicBlock, cur_block, cur_channel, cur_channel, norm_cfg=norm_cfg) + + self.low1 = ResLayer( + BasicBlock, + cur_block, + cur_channel, + next_channel, + stride=2, + norm_cfg=norm_cfg) + + if self.depth > 1: + self.low2 = HourglassModule(depth - 1, stage_channels[1:], + stage_blocks[1:]) + else: + self.low2 = ResLayer( + BasicBlock, + next_block, + next_channel, + next_channel, + norm_cfg=norm_cfg) + + self.low3 = ResLayer( + BasicBlock, + cur_block, + next_channel, + cur_channel, + norm_cfg=norm_cfg, + downsample_first=False) + + self.up2 = nn.Upsample(scale_factor=2) + + def forward(self, x): + """Model forward function.""" + up1 = self.up1(x) + low1 = self.low1(x) + low2 = self.low2(low1) + low3 = self.low3(low2) + up2 = self.up2(low3) + return up1 + up2 + + +@BACKBONES.register_module() +class HourglassNet(BaseBackbone): + """HourglassNet backbone. + + Stacked Hourglass Networks for Human Pose Estimation. + More details can be found in the `paper + `__ . + + Args: + downsample_times (int): Downsample times in a HourglassModule. + num_stacks (int): Number of HourglassModule modules stacked, + 1 for Hourglass-52, 2 for Hourglass-104. + stage_channels (list[int]): Feature channel of each sub-module in a + HourglassModule. + stage_blocks (list[int]): Number of sub-modules stacked in a + HourglassModule. + feat_channel (int): Feature channel of conv after a HourglassModule. + norm_cfg (dict): Dictionary to construct and config norm layer. + + Example: + >>> from mmpose.models import HourglassNet + >>> import torch + >>> self = HourglassNet() + >>> self.eval() + >>> inputs = torch.rand(1, 3, 511, 511) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... 
print(tuple(level_output.shape)) + (1, 256, 128, 128) + (1, 256, 128, 128) + """ + + def __init__(self, + downsample_times=5, + num_stacks=2, + stage_channels=(256, 256, 384, 384, 384, 512), + stage_blocks=(2, 2, 2, 2, 2, 4), + feat_channel=256, + norm_cfg=dict(type='BN', requires_grad=True)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + self.num_stacks = num_stacks + assert self.num_stacks >= 1 + assert len(stage_channels) == len(stage_blocks) + assert len(stage_channels) > downsample_times + + cur_channel = stage_channels[0] + + self.stem = nn.Sequential( + ConvModule(3, 128, 7, padding=3, stride=2, norm_cfg=norm_cfg), + ResLayer(BasicBlock, 1, 128, 256, stride=2, norm_cfg=norm_cfg)) + + self.hourglass_modules = nn.ModuleList([ + HourglassModule(downsample_times, stage_channels, stage_blocks) + for _ in range(num_stacks) + ]) + + self.inters = ResLayer( + BasicBlock, + num_stacks - 1, + cur_channel, + cur_channel, + norm_cfg=norm_cfg) + + self.conv1x1s = nn.ModuleList([ + ConvModule( + cur_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None) + for _ in range(num_stacks - 1) + ]) + + self.out_convs = nn.ModuleList([ + ConvModule( + cur_channel, feat_channel, 3, padding=1, norm_cfg=norm_cfg) + for _ in range(num_stacks) + ]) + + self.remap_convs = nn.ModuleList([ + ConvModule( + feat_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None) + for _ in range(num_stacks - 1) + ]) + + self.relu = nn.ReLU(inplace=True) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Model forward function.""" + inter_feat = self.stem(x) + out_feats = [] + + for ind in range(self.num_stacks): + single_hourglass = self.hourglass_modules[ind] + out_conv = self.out_convs[ind] + + hourglass_feat = single_hourglass(inter_feat) + out_feat = out_conv(hourglass_feat) + out_feats.append(out_feat) + + if ind < self.num_stacks - 1: + inter_feat = self.conv1x1s[ind]( + inter_feat) + self.remap_convs[ind]( + out_feat) + inter_feat = self.inters[ind](self.relu(inter_feat)) + + return out_feats diff --git a/mmpose/models/backbones/hourglass_ae.py b/mmpose/models/backbones/hourglass_ae.py new file mode 100644 index 0000000..5a700e5 --- /dev/null +++ b/mmpose/models/backbones/hourglass_ae.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +from mmcv.cnn import ConvModule, MaxPool2d, constant_init, normal_init +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import load_checkpoint + + +class HourglassAEModule(nn.Module): + """Modified Hourglass Module for HourglassNet_AE backbone. + + Generate module recursively and use BasicBlock as the base unit. + + Args: + depth (int): Depth of current HourglassModule. + stage_channels (list[int]): Feature channels of sub-modules in current + and follow-up HourglassModule. 
+ norm_cfg (dict): Dictionary to construct and config norm layer. + """ + + def __init__(self, + depth, + stage_channels, + norm_cfg=dict(type='BN', requires_grad=True)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + self.depth = depth + + cur_channel = stage_channels[0] + next_channel = stage_channels[1] + + self.up1 = ConvModule( + cur_channel, cur_channel, 3, padding=1, norm_cfg=norm_cfg) + + self.pool1 = MaxPool2d(2, 2) + + self.low1 = ConvModule( + cur_channel, next_channel, 3, padding=1, norm_cfg=norm_cfg) + + if self.depth > 1: + self.low2 = HourglassAEModule(depth - 1, stage_channels[1:]) + else: + self.low2 = ConvModule( + next_channel, next_channel, 3, padding=1, norm_cfg=norm_cfg) + + self.low3 = ConvModule( + next_channel, cur_channel, 3, padding=1, norm_cfg=norm_cfg) + + self.up2 = nn.UpsamplingNearest2d(scale_factor=2) + + def forward(self, x): + """Model forward function.""" + up1 = self.up1(x) + pool1 = self.pool1(x) + low1 = self.low1(pool1) + low2 = self.low2(low1) + low3 = self.low3(low2) + up2 = self.up2(low3) + return up1 + up2 + + +@BACKBONES.register_module() +class HourglassAENet(BaseBackbone): + """Hourglass-AE Network proposed by Newell et al. + + Associative Embedding: End-to-End Learning for Joint + Detection and Grouping. + + More details can be found in the `paper + `__ . + + Args: + downsample_times (int): Downsample times in a HourglassModule. + num_stacks (int): Number of HourglassModule modules stacked, + 1 for Hourglass-52, 2 for Hourglass-104. + stage_channels (list[int]): Feature channel of each sub-module in a + HourglassModule. + stage_blocks (list[int]): Number of sub-modules stacked in a + HourglassModule. + feat_channels (int): Feature channel of conv after a HourglassModule. + norm_cfg (dict): Dictionary to construct and config norm layer. + + Example: + >>> from mmpose.models import HourglassAENet + >>> import torch + >>> self = HourglassAENet() + >>> self.eval() + >>> inputs = torch.rand(1, 3, 512, 512) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... 
print(tuple(level_output.shape)) + (1, 34, 128, 128) + """ + + def __init__(self, + downsample_times=4, + num_stacks=1, + out_channels=34, + stage_channels=(256, 384, 512, 640, 768), + feat_channels=256, + norm_cfg=dict(type='BN', requires_grad=True)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + self.num_stacks = num_stacks + assert self.num_stacks >= 1 + assert len(stage_channels) > downsample_times + + cur_channels = stage_channels[0] + + self.stem = nn.Sequential( + ConvModule(3, 64, 7, padding=3, stride=2, norm_cfg=norm_cfg), + ConvModule(64, 128, 3, padding=1, norm_cfg=norm_cfg), + MaxPool2d(2, 2), + ConvModule(128, 128, 3, padding=1, norm_cfg=norm_cfg), + ConvModule(128, feat_channels, 3, padding=1, norm_cfg=norm_cfg), + ) + + self.hourglass_modules = nn.ModuleList([ + nn.Sequential( + HourglassAEModule( + downsample_times, stage_channels, norm_cfg=norm_cfg), + ConvModule( + feat_channels, + feat_channels, + 3, + padding=1, + norm_cfg=norm_cfg), + ConvModule( + feat_channels, + feat_channels, + 3, + padding=1, + norm_cfg=norm_cfg)) for _ in range(num_stacks) + ]) + + self.out_convs = nn.ModuleList([ + ConvModule( + cur_channels, + out_channels, + 1, + padding=0, + norm_cfg=None, + act_cfg=None) for _ in range(num_stacks) + ]) + + self.remap_out_convs = nn.ModuleList([ + ConvModule( + out_channels, + feat_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=None) for _ in range(num_stacks - 1) + ]) + + self.remap_feature_convs = nn.ModuleList([ + ConvModule( + feat_channels, + feat_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=None) for _ in range(num_stacks - 1) + ]) + + self.relu = nn.ReLU(inplace=True) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Model forward function.""" + inter_feat = self.stem(x) + out_feats = [] + + for ind in range(self.num_stacks): + single_hourglass = self.hourglass_modules[ind] + out_conv = self.out_convs[ind] + + hourglass_feat = single_hourglass(inter_feat) + out_feat = out_conv(hourglass_feat) + out_feats.append(out_feat) + + if ind < self.num_stacks - 1: + inter_feat = inter_feat + self.remap_out_convs[ind]( + out_feat) + self.remap_feature_convs[ind]( + hourglass_feat) + + return out_feats diff --git a/mmpose/models/backbones/hrformer.py b/mmpose/models/backbones/hrformer.py new file mode 100644 index 0000000..b843300 --- /dev/null +++ b/mmpose/models/backbones/hrformer.py @@ -0,0 +1,746 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import math + +import torch +import torch.nn as nn +# from timm.models.layers import to_2tuple, trunc_normal_ +from mmcv.cnn import (build_activation_layer, build_conv_layer, + build_norm_layer, trunc_normal_init) +from mmcv.cnn.bricks.transformer import build_dropout +from mmcv.runner import BaseModule +from torch.nn.functional import pad + +from ..builder import BACKBONES +from .hrnet import Bottleneck, HRModule, HRNet + + +def nlc_to_nchw(x, hw_shape): + """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor. 
+ + Args: + x (Tensor): The input tensor of shape [N, L, C] before conversion. + hw_shape (Sequence[int]): The height and width of output feature map. + + Returns: + Tensor: The output tensor of shape [N, C, H, W] after conversion. + """ + H, W = hw_shape + assert len(x.shape) == 3 + B, L, C = x.shape + assert L == H * W, 'The seq_len doesn\'t match H, W' + return x.transpose(1, 2).reshape(B, C, H, W) + + +def nchw_to_nlc(x): + """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor. + + Args: + x (Tensor): The input tensor of shape [N, C, H, W] before conversion. + + Returns: + Tensor: The output tensor of shape [N, L, C] after conversion. + """ + assert len(x.shape) == 4 + return x.flatten(2).transpose(1, 2).contiguous() + + +def build_drop_path(drop_path_rate): + """Build drop path layer.""" + return build_dropout(dict(type='DropPath', drop_prob=drop_path_rate)) + + +class WindowMSA(BaseModule): + """Window based multi-head self-attention (W-MSA) module with relative + position bias. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int]): The height and width of the window. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0.0 + proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. + with_rpe (bool, optional): If True, use relative position bias. + Default: True. + init_cfg (dict | None, optional): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + with_rpe=True, + init_cfg=None): + + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_embed_dims = embed_dims // num_heads + self.scale = qk_scale or head_embed_dims**-0.5 + + self.with_rpe = with_rpe + if self.with_rpe: + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros( + (2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + Wh, Ww = self.window_size + rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww) + rel_position_index = rel_index_coords + rel_index_coords.T + rel_position_index = rel_position_index.flip(1).contiguous() + self.register_buffer('relative_position_index', rel_position_index) + + self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop_rate) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(proj_drop_rate) + + self.softmax = nn.Softmax(dim=-1) + + def init_weights(self): + trunc_normal_init(self.relative_position_bias_table, std=0.02) + + def forward(self, x, mask=None): + """ + Args: + + x (tensor): input features with shape of (B*num_windows, N, C) + mask (tensor | None, Optional): mask with shape of (num_windows, + Wh*Ww, Wh*Ww), value should be between (-inf, 0]. 
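+
+ Returns:
+ Tensor: Output features with shape of (B*num_windows, N, C).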
+ """ + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + if self.with_rpe: + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + @staticmethod + def double_step_seq(step1, len1, step2, len2): + seq1 = torch.arange(0, step1 * len1, step1) + seq2 = torch.arange(0, step2 * len2, step2) + return (seq1[:, None] + seq2[None, :]).reshape(1, -1) + + +class LocalWindowSelfAttention(BaseModule): + r""" Local-window Self Attention (LSA) module with relative position bias. + + This module is the short-range self-attention module in the + Interlaced Sparse Self-Attention `_. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int] | int): The height and width of the window. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0.0 + proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. + with_rpe (bool, optional): If True, use relative position bias. + Default: True. + with_pad_mask (bool, optional): If True, mask out the padded tokens in + the attention process. Default: False. + init_cfg (dict | None, optional): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + with_rpe=True, + with_pad_mask=False, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + if isinstance(window_size, int): + window_size = (window_size, window_size) + self.window_size = window_size + self.with_pad_mask = with_pad_mask + self.attn = WindowMSA( + embed_dims=embed_dims, + num_heads=num_heads, + window_size=window_size, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=proj_drop_rate, + with_rpe=with_rpe, + init_cfg=init_cfg) + + def forward(self, x, H, W, **kwargs): + """Forward function.""" + B, N, C = x.shape + x = x.view(B, H, W, C) + Wh, Ww = self.window_size + + # center-pad the feature on H and W axes + pad_h = math.ceil(H / Wh) * Wh - H + pad_w = math.ceil(W / Ww) * Ww - W + x = pad(x, (0, 0, pad_w // 2, pad_w - pad_w // 2, pad_h // 2, + pad_h - pad_h // 2)) + + # permute + x = x.view(B, math.ceil(H / Wh), Wh, math.ceil(W / Ww), Ww, C) + x = x.permute(0, 1, 3, 2, 4, 5) + x = x.reshape(-1, Wh * Ww, C) # (B*num_window, Wh*Ww, C) + + # attention + if self.with_pad_mask and pad_h > 0 and pad_w > 0: + pad_mask = x.new_zeros(1, H, W, 1) + pad_mask = pad( + pad_mask, [ + 0, 0, pad_w // 2, pad_w - pad_w // 2, pad_h // 2, + pad_h - pad_h // 2 + ], + value=-float('inf')) + pad_mask = pad_mask.view(1, math.ceil(H / Wh), Wh, + math.ceil(W / Ww), Ww, 1) + pad_mask = pad_mask.permute(1, 3, 0, 2, 4, 5) + pad_mask = pad_mask.reshape(-1, Wh * Ww) + pad_mask = pad_mask[:, None, :].expand([-1, Wh * Ww, -1]) + out = self.attn(x, pad_mask, **kwargs) + else: + out = self.attn(x, **kwargs) + + # reverse permutation + out = out.reshape(B, math.ceil(H / Wh), math.ceil(W / Ww), Wh, Ww, C) + out = out.permute(0, 1, 3, 2, 4, 5) + out = out.reshape(B, H + pad_h, W + pad_w, C) + + # de-pad + out = out[:, pad_h // 2:H + pad_h // 2, pad_w // 2:W + pad_w // 2] + return out.reshape(B, N, C) + + +class CrossFFN(BaseModule): + r"""FFN with Depthwise Conv of HRFormer. + + Args: + in_features (int): The feature dimension. + hidden_features (int, optional): The hidden dimension of FFNs. + Defaults: The same as in_features. + act_cfg (dict, optional): Config of activation layer. + Default: dict(type='GELU'). + dw_act_cfg (dict, optional): Config of activation layer appended + right after DW Conv. Default: dict(type='GELU'). + norm_cfg (dict, optional): Config of norm layer. + Default: dict(type='SyncBN'). + init_cfg (dict | list | None, optional): The init config. + Default: None. 
+ """ + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_cfg=dict(type='GELU'), + dw_act_cfg=dict(type='GELU'), + norm_cfg=dict(type='SyncBN'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1) + self.act1 = build_activation_layer(act_cfg) + self.norm1 = build_norm_layer(norm_cfg, hidden_features)[1] + self.dw3x3 = nn.Conv2d( + hidden_features, + hidden_features, + kernel_size=3, + stride=1, + groups=hidden_features, + padding=1) + self.act2 = build_activation_layer(dw_act_cfg) + self.norm2 = build_norm_layer(norm_cfg, hidden_features)[1] + self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1) + self.act3 = build_activation_layer(act_cfg) + self.norm3 = build_norm_layer(norm_cfg, out_features)[1] + + # put the modules togather + self.layers = [ + self.fc1, self.norm1, self.act1, self.dw3x3, self.norm2, self.act2, + self.fc2, self.norm3, self.act3 + ] + + def forward(self, x, H, W): + """Forward function.""" + x = nlc_to_nchw(x, (H, W)) + for layer in self.layers: + x = layer(x) + x = nchw_to_nlc(x) + return x + + +class HRFormerBlock(BaseModule): + """High-Resolution Block for HRFormer. + + Args: + in_features (int): The input dimension. + out_features (int): The output dimension. + num_heads (int): The number of head within each LSA. + window_size (int, optional): The window size for the LSA. + Default: 7 + mlp_ratio (int, optional): The expansion ration of FFN. + Default: 4 + act_cfg (dict, optional): Config of activation layer. + Default: dict(type='GELU'). + norm_cfg (dict, optional): Config of norm layer. + Default: dict(type='SyncBN'). + transformer_norm_cfg (dict, optional): Config of transformer norm + layer. Default: dict(type='LN', eps=1e-6). + init_cfg (dict | list | None, optional): The init config. + Default: None. 
+ """ + + expansion = 1 + + def __init__(self, + in_features, + out_features, + num_heads, + window_size=7, + mlp_ratio=4.0, + drop_path=0.0, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='SyncBN'), + transformer_norm_cfg=dict(type='LN', eps=1e-6), + init_cfg=None, + **kwargs): + super(HRFormerBlock, self).__init__(init_cfg=init_cfg) + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + + self.norm1 = build_norm_layer(transformer_norm_cfg, in_features)[1] + self.attn = LocalWindowSelfAttention( + in_features, + num_heads=num_heads, + window_size=window_size, + init_cfg=None, + **kwargs) + + self.norm2 = build_norm_layer(transformer_norm_cfg, out_features)[1] + self.ffn = CrossFFN( + in_features=in_features, + hidden_features=int(in_features * mlp_ratio), + out_features=out_features, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + dw_act_cfg=act_cfg, + init_cfg=None) + + self.drop_path = build_drop_path( + drop_path) if drop_path > 0.0 else nn.Identity() + + def forward(self, x): + """Forward function.""" + B, C, H, W = x.size() + # Attention + x = x.view(B, C, -1).permute(0, 2, 1) + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + # FFN + x = x + self.drop_path(self.ffn(self.norm2(x), H, W)) + x = x.permute(0, 2, 1).view(B, C, H, W) + return x + + def extra_repr(self): + """(Optional) Set the extra information about this module.""" + return 'num_heads={}, window_size={}, mlp_ratio={}'.format( + self.num_heads, self.window_size, self.mlp_ratio) + + +class HRFomerModule(HRModule): + """High-Resolution Module for HRFormer. + + Args: + num_branches (int): The number of branches in the HRFormerModule. + block (nn.Module): The building block of HRFormer. + The block should be the HRFormerBlock. + num_blocks (tuple): The number of blocks in each branch. + The length must be equal to num_branches. + num_inchannels (tuple): The number of input channels in each branch. + The length must be equal to num_branches. + num_channels (tuple): The number of channels in each branch. + The length must be equal to num_branches. + num_heads (tuple): The number of heads within the LSAs. + num_window_sizes (tuple): The window size for the LSAs. + num_mlp_ratios (tuple): The expansion ratio for the FFNs. + drop_path (int, optional): The drop path rate of HRFomer. + Default: 0.0 + multiscale_output (bool, optional): Whether to output multi-level + features produced by multiple branches. If False, only the first + level feature will be output. Default: True. + conv_cfg (dict, optional): Config of the conv layers. + Default: None. + norm_cfg (dict, optional): Config of the norm layers appended + right after conv. Default: dict(type='SyncBN', requires_grad=True) + transformer_norm_cfg (dict, optional): Config of the norm layers. + Default: dict(type='LN', eps=1e-6) + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False + upsample_cfg(dict, optional): The config of upsample layers in fuse + layers. 
Default: dict(mode='bilinear', align_corners=False) + """ + + def __init__(self, + num_branches, + block, + num_blocks, + num_inchannels, + num_channels, + num_heads, + num_window_sizes, + num_mlp_ratios, + multiscale_output=True, + drop_paths=0.0, + with_rpe=True, + with_pad_mask=False, + conv_cfg=None, + norm_cfg=dict(type='SyncBN', requires_grad=True), + transformer_norm_cfg=dict(type='LN', eps=1e-6), + with_cp=False, + upsample_cfg=dict(mode='bilinear', align_corners=False)): + + self.transformer_norm_cfg = transformer_norm_cfg + self.drop_paths = drop_paths + self.num_heads = num_heads + self.num_window_sizes = num_window_sizes + self.num_mlp_ratios = num_mlp_ratios + self.with_rpe = with_rpe + self.with_pad_mask = with_pad_mask + + super().__init__(num_branches, block, num_blocks, num_inchannels, + num_channels, multiscale_output, with_cp, conv_cfg, + norm_cfg, upsample_cfg) + + def _make_one_branch(self, + branch_index, + block, + num_blocks, + num_channels, + stride=1): + """Build one branch.""" + # HRFormerBlock does not support down sample layer yet. + assert stride == 1 and self.in_channels[branch_index] == num_channels[ + branch_index] + layers = [] + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + num_heads=self.num_heads[branch_index], + window_size=self.num_window_sizes[branch_index], + mlp_ratio=self.num_mlp_ratios[branch_index], + drop_path=self.drop_paths[0], + norm_cfg=self.norm_cfg, + transformer_norm_cfg=self.transformer_norm_cfg, + init_cfg=None, + with_rpe=self.with_rpe, + with_pad_mask=self.with_pad_mask)) + + self.in_channels[ + branch_index] = self.in_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + num_heads=self.num_heads[branch_index], + window_size=self.num_window_sizes[branch_index], + mlp_ratio=self.num_mlp_ratios[branch_index], + drop_path=self.drop_paths[i], + norm_cfg=self.norm_cfg, + transformer_norm_cfg=self.transformer_norm_cfg, + init_cfg=None, + with_rpe=self.with_rpe, + with_pad_mask=self.with_pad_mask)) + return nn.Sequential(*layers) + + def _make_fuse_layers(self): + """Build fuse layers.""" + if self.num_branches == 1: + return None + num_branches = self.num_branches + num_inchannels = self.in_channels + fuse_layers = [] + for i in range(num_branches if self.multiscale_output else 1): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + num_inchannels[j], + num_inchannels[i], + kernel_size=1, + stride=1, + bias=False), + build_norm_layer(self.norm_cfg, + num_inchannels[i])[1], + nn.Upsample( + scale_factor=2**(j - i), + mode=self.upsample_cfg['mode'], + align_corners=self. 
+ upsample_cfg['align_corners']))) + elif j == i: + fuse_layer.append(None) + else: + conv3x3s = [] + for k in range(i - j): + if k == i - j - 1: + num_outchannels_conv3x3 = num_inchannels[i] + with_out_act = False + else: + num_outchannels_conv3x3 = num_inchannels[j] + with_out_act = True + sub_modules = [ + build_conv_layer( + self.conv_cfg, + num_inchannels[j], + num_inchannels[j], + kernel_size=3, + stride=2, + padding=1, + groups=num_inchannels[j], + bias=False, + ), + build_norm_layer(self.norm_cfg, + num_inchannels[j])[1], + build_conv_layer( + self.conv_cfg, + num_inchannels[j], + num_outchannels_conv3x3, + kernel_size=1, + stride=1, + bias=False, + ), + build_norm_layer(self.norm_cfg, + num_outchannels_conv3x3)[1] + ] + if with_out_act: + sub_modules.append(nn.ReLU(False)) + conv3x3s.append(nn.Sequential(*sub_modules)) + fuse_layer.append(nn.Sequential(*conv3x3s)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def get_num_inchannels(self): + """Return the number of input channels.""" + return self.in_channels + + +@BACKBONES.register_module() +class HRFormer(HRNet): + """HRFormer backbone. + + This backbone is the implementation of `HRFormer: High-Resolution + Transformer for Dense Prediction `_. + + Args: + extra (dict): Detailed configuration for each stage of HRNet. + There must be 4 stages, the configuration for each stage must have + 5 keys: + + - num_modules (int): The number of HRModule in this stage. + - num_branches (int): The number of branches in the HRModule. + - block (str): The type of block. + - num_blocks (tuple): The number of blocks in each branch. + The length must be equal to num_branches. + - num_channels (tuple): The number of channels in each branch. + The length must be equal to num_branches. + in_channels (int): Number of input image channels. Normally 3. + conv_cfg (dict): Dictionary to construct and config conv layer. + Default: None. + norm_cfg (dict): Config of norm layer. + Use `SyncBN` by default. + transformer_norm_cfg (dict): Config of transformer norm layer. + Use `LN` by default. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. 
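+
+ Note:
+ ``extra`` must also contain the key ``drop_path_rate`` (float), and
+ each transformer stage config (stage2 to stage4) additionally needs
+ ``num_heads``, ``window_sizes`` and ``mlp_ratios`` with one entry per
+ branch.
+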
+ Example: + >>> from mmpose.models import HRFormer + >>> import torch + >>> extra = dict( + >>> stage1=dict( + >>> num_modules=1, + >>> num_branches=1, + >>> block='BOTTLENECK', + >>> num_blocks=(2, ), + >>> num_channels=(64, )), + >>> stage2=dict( + >>> num_modules=1, + >>> num_branches=2, + >>> block='HRFORMER', + >>> window_sizes=(7, 7), + >>> num_heads=(1, 2), + >>> mlp_ratios=(4, 4), + >>> num_blocks=(2, 2), + >>> num_channels=(32, 64)), + >>> stage3=dict( + >>> num_modules=4, + >>> num_branches=3, + >>> block='HRFORMER', + >>> window_sizes=(7, 7, 7), + >>> num_heads=(1, 2, 4), + >>> mlp_ratios=(4, 4, 4), + >>> num_blocks=(2, 2, 2), + >>> num_channels=(32, 64, 128)), + >>> stage4=dict( + >>> num_modules=2, + >>> num_branches=4, + >>> block='HRFORMER', + >>> window_sizes=(7, 7, 7, 7), + >>> num_heads=(1, 2, 4, 8), + >>> mlp_ratios=(4, 4, 4, 4), + >>> num_blocks=(2, 2, 2, 2), + >>> num_channels=(32, 64, 128, 256))) + >>> self = HRFormer(extra, in_channels=1) + >>> self.eval() + >>> inputs = torch.rand(1, 1, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 32, 8, 8) + (1, 64, 4, 4) + (1, 128, 2, 2) + (1, 256, 1, 1) + """ + + blocks_dict = {'BOTTLENECK': Bottleneck, 'HRFORMERBLOCK': HRFormerBlock} + + def __init__(self, + extra, + in_channels=3, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + transformer_norm_cfg=dict(type='LN', eps=1e-6), + norm_eval=False, + with_cp=False, + zero_init_residual=False, + frozen_stages=-1): + + # stochastic depth + depths = [ + extra[stage]['num_blocks'][0] * extra[stage]['num_modules'] + for stage in ['stage2', 'stage3', 'stage4'] + ] + depth_s2, depth_s3, _ = depths + drop_path_rate = extra['drop_path_rate'] + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] + extra['stage2']['drop_path_rates'] = dpr[0:depth_s2] + extra['stage3']['drop_path_rates'] = dpr[depth_s2:depth_s2 + depth_s3] + extra['stage4']['drop_path_rates'] = dpr[depth_s2 + depth_s3:] + + # HRFormer use bilinear upsample as default + upsample_cfg = extra.get('upsample', { + 'mode': 'bilinear', + 'align_corners': False + }) + extra['upsample'] = upsample_cfg + self.transformer_norm_cfg = transformer_norm_cfg + self.with_rpe = extra.get('with_rpe', True) + self.with_pad_mask = extra.get('with_pad_mask', False) + + super().__init__(extra, in_channels, conv_cfg, norm_cfg, norm_eval, + with_cp, zero_init_residual, frozen_stages) + + def _make_stage(self, + layer_config, + num_inchannels, + multiscale_output=True): + """Make each stage.""" + num_modules = layer_config['num_modules'] + num_branches = layer_config['num_branches'] + num_blocks = layer_config['num_blocks'] + num_channels = layer_config['num_channels'] + block = self.blocks_dict[layer_config['block']] + num_heads = layer_config['num_heads'] + num_window_sizes = layer_config['window_sizes'] + num_mlp_ratios = layer_config['mlp_ratios'] + drop_path_rates = layer_config['drop_path_rates'] + + modules = [] + for i in range(num_modules): + # multiscale_output is only used at the last module + if not multiscale_output and i == num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + + modules.append( + HRFomerModule( + num_branches, + block, + num_blocks, + num_inchannels, + num_channels, + num_heads, + num_window_sizes, + num_mlp_ratios, + reset_multiscale_output, + drop_paths=drop_path_rates[num_blocks[0] * + i:num_blocks[0] * (i + 1)], + with_rpe=self.with_rpe, + 
with_pad_mask=self.with_pad_mask, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + transformer_norm_cfg=self.transformer_norm_cfg, + with_cp=self.with_cp, + upsample_cfg=self.upsample_cfg)) + num_inchannels = modules[-1].get_num_inchannels() + + return nn.Sequential(*modules), num_inchannels diff --git a/mmpose/models/backbones/hrnet.py b/mmpose/models/backbones/hrnet.py new file mode 100644 index 0000000..87dc8ce --- /dev/null +++ b/mmpose/models/backbones/hrnet.py @@ -0,0 +1,604 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init, + normal_init) +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .resnet import BasicBlock, Bottleneck, get_expansion +from .utils import load_checkpoint + + +class HRModule(nn.Module): + """High-Resolution Module for HRNet. + + In this module, every branch has 4 BasicBlocks/Bottlenecks. Fusion/Exchange + is in this module. + """ + + def __init__(self, + num_branches, + blocks, + num_blocks, + in_channels, + num_channels, + multiscale_output=False, + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + upsample_cfg=dict(mode='nearest', align_corners=None)): + + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self._check_branches(num_branches, num_blocks, in_channels, + num_channels) + + self.in_channels = in_channels + self.num_branches = num_branches + + self.multiscale_output = multiscale_output + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.upsample_cfg = upsample_cfg + self.with_cp = with_cp + self.branches = self._make_branches(num_branches, blocks, num_blocks, + num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(inplace=True) + + @staticmethod + def _check_branches(num_branches, num_blocks, in_channels, num_channels): + """Check input to avoid ValueError.""" + if num_branches != len(num_blocks): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_BLOCKS({len(num_blocks)})' + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_CHANNELS({len(num_channels)})' + raise ValueError(error_msg) + + if num_branches != len(in_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_INCHANNELS({len(in_channels)})' + raise ValueError(error_msg) + + def _make_one_branch(self, + branch_index, + block, + num_blocks, + num_channels, + stride=1): + """Make one branch.""" + downsample = None + if stride != 1 or \ + self.in_channels[branch_index] != \ + num_channels[branch_index] * get_expansion(block): + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + self.in_channels[branch_index], + num_channels[branch_index] * get_expansion(block), + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer( + self.norm_cfg, + num_channels[branch_index] * get_expansion(block))[1]) + + layers = [] + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index] * get_expansion(block), + stride=stride, + downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + self.in_channels[branch_index] = \ + num_channels[branch_index] * get_expansion(block) + for _ in range(1, num_blocks[branch_index]): + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index] * 
get_expansion(block), + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + + return nn.Sequential(*layers) + + def _make_branches(self, num_branches, block, num_blocks, num_channels): + """Make branches.""" + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch(i, block, num_blocks, num_channels)) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + """Make fuse layer.""" + if self.num_branches == 1: + return None + + num_branches = self.num_branches + in_channels = self.in_channels + fuse_layers = [] + num_out_branches = num_branches if self.multiscale_output else 1 + + for i in range(num_out_branches): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, in_channels[i])[1], + nn.Upsample( + scale_factor=2**(j - i), + mode=self.upsample_cfg['mode'], + align_corners=self. + upsample_cfg['align_corners']))) + elif j == i: + fuse_layer.append(None) + else: + conv_downsamples = [] + for k in range(i - j): + if k == i - j - 1: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[i])[1])) + else: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + nn.ReLU(inplace=True))) + fuse_layer.append(nn.Sequential(*conv_downsamples)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def forward(self, x): + """Forward function.""" + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = self.branches[i](x[i]) + + x_fuse = [] + for i in range(len(self.fuse_layers)): + y = 0 + for j in range(self.num_branches): + if i == j: + y += x[j] + else: + y += self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + return x_fuse + + +@BACKBONES.register_module() +class HRNet(nn.Module): + """HRNet backbone. + + `High-Resolution Representations for Labeling Pixels and Regions + `__ + + Args: + extra (dict): detailed configuration for each stage of HRNet. + in_channels (int): Number of input image channels. Default: 3. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. 
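+
+ Note:
+ By default only the highest-resolution branch of stage4 is returned,
+ since ``multiscale_output`` defaults to False; set
+ ``extra['stage4']['multiscale_output']=True`` to obtain all four
+ branches.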
+ + Example: + >>> from mmpose.models import HRNet + >>> import torch + >>> extra = dict( + >>> stage1=dict( + >>> num_modules=1, + >>> num_branches=1, + >>> block='BOTTLENECK', + >>> num_blocks=(4, ), + >>> num_channels=(64, )), + >>> stage2=dict( + >>> num_modules=1, + >>> num_branches=2, + >>> block='BASIC', + >>> num_blocks=(4, 4), + >>> num_channels=(32, 64)), + >>> stage3=dict( + >>> num_modules=4, + >>> num_branches=3, + >>> block='BASIC', + >>> num_blocks=(4, 4, 4), + >>> num_channels=(32, 64, 128)), + >>> stage4=dict( + >>> num_modules=3, + >>> num_branches=4, + >>> block='BASIC', + >>> num_blocks=(4, 4, 4, 4), + >>> num_channels=(32, 64, 128, 256))) + >>> self = HRNet(extra, in_channels=1) + >>> self.eval() + >>> inputs = torch.rand(1, 1, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 32, 8, 8) + """ + + blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} + + def __init__(self, + extra, + in_channels=3, + conv_cfg=None, + norm_cfg=dict(type='BN'), + norm_eval=False, + with_cp=False, + zero_init_residual=False, + frozen_stages=-1): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self.extra = extra + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.zero_init_residual = zero_init_residual + self.frozen_stages = frozen_stages + + # stem net + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) + self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) + + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + 64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + self.conv_cfg, + 64, + 64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.relu = nn.ReLU(inplace=True) + + self.upsample_cfg = self.extra.get('upsample', { + 'mode': 'nearest', + 'align_corners': None + }) + + # stage 1 + self.stage1_cfg = self.extra['stage1'] + num_channels = self.stage1_cfg['num_channels'][0] + block_type = self.stage1_cfg['block'] + num_blocks = self.stage1_cfg['num_blocks'][0] + + block = self.blocks_dict[block_type] + stage1_out_channels = num_channels * get_expansion(block) + self.layer1 = self._make_layer(block, 64, stage1_out_channels, + num_blocks) + + # stage 2 + self.stage2_cfg = self.extra['stage2'] + num_channels = self.stage2_cfg['num_channels'] + block_type = self.stage2_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [ + channel * get_expansion(block) for channel in num_channels + ] + self.transition1 = self._make_transition_layer([stage1_out_channels], + num_channels) + self.stage2, pre_stage_channels = self._make_stage( + self.stage2_cfg, num_channels) + + # stage 3 + self.stage3_cfg = self.extra['stage3'] + num_channels = self.stage3_cfg['num_channels'] + block_type = self.stage3_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [ + channel * get_expansion(block) for channel in num_channels + ] + self.transition2 = self._make_transition_layer(pre_stage_channels, + num_channels) + self.stage3, pre_stage_channels = self._make_stage( + self.stage3_cfg, num_channels) + + # stage 4 + self.stage4_cfg = self.extra['stage4'] + num_channels = self.stage4_cfg['num_channels'] + block_type = self.stage4_cfg['block'] + + block = self.blocks_dict[block_type] + 
num_channels = [ + channel * get_expansion(block) for channel in num_channels + ] + self.transition3 = self._make_transition_layer(pre_stage_channels, + num_channels) + + self.stage4, pre_stage_channels = self._make_stage( + self.stage4_cfg, + num_channels, + multiscale_output=self.stage4_cfg.get('multiscale_output', False)) + + self._freeze_stages() + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + def _make_transition_layer(self, num_channels_pre_layer, + num_channels_cur_layer): + """Make transition layer.""" + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + num_channels_pre_layer[i], + num_channels_cur_layer[i], + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + num_channels_cur_layer[i])[1], + nn.ReLU(inplace=True))) + else: + transition_layers.append(None) + else: + conv_downsamples = [] + for j in range(i + 1 - num_branches_pre): + in_channels = num_channels_pre_layer[-1] + out_channels = num_channels_cur_layer[i] \ + if j == i - num_branches_pre else in_channels + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, out_channels)[1], + nn.ReLU(inplace=True))) + transition_layers.append(nn.Sequential(*conv_downsamples)) + + return nn.ModuleList(transition_layers) + + def _make_layer(self, block, in_channels, out_channels, blocks, stride=1): + """Make layer.""" + downsample = None + if stride != 1 or in_channels != out_channels: + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, out_channels)[1]) + + layers = [] + layers.append( + block( + in_channels, + out_channels, + stride=stride, + downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + for _ in range(1, blocks): + layers.append( + block( + out_channels, + out_channels, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + + return nn.Sequential(*layers) + + def _make_stage(self, layer_config, in_channels, multiscale_output=True): + """Make stage.""" + num_modules = layer_config['num_modules'] + num_branches = layer_config['num_branches'] + num_blocks = layer_config['num_blocks'] + num_channels = layer_config['num_channels'] + block = self.blocks_dict[layer_config['block']] + + hr_modules = [] + for i in range(num_modules): + # multi_scale_output is only used for the last module + if not multiscale_output and i == num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + + hr_modules.append( + HRModule( + num_branches, + block, + num_blocks, + in_channels, + num_channels, + reset_multiscale_output, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + upsample_cfg=self.upsample_cfg)) + + in_channels = hr_modules[-1].in_channels + + return nn.Sequential(*hr_modules), in_channels + + def 
_freeze_stages(self): + """Freeze parameters.""" + if self.frozen_stages >= 0: + self.norm1.eval() + self.norm2.eval() + + for m in [self.conv1, self.norm1, self.conv2, self.norm2]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + if i == 1: + m = getattr(self, 'layer1') + else: + m = getattr(self, f'stage{i}') + + m.eval() + for param in m.parameters(): + param.requires_grad = False + + if i < 4: + m = getattr(self, f'transition{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + constant_init(m.norm3, 0) + elif isinstance(m, BasicBlock): + constant_init(m.norm2, 0) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Forward function.""" + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.norm2(x) + x = self.relu(x) + x = self.layer1(x) + + x_list = [] + for i in range(self.stage2_cfg['num_branches']): + if self.transition1[i] is not None: + x_list.append(self.transition1[i](x)) + else: + x_list.append(x) + y_list = self.stage2(x_list) + + x_list = [] + for i in range(self.stage3_cfg['num_branches']): + if self.transition2[i] is not None: + x_list.append(self.transition2[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage3(x_list) + + x_list = [] + for i in range(self.stage4_cfg['num_branches']): + if self.transition3[i] is not None: + x_list.append(self.transition3[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage4(x_list) + + return y_list + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/backbones/litehrnet.py b/mmpose/models/backbones/litehrnet.py new file mode 100644 index 0000000..9543688 --- /dev/null +++ b/mmpose/models/backbones/litehrnet.py @@ -0,0 +1,984 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/HRNet/Lite-HRNet +# Original licence: Apache License 2.0. +# ------------------------------------------------------------------------------ + +import mmcv +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, + build_conv_layer, build_norm_layer, constant_init, + normal_init) +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .utils import channel_shuffle, load_checkpoint + + +class SpatialWeighting(nn.Module): + """Spatial weighting module. + + Args: + channels (int): The channels of the module. + ratio (int): channel reduction ratio. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. 
+ norm_cfg (dict): Config dict for normalization layer. + Default: None. + act_cfg (dict): Config dict for activation layer. + Default: (dict(type='ReLU'), dict(type='Sigmoid')). + The last ConvModule uses Sigmoid by default. + """ + + def __init__(self, + channels, + ratio=16, + conv_cfg=None, + norm_cfg=None, + act_cfg=(dict(type='ReLU'), dict(type='Sigmoid'))): + super().__init__() + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert mmcv.is_tuple_of(act_cfg, dict) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.conv1 = ConvModule( + in_channels=channels, + out_channels=int(channels / ratio), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=int(channels / ratio), + out_channels=channels, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x): + out = self.global_avgpool(x) + out = self.conv1(out) + out = self.conv2(out) + return x * out + + +class CrossResolutionWeighting(nn.Module): + """Cross-resolution channel weighting module. + + Args: + channels (int): The channels of the module. + ratio (int): channel reduction ratio. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + act_cfg (dict): Config dict for activation layer. + Default: (dict(type='ReLU'), dict(type='Sigmoid')). + The last ConvModule uses Sigmoid by default. + """ + + def __init__(self, + channels, + ratio=16, + conv_cfg=None, + norm_cfg=None, + act_cfg=(dict(type='ReLU'), dict(type='Sigmoid'))): + super().__init__() + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert mmcv.is_tuple_of(act_cfg, dict) + self.channels = channels + total_channel = sum(channels) + self.conv1 = ConvModule( + in_channels=total_channel, + out_channels=int(total_channel / ratio), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=int(total_channel / ratio), + out_channels=total_channel, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x): + mini_size = x[-1].size()[-2:] + out = [F.adaptive_avg_pool2d(s, mini_size) for s in x[:-1]] + [x[-1]] + out = torch.cat(out, dim=1) + out = self.conv1(out) + out = self.conv2(out) + out = torch.split(out, self.channels, dim=1) + out = [ + s * F.interpolate(a, size=s.size()[-2:], mode='nearest') + for s, a in zip(x, out) + ] + return out + + +class ConditionalChannelWeighting(nn.Module): + """Conditional channel weighting block. + + Args: + in_channels (int): The input channels of the block. + stride (int): Stride of the 3x3 convolution layer. + reduce_ratio (int): channel reduction ratio. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
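+
+ Note:
+ Each input branch is split channel-wise into two halves; one half is
+ passed through cross-resolution weighting, a depthwise 3x3 conv and
+ spatial weighting, then the two halves are concatenated and
+ channel-shuffled.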
+ """ + + def __init__(self, + in_channels, + stride, + reduce_ratio, + conv_cfg=None, + norm_cfg=dict(type='BN'), + with_cp=False): + super().__init__() + self.with_cp = with_cp + self.stride = stride + assert stride in [1, 2] + + branch_channels = [channel // 2 for channel in in_channels] + + self.cross_resolution_weighting = CrossResolutionWeighting( + branch_channels, + ratio=reduce_ratio, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + + self.depthwise_convs = nn.ModuleList([ + ConvModule( + channel, + channel, + kernel_size=3, + stride=self.stride, + padding=1, + groups=channel, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) for channel in branch_channels + ]) + + self.spatial_weighting = nn.ModuleList([ + SpatialWeighting(channels=channel, ratio=4) + for channel in branch_channels + ]) + + def forward(self, x): + + def _inner_forward(x): + x = [s.chunk(2, dim=1) for s in x] + x1 = [s[0] for s in x] + x2 = [s[1] for s in x] + + x2 = self.cross_resolution_weighting(x2) + x2 = [dw(s) for s, dw in zip(x2, self.depthwise_convs)] + x2 = [sw(s) for s, sw in zip(x2, self.spatial_weighting)] + + out = [torch.cat([s1, s2], dim=1) for s1, s2 in zip(x1, x2)] + out = [channel_shuffle(s, 2) for s in out] + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class Stem(nn.Module): + """Stem network block. + + Args: + in_channels (int): The input channels of the block. + stem_channels (int): Output channels of the stem layer. + out_channels (int): The output channels of the block. + expand_ratio (int): adjusts number of channels of the hidden layer + in InvertedResidual by this amount. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
+ """ + + def __init__(self, + in_channels, + stem_channels, + out_channels, + expand_ratio, + conv_cfg=None, + norm_cfg=dict(type='BN'), + with_cp=False): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + + self.conv1 = ConvModule( + in_channels=in_channels, + out_channels=stem_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type='ReLU')) + + mid_channels = int(round(stem_channels * expand_ratio)) + branch_channels = stem_channels // 2 + if stem_channels == self.out_channels: + inc_channels = self.out_channels - branch_channels + else: + inc_channels = self.out_channels - stem_channels + + self.branch1 = nn.Sequential( + ConvModule( + branch_channels, + branch_channels, + kernel_size=3, + stride=2, + padding=1, + groups=branch_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + branch_channels, + inc_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU')), + ) + + self.expand_conv = ConvModule( + branch_channels, + mid_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU')) + self.depthwise_conv = ConvModule( + mid_channels, + mid_channels, + kernel_size=3, + stride=2, + padding=1, + groups=mid_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + self.linear_conv = ConvModule( + mid_channels, + branch_channels + if stem_channels == self.out_channels else stem_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU')) + + def forward(self, x): + + def _inner_forward(x): + x = self.conv1(x) + x1, x2 = x.chunk(2, dim=1) + + x2 = self.expand_conv(x2) + x2 = self.depthwise_conv(x2) + x2 = self.linear_conv(x2) + + out = torch.cat((self.branch1(x1), x2), dim=1) + + out = channel_shuffle(out, 2) + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class IterativeHead(nn.Module): + """Extra iterative head for feature learning. + + Args: + in_channels (int): The input channels of the block. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). 
+ """ + + def __init__(self, in_channels, norm_cfg=dict(type='BN')): + super().__init__() + projects = [] + num_branchs = len(in_channels) + self.in_channels = in_channels[::-1] + + for i in range(num_branchs): + if i != num_branchs - 1: + projects.append( + DepthwiseSeparableConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i + 1], + kernel_size=3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU'), + dw_act_cfg=None, + pw_act_cfg=dict(type='ReLU'))) + else: + projects.append( + DepthwiseSeparableConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i], + kernel_size=3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU'), + dw_act_cfg=None, + pw_act_cfg=dict(type='ReLU'))) + self.projects = nn.ModuleList(projects) + + def forward(self, x): + x = x[::-1] + + y = [] + last_x = None + for i, s in enumerate(x): + if last_x is not None: + last_x = F.interpolate( + last_x, + size=s.size()[-2:], + mode='bilinear', + align_corners=True) + s = s + last_x + s = self.projects[i](s) + y.append(s) + last_x = s + + return y[::-1] + + +class ShuffleUnit(nn.Module): + """InvertedResidual block for ShuffleNetV2 backbone. + + Args: + in_channels (int): The input channels of the block. + out_channels (int): The output channels of the block. + stride (int): Stride of the 3x3 convolution layer. Default: 1 + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
+ """ + + def __init__(self, + in_channels, + out_channels, + stride=1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + super().__init__() + self.stride = stride + self.with_cp = with_cp + + branch_features = out_channels // 2 + if self.stride == 1: + assert in_channels == branch_features * 2, ( + f'in_channels ({in_channels}) should equal to ' + f'branch_features * 2 ({branch_features * 2}) ' + 'when stride is 1') + + if in_channels != branch_features * 2: + assert self.stride != 1, ( + f'stride ({self.stride}) should not equal 1 when ' + f'in_channels != branch_features * 2') + + if self.stride > 1: + self.branch1 = nn.Sequential( + ConvModule( + in_channels, + in_channels, + kernel_size=3, + stride=self.stride, + padding=1, + groups=in_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + in_channels, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ) + + self.branch2 = nn.Sequential( + ConvModule( + in_channels if (self.stride > 1) else branch_features, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + branch_features, + branch_features, + kernel_size=3, + stride=self.stride, + padding=1, + groups=branch_features, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + branch_features, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x): + + def _inner_forward(x): + if self.stride > 1: + out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) + else: + x1, x2 = x.chunk(2, dim=1) + out = torch.cat((x1, self.branch2(x2)), dim=1) + + out = channel_shuffle(out, 2) + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class LiteHRModule(nn.Module): + """High-Resolution Module for LiteHRNet. + + It contains conditional channel weighting blocks and + shuffle blocks. + + + Args: + num_branches (int): Number of branches in the module. + num_blocks (int): Number of blocks in the module. + in_channels (list(int)): Number of input image channels. + reduce_ratio (int): Channel reduction ratio. + module_type (str): 'LITE' or 'NAIVE' + multiscale_output (bool): Whether to output multi-scale features. + with_fuse (bool): Whether to use fuse layers. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. 
+ """ + + def __init__( + self, + num_branches, + num_blocks, + in_channels, + reduce_ratio, + module_type, + multiscale_output=False, + with_fuse=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + with_cp=False, + ): + super().__init__() + self._check_branches(num_branches, in_channels) + + self.in_channels = in_channels + self.num_branches = num_branches + + self.module_type = module_type + self.multiscale_output = multiscale_output + self.with_fuse = with_fuse + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.with_cp = with_cp + + if self.module_type.upper() == 'LITE': + self.layers = self._make_weighting_blocks(num_blocks, reduce_ratio) + elif self.module_type.upper() == 'NAIVE': + self.layers = self._make_naive_branches(num_branches, num_blocks) + else: + raise ValueError("module_type should be either 'LITE' or 'NAIVE'.") + if self.with_fuse: + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU() + + def _check_branches(self, num_branches, in_channels): + """Check input to avoid ValueError.""" + if num_branches != len(in_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_INCHANNELS({len(in_channels)})' + raise ValueError(error_msg) + + def _make_weighting_blocks(self, num_blocks, reduce_ratio, stride=1): + """Make channel weighting blocks.""" + layers = [] + for i in range(num_blocks): + layers.append( + ConditionalChannelWeighting( + self.in_channels, + stride=stride, + reduce_ratio=reduce_ratio, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + with_cp=self.with_cp)) + + return nn.Sequential(*layers) + + def _make_one_branch(self, branch_index, num_blocks, stride=1): + """Make one branch.""" + layers = [] + layers.append( + ShuffleUnit( + self.in_channels[branch_index], + self.in_channels[branch_index], + stride=stride, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type='ReLU'), + with_cp=self.with_cp)) + for i in range(1, num_blocks): + layers.append( + ShuffleUnit( + self.in_channels[branch_index], + self.in_channels[branch_index], + stride=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type='ReLU'), + with_cp=self.with_cp)) + + return nn.Sequential(*layers) + + def _make_naive_branches(self, num_branches, num_blocks): + """Make branches.""" + branches = [] + + for i in range(num_branches): + branches.append(self._make_one_branch(i, num_blocks)) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + """Make fuse layer.""" + if self.num_branches == 1: + return None + + num_branches = self.num_branches + in_channels = self.in_channels + fuse_layers = [] + num_out_branches = num_branches if self.multiscale_output else 1 + for i in range(num_out_branches): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, in_channels[i])[1], + nn.Upsample( + scale_factor=2**(j - i), mode='nearest'))) + elif j == i: + fuse_layer.append(None) + else: + conv_downsamples = [] + for k in range(i - j): + if k == i - j - 1: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + groups=in_channels[j], + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + 
bias=False), + build_norm_layer(self.norm_cfg, + in_channels[i])[1])) + else: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + groups=in_channels[j], + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + nn.ReLU(inplace=True))) + fuse_layer.append(nn.Sequential(*conv_downsamples)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def forward(self, x): + """Forward function.""" + if self.num_branches == 1: + return [self.layers[0](x[0])] + + if self.module_type.upper() == 'LITE': + out = self.layers(x) + elif self.module_type.upper() == 'NAIVE': + for i in range(self.num_branches): + x[i] = self.layers[i](x[i]) + out = x + + if self.with_fuse: + out_fuse = [] + for i in range(len(self.fuse_layers)): + # `y = 0` will lead to decreased accuracy (0.5~1 mAP) + y = out[0] if i == 0 else self.fuse_layers[i][0](out[0]) + for j in range(self.num_branches): + if i == j: + y += out[j] + else: + y += self.fuse_layers[i][j](out[j]) + out_fuse.append(self.relu(y)) + out = out_fuse + if not self.multiscale_output: + out = [out[0]] + return out + + +@BACKBONES.register_module() +class LiteHRNet(nn.Module): + """Lite-HRNet backbone. + + `Lite-HRNet: A Lightweight High-Resolution Network + `_. + + Code adapted from 'https://github.com/HRNet/Lite-HRNet'. + + Args: + extra (dict): detailed configuration for each stage of HRNet. + in_channels (int): Number of input image channels. Default: 3. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + + Example: + >>> from mmpose.models import LiteHRNet + >>> import torch + >>> extra=dict( + >>> stem=dict(stem_channels=32, out_channels=32, expand_ratio=1), + >>> num_stages=3, + >>> stages_spec=dict( + >>> num_modules=(2, 4, 2), + >>> num_branches=(2, 3, 4), + >>> num_blocks=(2, 2, 2), + >>> module_type=('LITE', 'LITE', 'LITE'), + >>> with_fuse=(True, True, True), + >>> reduce_ratios=(8, 8, 8), + >>> num_channels=( + >>> (40, 80), + >>> (40, 80, 160), + >>> (40, 80, 160, 320), + >>> )), + >>> with_head=False) + >>> self = LiteHRNet(extra, in_channels=1) + >>> self.eval() + >>> inputs = torch.rand(1, 1, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 40, 8, 8) + """ + + def __init__(self, + extra, + in_channels=3, + conv_cfg=None, + norm_cfg=dict(type='BN'), + norm_eval=False, + with_cp=False): + super().__init__() + self.extra = extra + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.stem = Stem( + in_channels, + stem_channels=self.extra['stem']['stem_channels'], + out_channels=self.extra['stem']['out_channels'], + expand_ratio=self.extra['stem']['expand_ratio'], + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + + self.num_stages = self.extra['num_stages'] + self.stages_spec = self.extra['stages_spec'] + + num_channels_last = [ + self.stem.out_channels, + ] + for i in range(self.num_stages): + num_channels = self.stages_spec['num_channels'][i] + num_channels = [num_channels[i] for i in range(len(num_channels))] + setattr( + self, f'transition{i}', + self._make_transition_layer(num_channels_last, num_channels)) + + stage, num_channels_last = self._make_stage( + self.stages_spec, i, num_channels, multiscale_output=True) + setattr(self, f'stage{i}', stage) + + self.with_head = self.extra['with_head'] + if self.with_head: + self.head_layer = IterativeHead( + in_channels=num_channels_last, + norm_cfg=self.norm_cfg, + ) + + def _make_transition_layer(self, num_channels_pre_layer, + num_channels_cur_layer): + """Make transition layer.""" + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + num_channels_pre_layer[i], + num_channels_pre_layer[i], + kernel_size=3, + stride=1, + padding=1, + groups=num_channels_pre_layer[i], + bias=False), + build_norm_layer(self.norm_cfg, + num_channels_pre_layer[i])[1], + build_conv_layer( + self.conv_cfg, + num_channels_pre_layer[i], + num_channels_cur_layer[i], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, + num_channels_cur_layer[i])[1], + nn.ReLU())) + else: + transition_layers.append(None) + else: + conv_downsamples = [] + for j in range(i + 1 - num_branches_pre): + in_channels = num_channels_pre_layer[-1] + out_channels = num_channels_cur_layer[i] \ + if j == i - num_branches_pre else in_channels + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + in_channels, + kernel_size=3, + stride=2, + padding=1, + groups=in_channels, + bias=False), + build_norm_layer(self.norm_cfg, in_channels)[1], + build_conv_layer( + self.conv_cfg, + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, out_channels)[1], + nn.ReLU())) + transition_layers.append(nn.Sequential(*conv_downsamples)) + + return nn.ModuleList(transition_layers) + + def _make_stage(self, + stages_spec, + stage_index, + in_channels, + multiscale_output=True): + num_modules = stages_spec['num_modules'][stage_index] + num_branches = stages_spec['num_branches'][stage_index] + num_blocks = stages_spec['num_blocks'][stage_index] + reduce_ratio = stages_spec['reduce_ratios'][stage_index] + with_fuse = stages_spec['with_fuse'][stage_index] + module_type = stages_spec['module_type'][stage_index] + + modules = [] + for i in range(num_modules): + # multi_scale_output is only used last module + if not multiscale_output and i == 
num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + + modules.append( + LiteHRModule( + num_branches, + num_blocks, + in_channels, + reduce_ratio, + module_type, + multiscale_output=reset_multiscale_output, + with_fuse=with_fuse, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + with_cp=self.with_cp)) + in_channels = modules[-1].in_channels + + return nn.Sequential(*modules), in_channels + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Forward function.""" + x = self.stem(x) + + y_list = [x] + for i in range(self.num_stages): + x_list = [] + transition = getattr(self, f'transition{i}') + for j in range(self.stages_spec['num_branches'][i]): + if transition[j]: + if j >= len(y_list): + x_list.append(transition[j](y_list[-1])) + else: + x_list.append(transition[j](y_list[j])) + else: + x_list.append(y_list[j]) + y_list = getattr(self, f'stage{i}')(x_list) + + x = y_list + if self.with_head: + x = self.head_layer(x) + + return [x[0]] + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/backbones/mobilenet_v2.py b/mmpose/models/backbones/mobilenet_v2.py new file mode 100644 index 0000000..5dc0cd1 --- /dev/null +++ b/mmpose/models/backbones/mobilenet_v2.py @@ -0,0 +1,275 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule, constant_init, kaiming_init +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import load_checkpoint, make_divisible + + +class InvertedResidual(nn.Module): + """InvertedResidual block for MobileNetV2. + + Args: + in_channels (int): The input channels of the InvertedResidual block. + out_channels (int): The output channels of the InvertedResidual block. + stride (int): Stride of the middle (first) 3x3 convolution. + expand_ratio (int): adjusts number of channels of the hidden layer + in InvertedResidual by this amount. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU6'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ + + def __init__(self, + in_channels, + out_channels, + stride, + expand_ratio, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU6'), + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.stride = stride + assert stride in [1, 2], f'stride must in [1, 2]. ' \ + f'But received {stride}.' 
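+        # The block follows the MobileNetV2 design: an optional 1x1 expansion
+        # conv (skipped when expand_ratio == 1), a 3x3 depthwise conv carrying
+        # the stride, and a linear 1x1 projection. A residual connection is
+        # used only when stride == 1 and the input and output channel counts
+        # match, as set up below.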
+ self.with_cp = with_cp + self.use_res_connect = self.stride == 1 and in_channels == out_channels + hidden_dim = int(round(in_channels * expand_ratio)) + + layers = [] + if expand_ratio != 1: + layers.append( + ConvModule( + in_channels=in_channels, + out_channels=hidden_dim, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + layers.extend([ + ConvModule( + in_channels=hidden_dim, + out_channels=hidden_dim, + kernel_size=3, + stride=stride, + padding=1, + groups=hidden_dim, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + in_channels=hidden_dim, + out_channels=out_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + ]) + self.conv = nn.Sequential(*layers) + + def forward(self, x): + + def _inner_forward(x): + if self.use_res_connect: + return x + self.conv(x) + return self.conv(x) + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +@BACKBONES.register_module() +class MobileNetV2(BaseBackbone): + """MobileNetV2 backbone. + + Args: + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Default: 1.0. + out_indices (None or Sequence[int]): Output from which stages. + Default: (7, ). + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU6'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ + + # Parameters to build layers. 4 parameters are needed to construct a + # layer, from left to right: expand_ratio, channel, num_blocks, stride. + arch_settings = [[1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2], + [6, 64, 4, 2], [6, 96, 3, 1], [6, 160, 3, 2], + [6, 320, 1, 1]] + + def __init__(self, + widen_factor=1., + out_indices=(7, ), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU6'), + norm_eval=False, + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.widen_factor = widen_factor + self.out_indices = out_indices + for index in out_indices: + if index not in range(0, 8): + raise ValueError('the item in out_indices must in ' + f'range(0, 8). But received {index}') + + if frozen_stages not in range(-1, 8): + raise ValueError('frozen_stages must be in range(-1, 8). 
' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.in_channels = make_divisible(32 * widen_factor, 8) + + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.layers = [] + + for i, layer_cfg in enumerate(self.arch_settings): + expand_ratio, channel, num_blocks, stride = layer_cfg + out_channels = make_divisible(channel * widen_factor, 8) + inverted_res_layer = self.make_layer( + out_channels=out_channels, + num_blocks=num_blocks, + stride=stride, + expand_ratio=expand_ratio) + layer_name = f'layer{i + 1}' + self.add_module(layer_name, inverted_res_layer) + self.layers.append(layer_name) + + if widen_factor > 1.0: + self.out_channel = int(1280 * widen_factor) + else: + self.out_channel = 1280 + + layer = ConvModule( + in_channels=self.in_channels, + out_channels=self.out_channel, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.add_module('conv2', layer) + self.layers.append('conv2') + + def make_layer(self, out_channels, num_blocks, stride, expand_ratio): + """Stack InvertedResidual blocks to build a layer for MobileNetV2. + + Args: + out_channels (int): out_channels of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Default: 1 + expand_ratio (int): Expand the number of channels of the + hidden layer in InvertedResidual by this ratio. Default: 6. + """ + layers = [] + for i in range(num_blocks): + if i >= 1: + stride = 1 + layers.append( + InvertedResidual( + self.in_channels, + out_channels, + stride, + expand_ratio=expand_ratio, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + x = self.conv1(x) + + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(1, self.frozen_stages + 1): + layer = getattr(self, f'layer{i}') + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/backbones/mobilenet_v3.py b/mmpose/models/backbones/mobilenet_v3.py new file mode 100644 index 0000000..d640abe --- /dev/null +++ b/mmpose/models/backbones/mobilenet_v3.py @@ -0,0 +1,188 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
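+# A minimal usage sketch for the backbone defined below (the input size and
+# the resulting shape assume arch='small' and a random 224x224 image):
+#
+#     >>> import torch
+#     >>> from mmpose.models import MobileNetV3
+#     >>> net = MobileNetV3(arch='small', out_indices=(-1, ))
+#     >>> net.init_weights()
+#     >>> feat = net(torch.rand(1, 3, 224, 224))
+#     >>> tuple(feat.shape)
+#     (1, 96, 7, 7)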
+import copy +import logging + +import torch.nn as nn +from mmcv.cnn import ConvModule, constant_init, kaiming_init +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import InvertedResidual, load_checkpoint + + +@BACKBONES.register_module() +class MobileNetV3(BaseBackbone): + """MobileNetV3 backbone. + + Args: + arch (str): Architecture of mobilnetv3, from {small, big}. + Default: small. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + out_indices (None or Sequence[int]): Output from which stages. + Default: (-1, ), which means output tensors from final stage. + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save + some memory while slowing down the training speed. + Default: False. + """ + # Parameters to build each block: + # [kernel size, mid channels, out channels, with_se, act type, stride] + arch_settings = { + 'small': [[3, 16, 16, True, 'ReLU', 2], + [3, 72, 24, False, 'ReLU', 2], + [3, 88, 24, False, 'ReLU', 1], + [5, 96, 40, True, 'HSwish', 2], + [5, 240, 40, True, 'HSwish', 1], + [5, 240, 40, True, 'HSwish', 1], + [5, 120, 48, True, 'HSwish', 1], + [5, 144, 48, True, 'HSwish', 1], + [5, 288, 96, True, 'HSwish', 2], + [5, 576, 96, True, 'HSwish', 1], + [5, 576, 96, True, 'HSwish', 1]], + 'big': [[3, 16, 16, False, 'ReLU', 1], + [3, 64, 24, False, 'ReLU', 2], + [3, 72, 24, False, 'ReLU', 1], + [5, 72, 40, True, 'ReLU', 2], + [5, 120, 40, True, 'ReLU', 1], + [5, 120, 40, True, 'ReLU', 1], + [3, 240, 80, False, 'HSwish', 2], + [3, 200, 80, False, 'HSwish', 1], + [3, 184, 80, False, 'HSwish', 1], + [3, 184, 80, False, 'HSwish', 1], + [3, 480, 112, True, 'HSwish', 1], + [3, 672, 112, True, 'HSwish', 1], + [5, 672, 160, True, 'HSwish', 1], + [5, 672, 160, True, 'HSwish', 2], + [5, 960, 160, True, 'HSwish', 1]] + } # yapf: disable + + def __init__(self, + arch='small', + conv_cfg=None, + norm_cfg=dict(type='BN'), + out_indices=(-1, ), + frozen_stages=-1, + norm_eval=False, + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + assert arch in self.arch_settings + for index in out_indices: + if index not in range(-len(self.arch_settings[arch]), + len(self.arch_settings[arch])): + raise ValueError('the item in out_indices must in ' + f'range(0, {len(self.arch_settings[arch])}). ' + f'But received {index}') + + if frozen_stages not in range(-1, len(self.arch_settings[arch])): + raise ValueError('frozen_stages must be in range(-1, ' + f'{len(self.arch_settings[arch])}). 
' + f'But received {frozen_stages}') + self.arch = arch + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.in_channels = 16 + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=dict(type='HSwish')) + + self.layers = self._make_layer() + self.feat_dim = self.arch_settings[arch][-1][2] + + def _make_layer(self): + layers = [] + layer_setting = self.arch_settings[self.arch] + for i, params in enumerate(layer_setting): + (kernel_size, mid_channels, out_channels, with_se, act, + stride) = params + if with_se: + se_cfg = dict( + channels=mid_channels, + ratio=4, + act_cfg=(dict(type='ReLU'), dict(type='HSigmoid'))) + else: + se_cfg = None + + layer = InvertedResidual( + in_channels=self.in_channels, + out_channels=out_channels, + mid_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + se_cfg=se_cfg, + with_expand_conv=True, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type=act), + with_cp=self.with_cp) + self.in_channels = out_channels + layer_name = f'layer{i + 1}' + self.add_module(layer_name, layer) + layers.append(layer_name) + return layers + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + x = self.conv1(x) + + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices or \ + i - len(self.layers) in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(1, self.frozen_stages + 1): + layer = getattr(self, f'layer{i}') + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/backbones/mspn.py b/mmpose/models/backbones/mspn.py new file mode 100644 index 0000000..71cee34 --- /dev/null +++ b/mmpose/models/backbones/mspn.py @@ -0,0 +1,513 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp +from collections import OrderedDict + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import (ConvModule, MaxPool2d, constant_init, kaiming_init, + normal_init) +from mmcv.runner.checkpoint import load_state_dict + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .resnet import Bottleneck as _Bottleneck +from .utils.utils import get_state_dict + + +class Bottleneck(_Bottleneck): + expansion = 4 + """Bottleneck block for MSPN. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + stride (int): stride of the block. Default: 1 + downsample (nn.Module): downsample operation on identity branch. 
+ Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + """ + + def __init__(self, in_channels, out_channels, **kwargs): + super().__init__(in_channels, out_channels * 4, **kwargs) + + +class DownsampleModule(nn.Module): + """Downsample module for MSPN. + + Args: + block (nn.Module): Downsample block. + num_blocks (list): Number of blocks in each downsample unit. + num_units (int): Numbers of downsample units. Default: 4 + has_skip (bool): Have skip connections from prior upsample + module or not. Default:False + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + in_channels (int): Number of channels of the input feature to + downsample module. Default: 64 + """ + + def __init__(self, + block, + num_blocks, + num_units=4, + has_skip=False, + norm_cfg=dict(type='BN'), + in_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.has_skip = has_skip + self.in_channels = in_channels + assert len(num_blocks) == num_units + self.num_blocks = num_blocks + self.num_units = num_units + self.norm_cfg = norm_cfg + self.layer1 = self._make_layer(block, in_channels, num_blocks[0]) + for i in range(1, num_units): + module_name = f'layer{i + 1}' + self.add_module( + module_name, + self._make_layer( + block, in_channels * pow(2, i), num_blocks[i], stride=2)) + + def _make_layer(self, block, out_channels, blocks, stride=1): + downsample = None + if stride != 1 or self.in_channels != out_channels * block.expansion: + downsample = ConvModule( + self.in_channels, + out_channels * block.expansion, + kernel_size=1, + stride=stride, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + + units = list() + units.append( + block( + self.in_channels, + out_channels, + stride=stride, + downsample=downsample, + norm_cfg=self.norm_cfg)) + self.in_channels = out_channels * block.expansion + for _ in range(1, blocks): + units.append(block(self.in_channels, out_channels)) + + return nn.Sequential(*units) + + def forward(self, x, skip1, skip2): + out = list() + for i in range(self.num_units): + module_name = f'layer{i + 1}' + module_i = getattr(self, module_name) + x = module_i(x) + if self.has_skip: + x = x + skip1[i] + skip2[i] + out.append(x) + out.reverse() + + return tuple(out) + + +class UpsampleUnit(nn.Module): + """Upsample unit for upsample module. + + Args: + ind (int): Indicates whether to interpolate (>0) and whether to + generate feature map for the next hourglass-like module. + num_units (int): Number of units that form a upsample module. Along + with ind and gen_cross_conv, nm_units is used to decide whether + to generate feature map for the next hourglass-like module. + in_channels (int): Channel number of the skip-in feature maps from + the corresponding downsample unit. + unit_channels (int): Channel number in this unit. Default:256. + gen_skip: (bool): Whether or not to generate skips for the posterior + downsample module. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + out_channels (int): Number of channels of feature output by upsample + module. Must equal to in_channels of downsample module. 
Default:64 + """ + + def __init__(self, + ind, + num_units, + in_channels, + unit_channels=256, + gen_skip=False, + gen_cross_conv=False, + norm_cfg=dict(type='BN'), + out_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.num_units = num_units + self.norm_cfg = norm_cfg + self.in_skip = ConvModule( + in_channels, + unit_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + self.relu = nn.ReLU(inplace=True) + + self.ind = ind + if self.ind > 0: + self.up_conv = ConvModule( + unit_channels, + unit_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + + self.gen_skip = gen_skip + if self.gen_skip: + self.out_skip1 = ConvModule( + in_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + self.out_skip2 = ConvModule( + unit_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + self.gen_cross_conv = gen_cross_conv + if self.ind == num_units - 1 and self.gen_cross_conv: + self.cross_conv = ConvModule( + unit_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + def forward(self, x, up_x): + out = self.in_skip(x) + + if self.ind > 0: + up_x = F.interpolate( + up_x, + size=(x.size(2), x.size(3)), + mode='bilinear', + align_corners=True) + up_x = self.up_conv(up_x) + out = out + up_x + out = self.relu(out) + + skip1 = None + skip2 = None + if self.gen_skip: + skip1 = self.out_skip1(x) + skip2 = self.out_skip2(out) + + cross_conv = None + if self.ind == self.num_units - 1 and self.gen_cross_conv: + cross_conv = self.cross_conv(out) + + return out, skip1, skip2, cross_conv + + +class UpsampleModule(nn.Module): + """Upsample module for MSPN. + + Args: + unit_channels (int): Channel number in the upsample units. + Default:256. + num_units (int): Numbers of upsample units. Default: 4 + gen_skip (bool): Whether to generate skip for posterior downsample + module or not. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + out_channels (int): Number of channels of feature output by upsample + module. Must equal to in_channels of downsample module. 
Default:64 + """ + + def __init__(self, + unit_channels=256, + num_units=4, + gen_skip=False, + gen_cross_conv=False, + norm_cfg=dict(type='BN'), + out_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.in_channels = list() + for i in range(num_units): + self.in_channels.append(Bottleneck.expansion * out_channels * + pow(2, i)) + self.in_channels.reverse() + self.num_units = num_units + self.gen_skip = gen_skip + self.gen_cross_conv = gen_cross_conv + self.norm_cfg = norm_cfg + for i in range(num_units): + module_name = f'up{i + 1}' + self.add_module( + module_name, + UpsampleUnit( + i, + self.num_units, + self.in_channels[i], + unit_channels, + self.gen_skip, + self.gen_cross_conv, + norm_cfg=self.norm_cfg, + out_channels=64)) + + def forward(self, x): + out = list() + skip1 = list() + skip2 = list() + cross_conv = None + for i in range(self.num_units): + module_i = getattr(self, f'up{i + 1}') + if i == 0: + outi, skip1_i, skip2_i, _ = module_i(x[i], None) + elif i == self.num_units - 1: + outi, skip1_i, skip2_i, cross_conv = module_i(x[i], out[i - 1]) + else: + outi, skip1_i, skip2_i, _ = module_i(x[i], out[i - 1]) + out.append(outi) + skip1.append(skip1_i) + skip2.append(skip2_i) + skip1.reverse() + skip2.reverse() + + return out, skip1, skip2, cross_conv + + +class SingleStageNetwork(nn.Module): + """Single_stage Network. + + Args: + unit_channels (int): Channel number in the upsample units. Default:256. + num_units (int): Numbers of downsample/upsample units. Default: 4 + gen_skip (bool): Whether to generate skip for posterior downsample + module or not. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + has_skip (bool): Have skip connections from prior upsample + module or not. Default:False + num_blocks (list): Number of blocks in each downsample unit. + Default: [2, 2, 2, 2] Note: Make sure num_units==len(num_blocks) + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + in_channels (int): Number of channels of the feature from ResNetTop. + Default: 64. + """ + + def __init__(self, + has_skip=False, + gen_skip=False, + gen_cross_conv=False, + unit_channels=256, + num_units=4, + num_blocks=[2, 2, 2, 2], + norm_cfg=dict(type='BN'), + in_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + num_blocks = cp.deepcopy(num_blocks) + super().__init__() + assert len(num_blocks) == num_units + self.has_skip = has_skip + self.gen_skip = gen_skip + self.gen_cross_conv = gen_cross_conv + self.num_units = num_units + self.unit_channels = unit_channels + self.num_blocks = num_blocks + self.norm_cfg = norm_cfg + + self.downsample = DownsampleModule(Bottleneck, num_blocks, num_units, + has_skip, norm_cfg, in_channels) + self.upsample = UpsampleModule(unit_channels, num_units, gen_skip, + gen_cross_conv, norm_cfg, in_channels) + + def forward(self, x, skip1, skip2): + mid = self.downsample(x, skip1, skip2) + out, skip1, skip2, cross_conv = self.upsample(mid) + + return out, skip1, skip2, cross_conv + + +class ResNetTop(nn.Module): + """ResNet top for MSPN. + + Args: + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + channels (int): Number of channels of the feature output by ResNetTop. 
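+        Default: 64.
+
+    Example (a minimal sketch with a random image; the input size is
+        illustrative):
+        >>> import torch
+        >>> top = ResNetTop()
+        >>> feat = top(torch.rand(1, 3, 256, 256))
+        >>> tuple(feat.shape)  # downsampled by 4, `channels` feature maps
+        (1, 64, 64, 64)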
+ """ + + def __init__(self, norm_cfg=dict(type='BN'), channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.top = nn.Sequential( + ConvModule( + 3, + channels, + kernel_size=7, + stride=2, + padding=3, + norm_cfg=norm_cfg, + inplace=True), MaxPool2d(kernel_size=3, stride=2, padding=1)) + + def forward(self, img): + return self.top(img) + + +@BACKBONES.register_module() +class MSPN(BaseBackbone): + """MSPN backbone. Paper ref: Li et al. "Rethinking on Multi-Stage Networks + for Human Pose Estimation" (CVPR 2020). + + Args: + unit_channels (int): Number of Channels in an upsample unit. + Default: 256 + num_stages (int): Number of stages in a multi-stage MSPN. Default: 4 + num_units (int): Number of downsample/upsample units in a single-stage + network. Default: 4 + Note: Make sure num_units == len(self.num_blocks) + num_blocks (list): Number of bottlenecks in each + downsample unit. Default: [2, 2, 2, 2] + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + res_top_channels (int): Number of channels of feature from ResNetTop. + Default: 64. + + Example: + >>> from mmpose.models import MSPN + >>> import torch + >>> self = MSPN(num_stages=2,num_units=2,num_blocks=[2,2]) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 511, 511) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... for feature in level_output: + ... print(tuple(feature.shape)) + ... + (1, 256, 64, 64) + (1, 256, 128, 128) + (1, 256, 64, 64) + (1, 256, 128, 128) + """ + + def __init__(self, + unit_channels=256, + num_stages=4, + num_units=4, + num_blocks=[2, 2, 2, 2], + norm_cfg=dict(type='BN'), + res_top_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + num_blocks = cp.deepcopy(num_blocks) + super().__init__() + self.unit_channels = unit_channels + self.num_stages = num_stages + self.num_units = num_units + self.num_blocks = num_blocks + self.norm_cfg = norm_cfg + + assert self.num_stages > 0 + assert self.num_units > 1 + assert self.num_units == len(self.num_blocks) + self.top = ResNetTop(norm_cfg=norm_cfg) + self.multi_stage_mspn = nn.ModuleList([]) + for i in range(self.num_stages): + if i == 0: + has_skip = False + else: + has_skip = True + if i != self.num_stages - 1: + gen_skip = True + gen_cross_conv = True + else: + gen_skip = False + gen_cross_conv = False + self.multi_stage_mspn.append( + SingleStageNetwork(has_skip, gen_skip, gen_cross_conv, + unit_channels, num_units, num_blocks, + norm_cfg, res_top_channels)) + + def forward(self, x): + """Model forward function.""" + out_feats = [] + skip1 = None + skip2 = None + x = self.top(x) + for i in range(self.num_stages): + out, skip1, skip2, x = self.multi_stage_mspn[i](x, skip1, skip2) + out_feats.append(out) + + return out_feats + + def init_weights(self, pretrained=None): + """Initialize model weights.""" + if isinstance(pretrained, str): + logger = get_root_logger() + state_dict_tmp = get_state_dict(pretrained) + state_dict = OrderedDict() + state_dict['top'] = OrderedDict() + state_dict['bottlenecks'] = OrderedDict() + for k, v in state_dict_tmp.items(): + if k.startswith('layer'): + if 'downsample.0' in k: + state_dict['bottlenecks'][k.replace( + 'downsample.0', 'downsample.conv')] = v + elif 'downsample.1' in k: + state_dict['bottlenecks'][k.replace( + 'downsample.1', 'downsample.bn')] = v + else: + state_dict['bottlenecks'][k] = v + elif k.startswith('conv1'): + 
state_dict['top'][k.replace('conv1', 'top.0.conv')] = v + elif k.startswith('bn1'): + state_dict['top'][k.replace('bn1', 'top.0.bn')] = v + + load_state_dict( + self.top, state_dict['top'], strict=False, logger=logger) + for i in range(self.num_stages): + load_state_dict( + self.multi_stage_mspn[i].downsample, + state_dict['bottlenecks'], + strict=False, + logger=logger) + else: + for m in self.multi_stage_mspn.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + elif isinstance(m, nn.Linear): + normal_init(m, std=0.01) + + for m in self.top.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) diff --git a/mmpose/models/backbones/regnet.py b/mmpose/models/backbones/regnet.py new file mode 100644 index 0000000..693417c --- /dev/null +++ b/mmpose/models/backbones/regnet.py @@ -0,0 +1,317 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import numpy as np +import torch.nn as nn +from mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from .resnet import ResNet +from .resnext import Bottleneck + + +@BACKBONES.register_module() +class RegNet(ResNet): + """RegNet backbone. + + More details can be found in `paper `__ . + + Args: + arch (dict): The parameter of RegNets. + - w0 (int): initial width + - wa (float): slope of width + - wm (float): quantization parameter to quantize the width + - depth (int): depth of the backbone + - group_w (int): width of group + - bot_mul (float): bottleneck ratio, i.e. expansion of bottleneck. + strides (Sequence[int]): Strides of the first block of each stage. + base_channels (int): Base channels after stem layer. + in_channels (int): Number of input image channels. Default: 3. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. Default: "pytorch". + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. Default: -1. + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + + Example: + >>> from mmpose.models import RegNet + >>> import torch + >>> self = RegNet( + arch=dict( + w0=88, + wa=26.31, + wm=2.25, + group_w=48, + depth=25, + bot_mul=1.0), + out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 96, 8, 8) + (1, 192, 4, 4) + (1, 432, 2, 2) + (1, 1008, 1, 1) + """ + arch_settings = { + 'regnetx_400mf': + dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0), + 'regnetx_800mf': + dict(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16, bot_mul=1.0), + 'regnetx_1.6gf': + dict(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18, bot_mul=1.0), + 'regnetx_3.2gf': + dict(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25, bot_mul=1.0), + 'regnetx_4.0gf': + dict(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23, bot_mul=1.0), + 'regnetx_6.4gf': + dict(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17, bot_mul=1.0), + 'regnetx_8.0gf': + dict(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23, bot_mul=1.0), + 'regnetx_12gf': + dict(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, bot_mul=1.0), + } + + def __init__(self, + arch, + in_channels=3, + stem_channels=32, + base_channels=32, + strides=(2, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(3, ), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + with_cp=False, + zero_init_residual=True): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super(ResNet, self).__init__() + + # Generate RegNet parameters first + if isinstance(arch, str): + assert arch in self.arch_settings, \ + f'"arch": "{arch}" is not one of the' \ + ' arch_settings' + arch = self.arch_settings[arch] + elif not isinstance(arch, dict): + raise TypeError('Expect "arch" to be either a string ' + f'or a dict, got {type(arch)}') + + widths, num_stages = self.generate_regnet( + arch['w0'], + arch['wa'], + arch['wm'], + arch['depth'], + ) + # Convert to per stage format + stage_widths, stage_blocks = self.get_stages_from_blocks(widths) + # Generate group widths and bot muls + group_widths = [arch['group_w'] for _ in range(num_stages)] + self.bottleneck_ratio = [arch['bot_mul'] for _ in range(num_stages)] + # Adjust the compatibility of stage_widths and group_widths + stage_widths, group_widths = self.adjust_width_group( + stage_widths, self.bottleneck_ratio, group_widths) + + # Group params by stage + self.stage_widths = stage_widths + self.group_widths = group_widths + self.depth = sum(stage_blocks) + self.stem_channels = stem_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + if self.deep_stem: + raise NotImplementedError( + 'deep_stem has not been implemented for RegNet') + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.zero_init_residual = zero_init_residual + self.stage_blocks = stage_blocks[:num_stages] + + self._make_stem_layer(in_channels, stem_channels) + + _in_channels = stem_channels + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = self.strides[i] + dilation = self.dilations[i] + group_width = self.group_widths[i] + width = int(round(self.stage_widths[i] * self.bottleneck_ratio[i])) + stage_groups = width // group_width + + res_layer = self.make_res_layer( + block=Bottleneck, + num_blocks=num_blocks, + in_channels=_in_channels, + 
out_channels=self.stage_widths[i], + expansion=1, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=self.with_cp, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + base_channels=self.stage_widths[i], + groups=stage_groups, + width_per_group=group_width) + _in_channels = self.stage_widths[i] + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = stage_widths[-1] + + def _make_stem_layer(self, in_channels, base_channels): + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + base_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, base_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + + @staticmethod + def generate_regnet(initial_width, + width_slope, + width_parameter, + depth, + divisor=8): + """Generates per block width from RegNet parameters. + + Args: + initial_width ([int]): Initial width of the backbone + width_slope ([float]): Slope of the quantized linear function + width_parameter ([int]): Parameter used to quantize the width. + depth ([int]): Depth of the backbone. + divisor (int, optional): The divisor of channels. Defaults to 8. + + Returns: + list, int: return a list of widths of each stage and the number of + stages + """ + assert width_slope >= 0 + assert initial_width > 0 + assert width_parameter > 1 + assert initial_width % divisor == 0 + widths_cont = np.arange(depth) * width_slope + initial_width + ks = np.round( + np.log(widths_cont / initial_width) / np.log(width_parameter)) + widths = initial_width * np.power(width_parameter, ks) + widths = np.round(np.divide(widths, divisor)) * divisor + num_stages = len(np.unique(widths)) + widths, widths_cont = widths.astype(int).tolist(), widths_cont.tolist() + return widths, num_stages + + @staticmethod + def quantize_float(number, divisor): + """Converts a float to closest non-zero int divisible by divior. + + Args: + number (int): Original number to be quantized. + divisor (int): Divisor used to quantize the number. + + Returns: + int: quantized number that is divisible by devisor. + """ + return int(round(number / divisor) * divisor) + + def adjust_width_group(self, widths, bottleneck_ratio, groups): + """Adjusts the compatibility of widths and groups. + + Args: + widths (list[int]): Width of each stage. + bottleneck_ratio (float): Bottleneck ratio. + groups (int): number of groups in each stage + + Returns: + tuple(list): The adjusted widths and groups of each stage. + """ + bottleneck_width = [ + int(w * b) for w, b in zip(widths, bottleneck_ratio) + ] + groups = [min(g, w_bot) for g, w_bot in zip(groups, bottleneck_width)] + bottleneck_width = [ + self.quantize_float(w_bot, g) + for w_bot, g in zip(bottleneck_width, groups) + ] + widths = [ + int(w_bot / b) + for w_bot, b in zip(bottleneck_width, bottleneck_ratio) + ] + return widths, groups + + def get_stages_from_blocks(self, widths): + """Gets widths/stage_blocks of network at each stage. + + Args: + widths (list[int]): Width in each stage. 
+ + Returns: + tuple(list): width and depth of each stage + """ + width_diff = [ + width != width_prev + for width, width_prev in zip(widths + [0], [0] + widths) + ] + stage_widths = [ + width for width, diff in zip(widths, width_diff[:-1]) if diff + ] + stage_blocks = np.diff([ + depth for depth, diff in zip(range(len(width_diff)), width_diff) + if diff + ]).tolist() + return stage_widths, stage_blocks + + def forward(self, x): + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + return tuple(outs) diff --git a/mmpose/models/backbones/resnest.py b/mmpose/models/backbones/resnest.py new file mode 100644 index 0000000..0a2d408 --- /dev/null +++ b/mmpose/models/backbones/resnest.py @@ -0,0 +1,338 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResLayer, ResNetV1d + + +class RSoftmax(nn.Module): + """Radix Softmax module in ``SplitAttentionConv2d``. + + Args: + radix (int): Radix of input. + groups (int): Groups of input. + """ + + def __init__(self, radix, groups): + super().__init__() + self.radix = radix + self.groups = groups + + def forward(self, x): + batch = x.size(0) + if self.radix > 1: + x = x.view(batch, self.groups, self.radix, -1).transpose(1, 2) + x = F.softmax(x, dim=1) + x = x.reshape(batch, -1) + else: + x = torch.sigmoid(x) + return x + + +class SplitAttentionConv2d(nn.Module): + """Split-Attention Conv2d. + + Args: + in_channels (int): Same as nn.Conv2d. + out_channels (int): Same as nn.Conv2d. + kernel_size (int | tuple[int]): Same as nn.Conv2d. + stride (int | tuple[int]): Same as nn.Conv2d. + padding (int | tuple[int]): Same as nn.Conv2d. + dilation (int | tuple[int]): Same as nn.Conv2d. + groups (int): Same as nn.Conv2d. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of SplitAttentionConv2d. + Default: 4. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. Default: None. 
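+
+    Example (a minimal sketch with a random input; sizes are illustrative):
+        >>> import torch
+        >>> conv = SplitAttentionConv2d(
+        ...     64, 64, kernel_size=3, stride=1, padding=1, radix=2)
+        >>> out = conv(torch.rand(1, 64, 56, 56))
+        >>> tuple(out.shape)
+        (1, 64, 56, 56)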
+ """ + + def __init__(self, + in_channels, + channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + radix=2, + reduction_factor=4, + conv_cfg=None, + norm_cfg=dict(type='BN')): + super().__init__() + inter_channels = max(in_channels * radix // reduction_factor, 32) + self.radix = radix + self.groups = groups + self.channels = channels + self.conv = build_conv_layer( + conv_cfg, + in_channels, + channels * radix, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups * radix, + bias=False) + self.norm0_name, norm0 = build_norm_layer( + norm_cfg, channels * radix, postfix=0) + self.add_module(self.norm0_name, norm0) + self.relu = nn.ReLU(inplace=True) + self.fc1 = build_conv_layer( + None, channels, inter_channels, 1, groups=self.groups) + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, inter_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.fc2 = build_conv_layer( + None, inter_channels, channels * radix, 1, groups=self.groups) + self.rsoftmax = RSoftmax(radix, groups) + + @property + def norm0(self): + return getattr(self, self.norm0_name) + + @property + def norm1(self): + return getattr(self, self.norm1_name) + + def forward(self, x): + x = self.conv(x) + x = self.norm0(x) + x = self.relu(x) + + batch, rchannel = x.shape[:2] + if self.radix > 1: + splits = x.view(batch, self.radix, -1, *x.shape[2:]) + gap = splits.sum(dim=1) + else: + gap = x + gap = F.adaptive_avg_pool2d(gap, 1) + gap = self.fc1(gap) + + gap = self.norm1(gap) + gap = self.relu(gap) + + atten = self.fc2(gap) + atten = self.rsoftmax(atten).view(batch, -1, 1, 1) + + if self.radix > 1: + attens = atten.view(batch, self.radix, -1, *atten.shape[2:]) + out = torch.sum(attens * splits, dim=1) + else: + out = atten * x + return out.contiguous() + + +class Bottleneck(_Bottleneck): + """Bottleneck block for ResNeSt. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + groups (int): Groups of conv2. + width_per_group (int): Width per group of conv2. 64x4d indicates + ``groups=64, width_per_group=4`` and 32x8d indicates + ``groups=32, width_per_group=8``. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of SplitAttentionConv2d. + Default: 4. + avg_down_stride (bool): Whether to use average pool for stride in + Bottleneck. Default: True. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. 
+ """ + + def __init__(self, + in_channels, + out_channels, + groups=1, + width_per_group=4, + base_channels=64, + radix=2, + reduction_factor=4, + avg_down_stride=True, + **kwargs): + super().__init__(in_channels, out_channels, **kwargs) + + self.groups = groups + self.width_per_group = width_per_group + + # For ResNet bottleneck, middle channels are determined by expansion + # and out_channels, but for ResNeXt bottleneck, it is determined by + # groups and width_per_group and the stage it is located in. + if groups != 1: + assert self.mid_channels % base_channels == 0 + self.mid_channels = ( + groups * width_per_group * self.mid_channels // base_channels) + + self.avg_down_stride = avg_down_stride and self.conv2_stride > 1 + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=1) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.out_channels, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.in_channels, + self.mid_channels, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = SplitAttentionConv2d( + self.mid_channels, + self.mid_channels, + kernel_size=3, + stride=1 if self.avg_down_stride else self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + radix=radix, + reduction_factor=reduction_factor, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + delattr(self, self.norm2_name) + + if self.avg_down_stride: + self.avd_layer = nn.AvgPool2d(3, self.conv2_stride, padding=1) + + self.conv3 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.out_channels, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + def forward(self, x): + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + + if self.avg_down_stride: + out = self.avd_layer(out) + + out = self.conv3(out) + out = self.norm3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@BACKBONES.register_module() +class ResNeSt(ResNetV1d): + """ResNeSt backbone. + + Please refer to the `paper `__ + for details. + + Args: + depth (int): Network depth, from {50, 101, 152, 200}. + groups (int): Groups of conv2 in Bottleneck. Default: 32. + width_per_group (int): Width per group of conv2 in Bottleneck. + Default: 4. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of SplitAttentionConv2d. + Default: 4. + avg_down_stride (bool): Whether to use average pool for stride in + Bottleneck. Default: True. + in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Output channels of the stem layer. Default: 64. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. 
If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)), + 200: (Bottleneck, (3, 24, 36, 3)), + 269: (Bottleneck, (3, 30, 48, 8)) + } + + def __init__(self, + depth, + groups=1, + width_per_group=4, + radix=2, + reduction_factor=4, + avg_down_stride=True, + **kwargs): + self.groups = groups + self.width_per_group = width_per_group + self.radix = radix + self.reduction_factor = reduction_factor + self.avg_down_stride = avg_down_stride + super().__init__(depth=depth, **kwargs) + + def make_res_layer(self, **kwargs): + return ResLayer( + groups=self.groups, + width_per_group=self.width_per_group, + base_channels=self.base_channels, + radix=self.radix, + reduction_factor=self.reduction_factor, + avg_down_stride=self.avg_down_stride, + **kwargs) diff --git a/mmpose/models/backbones/resnet.py b/mmpose/models/backbones/resnet.py new file mode 100644 index 0000000..649496a --- /dev/null +++ b/mmpose/models/backbones/resnet.py @@ -0,0 +1,701 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import (ConvModule, build_conv_layer, build_norm_layer, + constant_init, kaiming_init) +from mmcv.utils.parrots_wrapper import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +class BasicBlock(nn.Module): + """BasicBlock for ResNet. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + expansion (int): The ratio of ``out_channels/mid_channels`` where + ``mid_channels`` is the output channels of conv1. This is a + reserved argument in BasicBlock and should always be 1. Default: 1. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None. + style (str): `pytorch` or `caffe`. It is unused and reserved for + unified API with Bottleneck. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. 
+ Default: dict(type='BN') + """ + + def __init__(self, + in_channels, + out_channels, + expansion=1, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN')): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.expansion = expansion + assert self.expansion == 1 + assert out_channels % expansion == 0 + self.mid_channels = out_channels // expansion + self.stride = stride + self.dilation = dilation + self.style = style + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + norm_cfg, out_channels, postfix=2) + + self.conv1 = build_conv_layer( + conv_cfg, + in_channels, + self.mid_channels, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, + self.mid_channels, + out_channels, + 3, + padding=1, + bias=False) + self.add_module(self.norm2_name, norm2) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + """Bottleneck block for ResNet. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + expansion (int): The ratio of ``out_channels/mid_channels`` where + ``mid_channels`` is the input/output channels of conv2. Default: 4. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None. + style (str): ``"pytorch"`` or ``"caffe"``. If set to "pytorch", the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Default: "pytorch". + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. 
+ Default: dict(type='BN') + """ + + def __init__(self, + in_channels, + out_channels, + expansion=4, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN')): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + assert style in ['pytorch', 'caffe'] + + self.in_channels = in_channels + self.out_channels = out_channels + self.expansion = expansion + assert out_channels % expansion == 0 + self.mid_channels = out_channels // expansion + self.stride = stride + self.dilation = dilation + self.style = style + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + if self.style == 'pytorch': + self.conv1_stride = 1 + self.conv2_stride = stride + else: + self.conv1_stride = stride + self.conv2_stride = 1 + + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + norm_cfg, self.mid_channels, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + norm_cfg, out_channels, postfix=3) + + self.conv1 = build_conv_layer( + conv_cfg, + in_channels, + self.mid_channels, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, + self.mid_channels, + self.mid_channels, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + conv_cfg, + self.mid_channels, + out_channels, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + @property + def norm3(self): + """nn.Module: the normalization layer named "norm3" """ + return getattr(self, self.norm3_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.norm3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +def get_expansion(block, expansion=None): + """Get the expansion of a residual block. + + The block expansion will be obtained by the following order: + + 1. If ``expansion`` is given, just return it. + 2. If ``block`` has the attribute ``expansion``, then return + ``block.expansion``. + 3. Return the default value according the the block type: + 1 for ``BasicBlock`` and 4 for ``Bottleneck``. + + Args: + block (class): The block class. + expansion (int | None): The given expansion ratio. + + Returns: + int: The expansion of the block. 
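+
+    Example:
+        A small sketch of the resolution order described above:
+
+        >>> get_expansion(BasicBlock)
+        1
+        >>> get_expansion(Bottleneck)
+        4
+        >>> get_expansion(Bottleneck, expansion=8)
+        8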
+ """ + if isinstance(expansion, int): + assert expansion > 0 + elif expansion is None: + if hasattr(block, 'expansion'): + expansion = block.expansion + elif issubclass(block, BasicBlock): + expansion = 1 + elif issubclass(block, Bottleneck): + expansion = 4 + else: + raise TypeError(f'expansion is not specified for {block.__name__}') + else: + raise TypeError('expansion must be an integer or None') + + return expansion + + +class ResLayer(nn.Sequential): + """ResLayer to build ResNet style backbone. + + Args: + block (nn.Module): Residual block used to build ResLayer. + num_blocks (int): Number of blocks. + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + expansion (int, optional): The expansion for BasicBlock/Bottleneck. + If not specified, it will firstly be obtained via + ``block.expansion``. If the block has no attribute "expansion", + the following default values will be used: 1 for BasicBlock and + 4 for Bottleneck. Default: None. + stride (int): stride of the first block. Default: 1. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + downsample_first (bool): Downsample at the first block or last block. + False for Hourglass, True for ResNet. Default: True + """ + + def __init__(self, + block, + num_blocks, + in_channels, + out_channels, + expansion=None, + stride=1, + avg_down=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + downsample_first=True, + **kwargs): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + self.block = block + self.expansion = get_expansion(block, expansion) + + downsample = None + if stride != 1 or in_channels != out_channels: + downsample = [] + conv_stride = stride + if avg_down and stride != 1: + conv_stride = 1 + downsample.append( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False)) + downsample.extend([ + build_conv_layer( + conv_cfg, + in_channels, + out_channels, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, out_channels)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + if downsample_first: + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + in_channels = out_channels + for _ in range(1, num_blocks): + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + else: # downsample_first=False is for HourglassModule + for i in range(0, num_blocks - 1): + layers.append( + block( + in_channels=in_channels, + out_channels=in_channels, + expansion=self.expansion, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + + super().__init__(*layers) + + +@BACKBONES.register_module() +class ResNet(BaseBackbone): + """ResNet backbone. + + Please refer to the `paper `__ for + details. + + Args: + depth (int): Network depth, from {18, 34, 50, 101, 152}. 
+ in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Output channels of the stem layer. Default: 64. + base_channels (int): Middle channels of the first stage. Default: 64. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + + Example: + >>> from mmpose.models import ResNet + >>> import torch + >>> self = ResNet(depth=18, out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 64, 8, 8) + (1, 128, 4, 4) + (1, 256, 2, 2) + (1, 512, 1, 1) + """ + + arch_settings = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, + depth, + in_channels=3, + stem_channels=64, + base_channels=64, + expansion=None, + num_stages=4, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(3, ), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + with_cp=False, + zero_init_residual=True): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + self.depth = depth + self.stem_channels = stem_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.zero_init_residual = zero_init_residual + self.block, stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + self.expansion = get_expansion(self.block, expansion) + + self._make_stem_layer(in_channels, stem_channels) + + self.res_layers = [] + _in_channels = stem_channels + _out_channels = base_channels * self.expansion + for i, num_blocks in enumerate(self.stage_blocks): + stride = strides[i] + dilation = dilations[i] + res_layer = self.make_res_layer( + block=self.block, + num_blocks=num_blocks, + in_channels=_in_channels, + out_channels=_out_channels, + expansion=self.expansion, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + _in_channels = _out_channels + _out_channels *= 2 + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = res_layer[-1].out_channels + + def make_res_layer(self, **kwargs): + """Make a ResLayer.""" + return ResLayer(**kwargs) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def _make_stem_layer(self, in_channels, stem_channels): + """Make stem layer.""" + if self.deep_stem: + self.stem = nn.Sequential( + ConvModule( + in_channels, + stem_channels // 2, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True), + ConvModule( + stem_channels // 2, + stem_channels // 2, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True), + ConvModule( + stem_channels // 2, + stem_channels, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True)) + else: + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels, + kernel_size=7, + stride=2, + padding=3, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, stem_channels, postfix=1) + 
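+            # build_norm_layer() returns a (name, module) pair; registering
+            # the module under ``norm1_name`` lets the ``norm1`` property
+            # fetch it later via getattr().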
self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + def _freeze_stages(self): + """Freeze parameters.""" + if self.frozen_stages >= 0: + if self.deep_stem: + self.stem.eval() + for param in self.stem.parameters(): + param.requires_grad = False + else: + self.norm1.eval() + for m in [self.conv1, self.norm1]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + super().init_weights(pretrained) + if pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + constant_init(m.norm3, 0) + elif isinstance(m, BasicBlock): + constant_init(m.norm2, 0) + + def forward(self, x): + """Forward function.""" + if self.deep_stem: + x = self.stem(x) + else: + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() + + +@BACKBONES.register_module() +class ResNetV1d(ResNet): + r"""ResNetV1d variant described in `Bag of Tricks + `__. + + Compared with default ResNet(ResNetV1b), ResNetV1d replaces the 7x7 conv in + the input stem with three 3x3 convs. And in the downsampling block, a 2x2 + avg_pool with stride 2 is added before conv, whose stride is changed to 1. + """ + + def __init__(self, **kwargs): + super().__init__(deep_stem=True, avg_down=True, **kwargs) diff --git a/mmpose/models/backbones/resnext.py b/mmpose/models/backbones/resnext.py new file mode 100644 index 0000000..c10dc33 --- /dev/null +++ b/mmpose/models/backbones/resnext.py @@ -0,0 +1,162 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResLayer, ResNet + + +class Bottleneck(_Bottleneck): + """Bottleneck block for ResNeXt. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + groups (int): Groups of conv2. + width_per_group (int): Width per group of conv2. 64x4d indicates + ``groups=64, width_per_group=4`` and 32x8d indicates + ``groups=32, width_per_group=8``. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + conv_cfg (dict): dictionary to construct and config conv layer. 
+ Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + """ + + def __init__(self, + in_channels, + out_channels, + base_channels=64, + groups=32, + width_per_group=4, + **kwargs): + super().__init__(in_channels, out_channels, **kwargs) + self.groups = groups + self.width_per_group = width_per_group + + # For ResNet bottleneck, middle channels are determined by expansion + # and out_channels, but for ResNeXt bottleneck, it is determined by + # groups and width_per_group and the stage it is located in. + if groups != 1: + assert self.mid_channels % base_channels == 0 + self.mid_channels = ( + groups * width_per_group * self.mid_channels // base_channels) + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.out_channels, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.in_channels, + self.mid_channels, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.mid_channels, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.out_channels, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + +@BACKBONES.register_module() +class ResNeXt(ResNet): + """ResNeXt backbone. + + Please refer to the `paper `__ for + details. + + Args: + depth (int): Network depth, from {50, 101, 152}. + groups (int): Groups of conv2 in Bottleneck. Default: 32. + width_per_group (int): Width per group of conv2 in Bottleneck. + Default: 4. + in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Output channels of the stem layer. Default: 64. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. 
Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + + Example: + >>> from mmpose.models import ResNeXt + >>> import torch + >>> self = ResNeXt(depth=50, out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 256, 8, 8) + (1, 512, 4, 4) + (1, 1024, 2, 2) + (1, 2048, 1, 1) + """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, depth, groups=32, width_per_group=4, **kwargs): + self.groups = groups + self.width_per_group = width_per_group + super().__init__(depth, **kwargs) + + def make_res_layer(self, **kwargs): + return ResLayer( + groups=self.groups, + width_per_group=self.width_per_group, + base_channels=self.base_channels, + **kwargs) diff --git a/mmpose/models/backbones/rsn.py b/mmpose/models/backbones/rsn.py new file mode 100644 index 0000000..29038af --- /dev/null +++ b/mmpose/models/backbones/rsn.py @@ -0,0 +1,616 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import (ConvModule, MaxPool2d, constant_init, kaiming_init, + normal_init) + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +class RSB(nn.Module): + """Residual Steps block for RSN. Paper ref: Cai et al. "Learning Delicate + Local Representations for Multi-Person Pose Estimation" (ECCV 2020). + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + num_steps (int): Numbers of steps in RSB + stride (int): stride of the block. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None. + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + expand_times (int): Times by which the in_channels are expanded. + Default:26. + res_top_channels (int): Number of channels of feature output by + ResNet_top. Default:64. 
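+
+    Example:
+        Illustrative sketch; channel counts assume the defaults
+        ``expand_times=26`` and ``res_top_channels=64``:
+
+        >>> import torch
+        >>> block = RSB(64, 64, num_steps=4)
+        >>> inputs = torch.rand(1, 64, 64, 64)
+        >>> tuple(block(inputs).shape)
+        (1, 64, 64, 64)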
+ """ + + expansion = 1 + + def __init__(self, + in_channels, + out_channels, + num_steps=4, + stride=1, + downsample=None, + with_cp=False, + norm_cfg=dict(type='BN'), + expand_times=26, + res_top_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + assert num_steps > 1 + self.in_channels = in_channels + self.branch_channels = self.in_channels * expand_times + self.branch_channels //= res_top_channels + self.out_channels = out_channels + self.stride = stride + self.downsample = downsample + self.with_cp = with_cp + self.norm_cfg = norm_cfg + self.num_steps = num_steps + self.conv_bn_relu1 = ConvModule( + self.in_channels, + self.num_steps * self.branch_channels, + kernel_size=1, + stride=self.stride, + padding=0, + norm_cfg=self.norm_cfg, + inplace=False) + for i in range(self.num_steps): + for j in range(i + 1): + module_name = f'conv_bn_relu2_{i + 1}_{j + 1}' + self.add_module( + module_name, + ConvModule( + self.branch_channels, + self.branch_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + inplace=False)) + self.conv_bn3 = ConvModule( + self.num_steps * self.branch_channels, + self.out_channels * self.expansion, + kernel_size=1, + stride=1, + padding=0, + act_cfg=None, + norm_cfg=self.norm_cfg, + inplace=False) + self.relu = nn.ReLU(inplace=False) + + def forward(self, x): + """Forward function.""" + + identity = x + x = self.conv_bn_relu1(x) + spx = torch.split(x, self.branch_channels, 1) + outputs = list() + outs = list() + for i in range(self.num_steps): + outputs_i = list() + outputs.append(outputs_i) + for j in range(i + 1): + if j == 0: + inputs = spx[i] + else: + inputs = outputs[i][j - 1] + if i > j: + inputs = inputs + outputs[i - 1][j] + module_name = f'conv_bn_relu2_{i + 1}_{j + 1}' + module_i_j = getattr(self, module_name) + outputs[i].append(module_i_j(inputs)) + + outs.append(outputs[i][i]) + out = torch.cat(tuple(outs), 1) + out = self.conv_bn3(out) + + if self.downsample is not None: + identity = self.downsample(identity) + out = out + identity + + out = self.relu(out) + + return out + + +class Downsample_module(nn.Module): + """Downsample module for RSN. + + Args: + block (nn.Module): Downsample block. + num_blocks (list): Number of blocks in each downsample unit. + num_units (int): Numbers of downsample units. Default: 4 + has_skip (bool): Have skip connections from prior upsample + module or not. Default:False + num_steps (int): Number of steps in a block. Default:4 + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + in_channels (int): Number of channels of the input feature to + downsample module. Default: 64 + expand_times (int): Times by which the in_channels are expanded. + Default:26. 
+ """ + + def __init__(self, + block, + num_blocks, + num_steps=4, + num_units=4, + has_skip=False, + norm_cfg=dict(type='BN'), + in_channels=64, + expand_times=26): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.has_skip = has_skip + self.in_channels = in_channels + assert len(num_blocks) == num_units + self.num_blocks = num_blocks + self.num_units = num_units + self.num_steps = num_steps + self.norm_cfg = norm_cfg + self.layer1 = self._make_layer( + block, + in_channels, + num_blocks[0], + expand_times=expand_times, + res_top_channels=in_channels) + for i in range(1, num_units): + module_name = f'layer{i + 1}' + self.add_module( + module_name, + self._make_layer( + block, + in_channels * pow(2, i), + num_blocks[i], + stride=2, + expand_times=expand_times, + res_top_channels=in_channels)) + + def _make_layer(self, + block, + out_channels, + blocks, + stride=1, + expand_times=26, + res_top_channels=64): + downsample = None + if stride != 1 or self.in_channels != out_channels * block.expansion: + downsample = ConvModule( + self.in_channels, + out_channels * block.expansion, + kernel_size=1, + stride=stride, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + + units = list() + units.append( + block( + self.in_channels, + out_channels, + num_steps=self.num_steps, + stride=stride, + downsample=downsample, + norm_cfg=self.norm_cfg, + expand_times=expand_times, + res_top_channels=res_top_channels)) + self.in_channels = out_channels * block.expansion + for _ in range(1, blocks): + units.append( + block( + self.in_channels, + out_channels, + num_steps=self.num_steps, + expand_times=expand_times, + res_top_channels=res_top_channels)) + + return nn.Sequential(*units) + + def forward(self, x, skip1, skip2): + out = list() + for i in range(self.num_units): + module_name = f'layer{i + 1}' + module_i = getattr(self, module_name) + x = module_i(x) + if self.has_skip: + x = x + skip1[i] + skip2[i] + out.append(x) + out.reverse() + + return tuple(out) + + +class Upsample_unit(nn.Module): + """Upsample unit for upsample module. + + Args: + ind (int): Indicates whether to interpolate (>0) and whether to + generate feature map for the next hourglass-like module. + num_units (int): Number of units that form a upsample module. Along + with ind and gen_cross_conv, nm_units is used to decide whether + to generate feature map for the next hourglass-like module. + in_channels (int): Channel number of the skip-in feature maps from + the corresponding downsample unit. + unit_channels (int): Channel number in this unit. Default:256. + gen_skip: (bool): Whether or not to generate skips for the posterior + downsample module. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + out_channels (in): Number of channels of feature output by upsample + module. Must equal to in_channels of downsample module. 
Default:64 + """ + + def __init__(self, + ind, + num_units, + in_channels, + unit_channels=256, + gen_skip=False, + gen_cross_conv=False, + norm_cfg=dict(type='BN'), + out_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.num_units = num_units + self.norm_cfg = norm_cfg + self.in_skip = ConvModule( + in_channels, + unit_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + self.relu = nn.ReLU(inplace=True) + + self.ind = ind + if self.ind > 0: + self.up_conv = ConvModule( + unit_channels, + unit_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + + self.gen_skip = gen_skip + if self.gen_skip: + self.out_skip1 = ConvModule( + in_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + self.out_skip2 = ConvModule( + unit_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + self.gen_cross_conv = gen_cross_conv + if self.ind == num_units - 1 and self.gen_cross_conv: + self.cross_conv = ConvModule( + unit_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + def forward(self, x, up_x): + out = self.in_skip(x) + + if self.ind > 0: + up_x = F.interpolate( + up_x, + size=(x.size(2), x.size(3)), + mode='bilinear', + align_corners=True) + up_x = self.up_conv(up_x) + out = out + up_x + out = self.relu(out) + + skip1 = None + skip2 = None + if self.gen_skip: + skip1 = self.out_skip1(x) + skip2 = self.out_skip2(out) + + cross_conv = None + if self.ind == self.num_units - 1 and self.gen_cross_conv: + cross_conv = self.cross_conv(out) + + return out, skip1, skip2, cross_conv + + +class Upsample_module(nn.Module): + """Upsample module for RSN. + + Args: + unit_channels (int): Channel number in the upsample units. + Default:256. + num_units (int): Numbers of upsample units. Default: 4 + gen_skip (bool): Whether to generate skip for posterior downsample + module or not. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + out_channels (int): Number of channels of feature output by upsample + module. Must equal to in_channels of downsample module. 
Default:64 + """ + + def __init__(self, + unit_channels=256, + num_units=4, + gen_skip=False, + gen_cross_conv=False, + norm_cfg=dict(type='BN'), + out_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.in_channels = list() + for i in range(num_units): + self.in_channels.append(RSB.expansion * out_channels * pow(2, i)) + self.in_channels.reverse() + self.num_units = num_units + self.gen_skip = gen_skip + self.gen_cross_conv = gen_cross_conv + self.norm_cfg = norm_cfg + for i in range(num_units): + module_name = f'up{i + 1}' + self.add_module( + module_name, + Upsample_unit( + i, + self.num_units, + self.in_channels[i], + unit_channels, + self.gen_skip, + self.gen_cross_conv, + norm_cfg=self.norm_cfg, + out_channels=64)) + + def forward(self, x): + out = list() + skip1 = list() + skip2 = list() + cross_conv = None + for i in range(self.num_units): + module_i = getattr(self, f'up{i + 1}') + if i == 0: + outi, skip1_i, skip2_i, _ = module_i(x[i], None) + elif i == self.num_units - 1: + outi, skip1_i, skip2_i, cross_conv = module_i(x[i], out[i - 1]) + else: + outi, skip1_i, skip2_i, _ = module_i(x[i], out[i - 1]) + out.append(outi) + skip1.append(skip1_i) + skip2.append(skip2_i) + skip1.reverse() + skip2.reverse() + + return out, skip1, skip2, cross_conv + + +class Single_stage_RSN(nn.Module): + """Single_stage Residual Steps Network. + + Args: + unit_channels (int): Channel number in the upsample units. Default:256. + num_units (int): Numbers of downsample/upsample units. Default: 4 + gen_skip (bool): Whether to generate skip for posterior downsample + module or not. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + has_skip (bool): Have skip connections from prior upsample + module or not. Default:False + num_steps (int): Number of steps in RSB. Default: 4 + num_blocks (list): Number of blocks in each downsample unit. + Default: [2, 2, 2, 2] Note: Make sure num_units==len(num_blocks) + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + in_channels (int): Number of channels of the feature from ResNet_Top. + Default: 64. + expand_times (int): Times by which the in_channels are expanded in RSB. + Default:26. + """ + + def __init__(self, + has_skip=False, + gen_skip=False, + gen_cross_conv=False, + unit_channels=256, + num_units=4, + num_steps=4, + num_blocks=[2, 2, 2, 2], + norm_cfg=dict(type='BN'), + in_channels=64, + expand_times=26): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + num_blocks = cp.deepcopy(num_blocks) + super().__init__() + assert len(num_blocks) == num_units + self.has_skip = has_skip + self.gen_skip = gen_skip + self.gen_cross_conv = gen_cross_conv + self.num_units = num_units + self.num_steps = num_steps + self.unit_channels = unit_channels + self.num_blocks = num_blocks + self.norm_cfg = norm_cfg + + self.downsample = Downsample_module(RSB, num_blocks, num_steps, + num_units, has_skip, norm_cfg, + in_channels, expand_times) + self.upsample = Upsample_module(unit_channels, num_units, gen_skip, + gen_cross_conv, norm_cfg, in_channels) + + def forward(self, x, skip1, skip2): + mid = self.downsample(x, skip1, skip2) + out, skip1, skip2, cross_conv = self.upsample(mid) + + return out, skip1, skip2, cross_conv + + +class ResNet_top(nn.Module): + """ResNet top for RSN. + + Args: + norm_cfg (dict): dictionary to construct and config norm layer. 
+ Default: dict(type='BN') + channels (int): Number of channels of the feature output by ResNet_top. + """ + + def __init__(self, norm_cfg=dict(type='BN'), channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.top = nn.Sequential( + ConvModule( + 3, + channels, + kernel_size=7, + stride=2, + padding=3, + norm_cfg=norm_cfg, + inplace=True), MaxPool2d(kernel_size=3, stride=2, padding=1)) + + def forward(self, img): + return self.top(img) + + +@BACKBONES.register_module() +class RSN(BaseBackbone): + """Residual Steps Network backbone. Paper ref: Cai et al. "Learning + Delicate Local Representations for Multi-Person Pose Estimation" (ECCV + 2020). + + Args: + unit_channels (int): Number of Channels in an upsample unit. + Default: 256 + num_stages (int): Number of stages in a multi-stage RSN. Default: 4 + num_units (int): NUmber of downsample/upsample units in a single-stage + RSN. Default: 4 Note: Make sure num_units == len(self.num_blocks) + num_blocks (list): Number of RSBs (Residual Steps Block) in each + downsample unit. Default: [2, 2, 2, 2] + num_steps (int): Number of steps in a RSB. Default:4 + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + res_top_channels (int): Number of channels of feature from ResNet_top. + Default: 64. + expand_times (int): Times by which the in_channels are expanded in RSB. + Default:26. + Example: + >>> from mmpose.models import RSN + >>> import torch + >>> self = RSN(num_stages=2,num_units=2,num_blocks=[2,2]) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 511, 511) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... for feature in level_output: + ... print(tuple(feature.shape)) + ... 
+ (1, 256, 64, 64) + (1, 256, 128, 128) + (1, 256, 64, 64) + (1, 256, 128, 128) + """ + + def __init__(self, + unit_channels=256, + num_stages=4, + num_units=4, + num_blocks=[2, 2, 2, 2], + num_steps=4, + norm_cfg=dict(type='BN'), + res_top_channels=64, + expand_times=26): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + num_blocks = cp.deepcopy(num_blocks) + super().__init__() + self.unit_channels = unit_channels + self.num_stages = num_stages + self.num_units = num_units + self.num_blocks = num_blocks + self.num_steps = num_steps + self.norm_cfg = norm_cfg + + assert self.num_stages > 0 + assert self.num_steps > 1 + assert self.num_units > 1 + assert self.num_units == len(self.num_blocks) + self.top = ResNet_top(norm_cfg=norm_cfg) + self.multi_stage_rsn = nn.ModuleList([]) + for i in range(self.num_stages): + if i == 0: + has_skip = False + else: + has_skip = True + if i != self.num_stages - 1: + gen_skip = True + gen_cross_conv = True + else: + gen_skip = False + gen_cross_conv = False + self.multi_stage_rsn.append( + Single_stage_RSN(has_skip, gen_skip, gen_cross_conv, + unit_channels, num_units, num_steps, + num_blocks, norm_cfg, res_top_channels, + expand_times)) + + def forward(self, x): + """Model forward function.""" + out_feats = [] + skip1 = None + skip2 = None + x = self.top(x) + for i in range(self.num_stages): + out, skip1, skip2, x = self.multi_stage_rsn[i](x, skip1, skip2) + out_feats.append(out) + + return out_feats + + def init_weights(self, pretrained=None): + """Initialize model weights.""" + for m in self.multi_stage_rsn.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + elif isinstance(m, nn.Linear): + normal_init(m, std=0.01) + + for m in self.top.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) diff --git a/mmpose/models/backbones/scnet.py b/mmpose/models/backbones/scnet.py new file mode 100644 index 0000000..3786c57 --- /dev/null +++ b/mmpose/models/backbones/scnet.py @@ -0,0 +1,248 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from .resnet import Bottleneck, ResNet + + +class SCConv(nn.Module): + """SCConv (Self-calibrated Convolution) + + Args: + in_channels (int): The input channels of the SCConv. + out_channels (int): The output channel of the SCConv. + stride (int): stride of SCConv. + pooling_r (int): size of pooling for scconv. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. 
+ Default: dict(type='BN') + """ + + def __init__(self, + in_channels, + out_channels, + stride, + pooling_r, + conv_cfg=None, + norm_cfg=dict(type='BN', momentum=0.1)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + assert in_channels == out_channels + + self.k2 = nn.Sequential( + nn.AvgPool2d(kernel_size=pooling_r, stride=pooling_r), + build_conv_layer( + conv_cfg, + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(norm_cfg, in_channels)[1], + ) + self.k3 = nn.Sequential( + build_conv_layer( + conv_cfg, + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(norm_cfg, in_channels)[1], + ) + self.k4 = nn.Sequential( + build_conv_layer( + conv_cfg, + in_channels, + in_channels, + kernel_size=3, + stride=stride, + padding=1, + bias=False), + build_norm_layer(norm_cfg, out_channels)[1], + nn.ReLU(inplace=True), + ) + + def forward(self, x): + """Forward function.""" + identity = x + + out = torch.sigmoid( + torch.add(identity, F.interpolate(self.k2(x), + identity.size()[2:]))) + out = torch.mul(self.k3(x), out) + out = self.k4(out) + + return out + + +class SCBottleneck(Bottleneck): + """SC(Self-calibrated) Bottleneck. + + Args: + in_channels (int): The input channels of the SCBottleneck block. + out_channels (int): The output channel of the SCBottleneck block. + """ + + pooling_r = 4 + + def __init__(self, in_channels, out_channels, **kwargs): + super().__init__(in_channels, out_channels, **kwargs) + self.mid_channels = out_channels // self.expansion // 2 + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, out_channels, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + self.mid_channels, + kernel_size=1, + stride=1, + bias=False) + self.add_module(self.norm1_name, norm1) + + self.k1 = nn.Sequential( + build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.mid_channels, + kernel_size=3, + stride=self.stride, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, self.mid_channels)[1], + nn.ReLU(inplace=True)) + + self.conv2 = build_conv_layer( + self.conv_cfg, + in_channels, + self.mid_channels, + kernel_size=1, + stride=1, + bias=False) + self.add_module(self.norm2_name, norm2) + + self.scconv = SCConv(self.mid_channels, self.mid_channels, self.stride, + self.pooling_r, self.conv_cfg, self.norm_cfg) + + self.conv3 = build_conv_layer( + self.conv_cfg, + self.mid_channels * 2, + out_channels, + kernel_size=1, + stride=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out_a = self.conv1(x) + out_a = self.norm1(out_a) + out_a = self.relu(out_a) + + out_a = self.k1(out_a) + + out_b = self.conv2(x) + out_b = self.norm2(out_b) + out_b = self.relu(out_b) + + out_b = self.scconv(out_b) + + out = self.conv3(torch.cat([out_a, out_b], dim=1)) + out = self.norm3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@BACKBONES.register_module() +class SCNet(ResNet): + """SCNet backbone. 
+ + Improving Convolutional Networks with Self-Calibrated Convolutions, + Jiang-Jiang Liu, Qibin Hou, Ming-Ming Cheng, Changhu Wang, Jiashi Feng, + IEEE CVPR, 2020. + http://mftp.mmcheng.net/Papers/20cvprSCNet.pdf + + Args: + depth (int): Depth of scnet, from {50, 101}. + in_channels (int): Number of input image channels. Normally 3. + base_channels (int): Number of base channels of hidden layer. + num_stages (int): SCNet stages, normally 4. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + norm_cfg (dict): Dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. + + Example: + >>> from mmpose.models import SCNet + >>> import torch + >>> self = SCNet(depth=50, out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 224, 224) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 256, 56, 56) + (1, 512, 28, 28) + (1, 1024, 14, 14) + (1, 2048, 7, 7) + """ + + arch_settings = { + 50: (SCBottleneck, [3, 4, 6, 3]), + 101: (SCBottleneck, [3, 4, 23, 3]) + } + + def __init__(self, depth, **kwargs): + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for SCNet') + super().__init__(depth, **kwargs) diff --git a/mmpose/models/backbones/seresnet.py b/mmpose/models/backbones/seresnet.py new file mode 100644 index 0000000..ac2d53b --- /dev/null +++ b/mmpose/models/backbones/seresnet.py @@ -0,0 +1,125 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.utils.checkpoint as cp + +from ..builder import BACKBONES +from .resnet import Bottleneck, ResLayer, ResNet +from .utils.se_layer import SELayer + + +class SEBottleneck(Bottleneck): + """SEBottleneck block for SEResNet. + + Args: + in_channels (int): The input channels of the SEBottleneck block. + out_channels (int): The output channel of the SEBottleneck block. + se_ratio (int): Squeeze ratio in SELayer. 
Default: 16 + """ + + def __init__(self, in_channels, out_channels, se_ratio=16, **kwargs): + super().__init__(in_channels, out_channels, **kwargs) + self.se_layer = SELayer(out_channels, ratio=se_ratio) + + def forward(self, x): + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.norm3(out) + + out = self.se_layer(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@BACKBONES.register_module() +class SEResNet(ResNet): + """SEResNet backbone. + + Please refer to the `paper `__ for + details. + + Args: + depth (int): Network depth, from {50, 101, 152}. + se_ratio (int): Squeeze ratio in SELayer. Default: 16. + in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Output channels of the stem layer. Default: 64. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + + Example: + >>> from mmpose.models import SEResNet + >>> import torch + >>> self = SEResNet(depth=50, out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 224, 224) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 256, 56, 56) + (1, 512, 28, 28) + (1, 1024, 14, 14) + (1, 2048, 7, 7) + """ + + arch_settings = { + 50: (SEBottleneck, (3, 4, 6, 3)), + 101: (SEBottleneck, (3, 4, 23, 3)), + 152: (SEBottleneck, (3, 8, 36, 3)) + } + + def __init__(self, depth, se_ratio=16, **kwargs): + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for SEResNet') + self.se_ratio = se_ratio + super().__init__(depth, **kwargs) + + def make_res_layer(self, **kwargs): + return ResLayer(se_ratio=self.se_ratio, **kwargs) diff --git a/mmpose/models/backbones/seresnext.py b/mmpose/models/backbones/seresnext.py new file mode 100644 index 0000000..c5c4e4c --- /dev/null +++ b/mmpose/models/backbones/seresnext.py @@ -0,0 +1,168 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from .resnet import ResLayer +from .seresnet import SEBottleneck as _SEBottleneck +from .seresnet import SEResNet + + +class SEBottleneck(_SEBottleneck): + """SEBottleneck block for SEResNeXt. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + base_channels (int): Middle channels of the first stage. Default: 64. + groups (int): Groups of conv2. + width_per_group (int): Width per group of conv2. 64x4d indicates + ``groups=64, width_per_group=4`` and 32x8d indicates + ``groups=32, width_per_group=8``. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None + se_ratio (int): Squeeze ratio in SELayer. Default: 16 + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + """ + + def __init__(self, + in_channels, + out_channels, + base_channels=64, + groups=32, + width_per_group=4, + se_ratio=16, + **kwargs): + super().__init__(in_channels, out_channels, se_ratio, **kwargs) + self.groups = groups + self.width_per_group = width_per_group + + # We follow the same rational of ResNext to compute mid_channels. + # For SEResNet bottleneck, middle channels are determined by expansion + # and out_channels, but for SEResNeXt bottleneck, it is determined by + # groups and width_per_group and the stage it is located in. 
+ if groups != 1: + assert self.mid_channels % base_channels == 0 + self.mid_channels = ( + groups * width_per_group * self.mid_channels // base_channels) + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.out_channels, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.in_channels, + self.mid_channels, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.mid_channels, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.out_channels, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + +@BACKBONES.register_module() +class SEResNeXt(SEResNet): + """SEResNeXt backbone. + + Please refer to the `paper `__ for + details. + + Args: + depth (int): Network depth, from {50, 101, 152}. + groups (int): Groups of conv2 in Bottleneck. Default: 32. + width_per_group (int): Width per group of conv2 in Bottleneck. + Default: 4. + se_ratio (int): Squeeze ratio in SELayer. Default: 16. + in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Output channels of the stem layer. Default: 64. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + + Example: + >>> from mmpose.models import SEResNeXt + >>> import torch + >>> self = SEResNet(depth=50, out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 224, 224) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 256, 56, 56) + (1, 512, 28, 28) + (1, 1024, 14, 14) + (1, 2048, 7, 7) + """ + + arch_settings = { + 50: (SEBottleneck, (3, 4, 6, 3)), + 101: (SEBottleneck, (3, 4, 23, 3)), + 152: (SEBottleneck, (3, 8, 36, 3)) + } + + def __init__(self, depth, groups=32, width_per_group=4, **kwargs): + self.groups = groups + self.width_per_group = width_per_group + super().__init__(depth, **kwargs) + + def make_res_layer(self, **kwargs): + return ResLayer( + groups=self.groups, + width_per_group=self.width_per_group, + base_channels=self.base_channels, + **kwargs) diff --git a/mmpose/models/backbones/shufflenet_v1.py b/mmpose/models/backbones/shufflenet_v1.py new file mode 100644 index 0000000..9f98cbd --- /dev/null +++ b/mmpose/models/backbones/shufflenet_v1.py @@ -0,0 +1,329 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import (ConvModule, build_activation_layer, constant_init, + normal_init) +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import channel_shuffle, load_checkpoint, make_divisible + + +class ShuffleUnit(nn.Module): + """ShuffleUnit block. + + ShuffleNet unit with pointwise group convolution (GConv) and channel + shuffle. + + Args: + in_channels (int): The input channels of the ShuffleUnit. + out_channels (int): The output channels of the ShuffleUnit. + groups (int, optional): The number of groups to be used in grouped 1x1 + convolutions in each ShuffleUnit. Default: 3 + first_block (bool, optional): Whether it is the first ShuffleUnit of a + sequential ShuffleUnits. Default: True, which means not using the + grouped 1x1 convolution. + combine (str, optional): The ways to combine the input and output + branches. Default: 'add'. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + with_cp (bool, optional): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + + Returns: + Tensor: The output tensor. + """ + + def __init__(self, + in_channels, + out_channels, + groups=3, + first_block=True, + combine='add', + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.first_block = first_block + self.combine = combine + self.groups = groups + self.bottleneck_channels = self.out_channels // 4 + self.with_cp = with_cp + + if self.combine == 'add': + self.depthwise_stride = 1 + self._combine_func = self._add + assert in_channels == out_channels, ( + 'in_channels must be equal to out_channels when combine ' + 'is add') + elif self.combine == 'concat': + self.depthwise_stride = 2 + self._combine_func = self._concat + self.out_channels -= self.in_channels + self.avgpool = nn.AvgPool2d(kernel_size=3, stride=2, padding=1) + else: + raise ValueError(f'Cannot combine tensors with {self.combine}. 
' + 'Only "add" and "concat" are supported') + + self.first_1x1_groups = 1 if first_block else self.groups + self.g_conv_1x1_compress = ConvModule( + in_channels=self.in_channels, + out_channels=self.bottleneck_channels, + kernel_size=1, + groups=self.first_1x1_groups, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.depthwise_conv3x3_bn = ConvModule( + in_channels=self.bottleneck_channels, + out_channels=self.bottleneck_channels, + kernel_size=3, + stride=self.depthwise_stride, + padding=1, + groups=self.bottleneck_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.g_conv_1x1_expand = ConvModule( + in_channels=self.bottleneck_channels, + out_channels=self.out_channels, + kernel_size=1, + groups=self.groups, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.act = build_activation_layer(act_cfg) + + @staticmethod + def _add(x, out): + # residual connection + return x + out + + @staticmethod + def _concat(x, out): + # concatenate along channel axis + return torch.cat((x, out), 1) + + def forward(self, x): + + def _inner_forward(x): + residual = x + + out = self.g_conv_1x1_compress(x) + out = self.depthwise_conv3x3_bn(out) + + if self.groups > 1: + out = channel_shuffle(out, self.groups) + + out = self.g_conv_1x1_expand(out) + + if self.combine == 'concat': + residual = self.avgpool(residual) + out = self.act(out) + out = self._combine_func(residual, out) + else: + out = self._combine_func(residual, out) + out = self.act(out) + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +@BACKBONES.register_module() +class ShuffleNetV1(BaseBackbone): + """ShuffleNetV1 backbone. + + Args: + groups (int, optional): The number of groups to be used in grouped 1x1 + convolutions in each ShuffleUnit. Default: 3. + widen_factor (float, optional): Width multiplier - adjusts the number + of channels in each layer by this amount. Default: 1.0. + out_indices (Sequence[int]): Output from which stages. + Default: (2, ) + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ + + def __init__(self, + groups=3, + widen_factor=1.0, + out_indices=(2, ), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + norm_eval=False, + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.stage_blocks = [4, 8, 4] + self.groups = groups + + for index in out_indices: + if index not in range(0, 3): + raise ValueError('the item in out_indices must in ' + f'range(0, 3). But received {index}') + + if frozen_stages not in range(-1, 3): + raise ValueError('frozen_stages must be in range(-1, 3). 
' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + if groups == 1: + channels = (144, 288, 576) + elif groups == 2: + channels = (200, 400, 800) + elif groups == 3: + channels = (240, 480, 960) + elif groups == 4: + channels = (272, 544, 1088) + elif groups == 8: + channels = (384, 768, 1536) + else: + raise ValueError(f'{groups} groups is not supported for 1x1 ' + 'Grouped Convolutions') + + channels = [make_divisible(ch * widen_factor, 8) for ch in channels] + + self.in_channels = int(24 * widen_factor) + + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.layers = nn.ModuleList() + for i, num_blocks in enumerate(self.stage_blocks): + first_block = (i == 0) + layer = self.make_layer(channels[i], num_blocks, first_block) + self.layers.append(layer) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(self.frozen_stages): + layer = self.layers[i] + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for name, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + if 'conv1' in name: + normal_init(m, mean=0, std=0.01) + else: + normal_init(m, mean=0, std=1.0 / m.weight.shape[1]) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, val=1, bias=0.0001) + if isinstance(m, _BatchNorm): + if m.running_mean is not None: + nn.init.constant_(m.running_mean, 0) + else: + raise TypeError('pretrained must be a str or None. But received ' + f'{type(pretrained)}') + + def make_layer(self, out_channels, num_blocks, first_block=False): + """Stack ShuffleUnit blocks to make a layer. + + Args: + out_channels (int): out_channels of the block. + num_blocks (int): Number of blocks. + first_block (bool, optional): Whether is the first ShuffleUnit of a + sequential ShuffleUnits. Default: False, which means using + the grouped 1x1 convolution. 
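+
+        Returns:
+            nn.Sequential: The stacked ShuffleUnit blocks forming one stage.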
+ """ + layers = [] + for i in range(num_blocks): + first_block = first_block if i == 0 else False + combine_mode = 'concat' if i == 0 else 'add' + layers.append( + ShuffleUnit( + self.in_channels, + out_channels, + groups=self.groups, + first_block=first_block, + combine=combine_mode, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.maxpool(x) + + outs = [] + for i, layer in enumerate(self.layers): + x = layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/backbones/shufflenet_v2.py b/mmpose/models/backbones/shufflenet_v2.py new file mode 100644 index 0000000..e935333 --- /dev/null +++ b/mmpose/models/backbones/shufflenet_v2.py @@ -0,0 +1,302 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule, constant_init, normal_init +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import channel_shuffle, load_checkpoint + + +class InvertedResidual(nn.Module): + """InvertedResidual block for ShuffleNetV2 backbone. + + Args: + in_channels (int): The input channels of the block. + out_channels (int): The output channels of the block. + stride (int): Stride of the 3x3 convolution layer. Default: 1 + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
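+
+    Example (usage sketch; the channel widths below follow the
+        widen_factor=1.0 stage channels of ShuffleNetV2):
+        >>> import torch
+        >>> from mmpose.models.backbones.shufflenet_v2 import InvertedResidual
+        >>> block = InvertedResidual(116, 116, stride=1)
+        >>> x = torch.rand(1, 116, 28, 28)
+        >>> print(tuple(block(x).shape))
+        (1, 116, 28, 28)
+        >>> down = InvertedResidual(116, 232, stride=2)
+        >>> print(tuple(down(x).shape))
+        (1, 232, 14, 14)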
+ """ + + def __init__(self, + in_channels, + out_channels, + stride=1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.stride = stride + self.with_cp = with_cp + + branch_features = out_channels // 2 + if self.stride == 1: + assert in_channels == branch_features * 2, ( + f'in_channels ({in_channels}) should equal to ' + f'branch_features * 2 ({branch_features * 2}) ' + 'when stride is 1') + + if in_channels != branch_features * 2: + assert self.stride != 1, ( + f'stride ({self.stride}) should not equal 1 when ' + f'in_channels != branch_features * 2') + + if self.stride > 1: + self.branch1 = nn.Sequential( + ConvModule( + in_channels, + in_channels, + kernel_size=3, + stride=self.stride, + padding=1, + groups=in_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + in_channels, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ) + + self.branch2 = nn.Sequential( + ConvModule( + in_channels if (self.stride > 1) else branch_features, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + branch_features, + branch_features, + kernel_size=3, + stride=self.stride, + padding=1, + groups=branch_features, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + branch_features, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x): + + def _inner_forward(x): + if self.stride > 1: + out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) + else: + x1, x2 = x.chunk(2, dim=1) + out = torch.cat((x1, self.branch2(x2)), dim=1) + + out = channel_shuffle(out, 2) + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +@BACKBONES.register_module() +class ShuffleNetV2(BaseBackbone): + """ShuffleNetV2 backbone. + + Args: + widen_factor (float): Width multiplier - adjusts the number of + channels in each layer by this amount. Default: 1.0. + out_indices (Sequence[int]): Output from which stages. + Default: (0, 1, 2, 3). + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
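+
+    Example (usage sketch; output shapes assume a 224x224 input and follow
+        from the stage channels and strides described above):
+        >>> from mmpose.models import ShuffleNetV2
+        >>> import torch
+        >>> model = ShuffleNetV2(widen_factor=1.0, out_indices=(0, 1, 2, 3))
+        >>> model.eval()
+        >>> inputs = torch.rand(1, 3, 224, 224)
+        >>> outputs = model.forward(inputs)
+        >>> for out in outputs:
+        ...     print(tuple(out.shape))
+        (1, 116, 28, 28)
+        (1, 232, 14, 14)
+        (1, 464, 7, 7)
+        (1, 1024, 7, 7)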
+ """ + + def __init__(self, + widen_factor=1.0, + out_indices=(3, ), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + norm_eval=False, + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.stage_blocks = [4, 8, 4] + for index in out_indices: + if index not in range(0, 4): + raise ValueError('the item in out_indices must in ' + f'range(0, 4). But received {index}') + + if frozen_stages not in range(-1, 4): + raise ValueError('frozen_stages must be in range(-1, 4). ' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + if widen_factor == 0.5: + channels = [48, 96, 192, 1024] + elif widen_factor == 1.0: + channels = [116, 232, 464, 1024] + elif widen_factor == 1.5: + channels = [176, 352, 704, 1024] + elif widen_factor == 2.0: + channels = [244, 488, 976, 2048] + else: + raise ValueError('widen_factor must be in [0.5, 1.0, 1.5, 2.0]. ' + f'But received {widen_factor}') + + self.in_channels = 24 + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.layers = nn.ModuleList() + for i, num_blocks in enumerate(self.stage_blocks): + layer = self._make_layer(channels[i], num_blocks) + self.layers.append(layer) + + output_channels = channels[-1] + self.layers.append( + ConvModule( + in_channels=self.in_channels, + out_channels=output_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def _make_layer(self, out_channels, num_blocks): + """Stack blocks to make a layer. + + Args: + out_channels (int): out_channels of the block. + num_blocks (int): number of blocks. + """ + layers = [] + for i in range(num_blocks): + stride = 2 if i == 0 else 1 + layers.append( + InvertedResidual( + in_channels=self.in_channels, + out_channels=out_channels, + stride=stride, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + + for i in range(self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for name, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + if 'conv1' in name: + normal_init(m, mean=0, std=0.01) + else: + normal_init(m, mean=0, std=1.0 / m.weight.shape[1]) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m.weight, val=1, bias=0.0001) + if isinstance(m, _BatchNorm): + if m.running_mean is not None: + nn.init.constant_(m.running_mean, 0) + else: + raise TypeError('pretrained must be a str or None. 
But received ' + f'{type(pretrained)}') + + def forward(self, x): + x = self.conv1(x) + x = self.maxpool(x) + + outs = [] + for i, layer in enumerate(self.layers): + x = layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() diff --git a/mmpose/models/backbones/tcn.py b/mmpose/models/backbones/tcn.py new file mode 100644 index 0000000..deca229 --- /dev/null +++ b/mmpose/models/backbones/tcn.py @@ -0,0 +1,267 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +from mmcv.cnn import ConvModule, build_conv_layer, constant_init, kaiming_init +from mmcv.utils.parrots_wrapper import _BatchNorm + +from mmpose.core import WeightNormClipHook +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +class BasicTemporalBlock(nn.Module): + """Basic block for VideoPose3D. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + mid_channels (int): The output channels of conv1. Default: 1024. + kernel_size (int): Size of the convolving kernel. Default: 3. + dilation (int): Spacing between kernel elements. Default: 3. + dropout (float): Dropout rate. Default: 0.25. + causal (bool): Use causal convolutions instead of symmetric + convolutions (for real-time applications). Default: False. + residual (bool): Use residual connection. Default: True. + use_stride_conv (bool): Use optimized TCN that designed + specifically for single-frame batching, i.e. where batches have + input length = receptive field, and output length = 1. This + implementation replaces dilated convolutions with strided + convolutions to avoid generating unused intermediate results. + Default: False. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: dict(type='Conv1d'). + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN1d'). 
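+
+    Example (usage sketch; the output length follows from the default
+        kernel_size=3 and dilation=3, which trim 3 frames on each side):
+        >>> import torch
+        >>> from mmpose.models.backbones.tcn import BasicTemporalBlock
+        >>> block = BasicTemporalBlock(in_channels=1024, out_channels=1024)
+        >>> block.eval()
+        >>> inputs = torch.rand(1, 1024, 243)
+        >>> print(tuple(block(inputs).shape))
+        (1, 1024, 237)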
+ """ + + def __init__(self, + in_channels, + out_channels, + mid_channels=1024, + kernel_size=3, + dilation=3, + dropout=0.25, + causal=False, + residual=True, + use_stride_conv=False, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d')): + # Protect mutable default arguments + conv_cfg = copy.deepcopy(conv_cfg) + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.mid_channels = mid_channels + self.kernel_size = kernel_size + self.dilation = dilation + self.dropout = dropout + self.causal = causal + self.residual = residual + self.use_stride_conv = use_stride_conv + + self.pad = (kernel_size - 1) * dilation // 2 + if use_stride_conv: + self.stride = kernel_size + self.causal_shift = kernel_size // 2 if causal else 0 + self.dilation = 1 + else: + self.stride = 1 + self.causal_shift = kernel_size // 2 * dilation if causal else 0 + + self.conv1 = nn.Sequential( + ConvModule( + in_channels, + mid_channels, + kernel_size=kernel_size, + stride=self.stride, + dilation=self.dilation, + bias='auto', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg)) + self.conv2 = nn.Sequential( + ConvModule( + mid_channels, + out_channels, + kernel_size=1, + bias='auto', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg)) + + if residual and in_channels != out_channels: + self.short_cut = build_conv_layer(conv_cfg, in_channels, + out_channels, 1) + else: + self.short_cut = None + + self.dropout = nn.Dropout(dropout) if dropout > 0 else None + + def forward(self, x): + """Forward function.""" + if self.use_stride_conv: + assert self.causal_shift + self.kernel_size // 2 < x.shape[2] + else: + assert 0 <= self.pad + self.causal_shift < x.shape[2] - \ + self.pad + self.causal_shift <= x.shape[2] + + out = self.conv1(x) + if self.dropout is not None: + out = self.dropout(out) + + out = self.conv2(out) + if self.dropout is not None: + out = self.dropout(out) + + if self.residual: + if self.use_stride_conv: + res = x[:, :, self.causal_shift + + self.kernel_size // 2::self.kernel_size] + else: + res = x[:, :, + (self.pad + self.causal_shift):(x.shape[2] - self.pad + + self.causal_shift)] + + if self.short_cut is not None: + res = self.short_cut(res) + out = out + res + + return out + + +@BACKBONES.register_module() +class TCN(BaseBackbone): + """TCN backbone. + + Temporal Convolutional Networks. + More details can be found in the + `paper `__ . + + Args: + in_channels (int): Number of input channels, which equals to + num_keypoints * num_features. + stem_channels (int): Number of feature channels. Default: 1024. + num_blocks (int): NUmber of basic temporal convolutional blocks. + Default: 2. + kernel_sizes (Sequence[int]): Sizes of the convolving kernel of + each basic block. Default: ``(3, 3, 3)``. + dropout (float): Dropout rate. Default: 0.25. + causal (bool): Use causal convolutions instead of symmetric + convolutions (for real-time applications). + Default: False. + residual (bool): Use residual connection. Default: True. + use_stride_conv (bool): Use TCN backbone optimized for + single-frame batching, i.e. where batches have input length = + receptive field, and output length = 1. This implementation + replaces dilated convolutions with strided convolutions to avoid + generating unused intermediate results. The weights are + interchangeable with the reference implementation. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: dict(type='Conv1d'). 
+ norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN1d'). + max_norm (float|None): if not None, the weight of convolution layers + will be clipped to have a maximum norm of max_norm. + + Example: + >>> from mmpose.models import TCN + >>> import torch + >>> self = TCN(in_channels=34) + >>> self.eval() + >>> inputs = torch.rand(1, 34, 243) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 1024, 235) + (1, 1024, 217) + """ + + def __init__(self, + in_channels, + stem_channels=1024, + num_blocks=2, + kernel_sizes=(3, 3, 3), + dropout=0.25, + causal=False, + residual=True, + use_stride_conv=False, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + max_norm=None): + # Protect mutable default arguments + conv_cfg = copy.deepcopy(conv_cfg) + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self.in_channels = in_channels + self.stem_channels = stem_channels + self.num_blocks = num_blocks + self.kernel_sizes = kernel_sizes + self.dropout = dropout + self.causal = causal + self.residual = residual + self.use_stride_conv = use_stride_conv + self.max_norm = max_norm + + assert num_blocks == len(kernel_sizes) - 1 + for ks in kernel_sizes: + assert ks % 2 == 1, 'Only odd filter widths are supported.' + + self.expand_conv = ConvModule( + in_channels, + stem_channels, + kernel_size=kernel_sizes[0], + stride=kernel_sizes[0] if use_stride_conv else 1, + bias='auto', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + + dilation = kernel_sizes[0] + self.tcn_blocks = nn.ModuleList() + for i in range(1, num_blocks + 1): + self.tcn_blocks.append( + BasicTemporalBlock( + in_channels=stem_channels, + out_channels=stem_channels, + mid_channels=stem_channels, + kernel_size=kernel_sizes[i], + dilation=dilation, + dropout=dropout, + causal=causal, + residual=residual, + use_stride_conv=use_stride_conv, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg)) + dilation *= kernel_sizes[i] + + if self.max_norm is not None: + # Apply weight norm clip to conv layers + weight_clip = WeightNormClipHook(self.max_norm) + for module in self.modules(): + if isinstance(module, nn.modules.conv._ConvNd): + weight_clip.register(module) + + self.dropout = nn.Dropout(dropout) if dropout > 0 else None + + def forward(self, x): + """Forward function.""" + x = self.expand_conv(x) + + if self.dropout is not None: + x = self.dropout(x) + + outs = [] + for i in range(self.num_blocks): + x = self.tcn_blocks[i](x) + outs.append(x) + + return tuple(outs) + + def init_weights(self, pretrained=None): + """Initialize the weights.""" + super().init_weights(pretrained) + if pretrained is None: + for m in self.modules(): + if isinstance(m, nn.modules.conv._ConvNd): + kaiming_init(m, mode='fan_in', nonlinearity='relu') + elif isinstance(m, _BatchNorm): + constant_init(m, 1) diff --git a/mmpose/models/backbones/utils/__init__.py b/mmpose/models/backbones/utils/__init__.py new file mode 100644 index 0000000..52a30ca --- /dev/null +++ b/mmpose/models/backbones/utils/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .channel_shuffle import channel_shuffle +from .inverted_residual import InvertedResidual +from .make_divisible import make_divisible +from .se_layer import SELayer +from .utils import load_checkpoint + +__all__ = [ + 'channel_shuffle', 'make_divisible', 'InvertedResidual', 'SELayer', + 'load_checkpoint' +] diff --git a/mmpose/models/backbones/utils/channel_shuffle.py b/mmpose/models/backbones/utils/channel_shuffle.py new file mode 100644 index 0000000..27006a8 --- /dev/null +++ b/mmpose/models/backbones/utils/channel_shuffle.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def channel_shuffle(x, groups): + """Channel Shuffle operation. + + This function enables cross-group information flow for multiple groups + convolution layers. + + Args: + x (Tensor): The input tensor. + groups (int): The number of groups to divide the input tensor + in the channel dimension. + + Returns: + Tensor: The output tensor after channel shuffle operation. + """ + + batch_size, num_channels, height, width = x.size() + assert (num_channels % groups == 0), ('num_channels should be ' + 'divisible by groups') + channels_per_group = num_channels // groups + + x = x.view(batch_size, groups, channels_per_group, height, width) + x = torch.transpose(x, 1, 2).contiguous() + x = x.view(batch_size, -1, height, width) + + return x diff --git a/mmpose/models/backbones/utils/inverted_residual.py b/mmpose/models/backbones/utils/inverted_residual.py new file mode 100644 index 0000000..dff762c --- /dev/null +++ b/mmpose/models/backbones/utils/inverted_residual.py @@ -0,0 +1,128 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule + +from .se_layer import SELayer + + +class InvertedResidual(nn.Module): + """Inverted Residual Block. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + mid_channels (int): The input channels of the depthwise convolution. + kernel_size (int): The kernel size of the depthwise convolution. + Default: 3. + groups (None or int): The group number of the depthwise convolution. + Default: None, which means group number = mid_channels. + stride (int): The stride of the depthwise convolution. Default: 1. + se_cfg (dict): Config dict for se layer. Default: None, which means no + se layer. + with_expand_conv (bool): Use expand conv or not. If set False, + mid_channels must be the same with in_channels. + Default: True. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + + Returns: + Tensor: The output tensor. 
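+
+    Example (usage sketch; the channel numbers are arbitrary illustrative
+        values satisfying the constraints above):
+        >>> import torch
+        >>> from mmpose.models.backbones.utils import InvertedResidual
+        >>> block = InvertedResidual(16, 16, mid_channels=64)
+        >>> x = torch.rand(1, 16, 56, 56)
+        >>> print(tuple(block(x).shape))
+        (1, 16, 56, 56)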
+ """ + + def __init__(self, + in_channels, + out_channels, + mid_channels, + kernel_size=3, + groups=None, + stride=1, + se_cfg=None, + with_expand_conv=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.with_res_shortcut = (stride == 1 and in_channels == out_channels) + assert stride in [1, 2] + self.with_cp = with_cp + self.with_se = se_cfg is not None + self.with_expand_conv = with_expand_conv + + if groups is None: + groups = mid_channels + + if self.with_se: + assert isinstance(se_cfg, dict) + if not self.with_expand_conv: + assert mid_channels == in_channels + + if self.with_expand_conv: + self.expand_conv = ConvModule( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.depthwise_conv = ConvModule( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + padding=kernel_size // 2, + groups=groups, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + if self.with_se: + self.se = SELayer(**se_cfg) + self.linear_conv = ConvModule( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, x): + + def _inner_forward(x): + out = x + + if self.with_expand_conv: + out = self.expand_conv(out) + + out = self.depthwise_conv(out) + + if self.with_se: + out = self.se(out) + + out = self.linear_conv(out) + + if self.with_res_shortcut: + return x + out + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out diff --git a/mmpose/models/backbones/utils/make_divisible.py b/mmpose/models/backbones/utils/make_divisible.py new file mode 100644 index 0000000..b7666be --- /dev/null +++ b/mmpose/models/backbones/utils/make_divisible.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +def make_divisible(value, divisor, min_value=None, min_ratio=0.9): + """Make divisible function. + + This function rounds the channel number down to the nearest value that can + be divisible by the divisor. + + Args: + value (int): The original channel number. + divisor (int): The divisor to fully divide the channel number. + min_value (int, optional): The minimum value of the output channel. + Default: None, means that the minimum value equal to the divisor. + min_ratio (float, optional): The minimum ratio of the rounded channel + number to the original channel number. Default: 0.9. + Returns: + int: The modified output channel number + """ + + if min_value is None: + min_value = divisor + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than (1-min_ratio). + if new_value < min_ratio * value: + new_value += divisor + return new_value diff --git a/mmpose/models/backbones/utils/se_layer.py b/mmpose/models/backbones/utils/se_layer.py new file mode 100644 index 0000000..07f7080 --- /dev/null +++ b/mmpose/models/backbones/utils/se_layer.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import torch.nn as nn +from mmcv.cnn import ConvModule + + +class SELayer(nn.Module): + """Squeeze-and-Excitation Module. 
+ + Args: + channels (int): The input (and output) channels of the SE layer. + ratio (int): Squeeze ratio in SELayer, the intermediate channel will be + ``int(channels/ratio)``. Default: 16. + conv_cfg (None or dict): Config dict for convolution layer. + Default: None, which means using conv2d. + act_cfg (dict or Sequence[dict]): Config dict for activation layer. + If act_cfg is a dict, two activation layers will be configurated + by this dict. If act_cfg is a sequence of dicts, the first + activation layer will be configurated by the first dict and the + second activation layer will be configurated by the second dict. + Default: (dict(type='ReLU'), dict(type='Sigmoid')) + """ + + def __init__(self, + channels, + ratio=16, + conv_cfg=None, + act_cfg=(dict(type='ReLU'), dict(type='Sigmoid'))): + super().__init__() + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert mmcv.is_tuple_of(act_cfg, dict) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.conv1 = ConvModule( + in_channels=channels, + out_channels=int(channels / ratio), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=int(channels / ratio), + out_channels=channels, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x): + out = self.global_avgpool(x) + out = self.conv1(out) + out = self.conv2(out) + return x * out diff --git a/mmpose/models/backbones/utils/utils.py b/mmpose/models/backbones/utils/utils.py new file mode 100644 index 0000000..a9ac948 --- /dev/null +++ b/mmpose/models/backbones/utils/utils.py @@ -0,0 +1,87 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict + +from mmcv.runner.checkpoint import _load_checkpoint, load_state_dict + + +def load_checkpoint(model, + filename, + map_location='cpu', + strict=False, + logger=None): + """Load checkpoint from a file or URI. + + Args: + model (Module): Module to load checkpoint. + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. + map_location (str): Same as :func:`torch.load`. + strict (bool): Whether to allow different params for the model and + checkpoint. + logger (:mod:`logging.Logger` or None): The logger for error message. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + checkpoint = _load_checkpoint(filename, map_location) + # OrderedDict is a subclass of dict + if not isinstance(checkpoint, dict): + raise RuntimeError( + f'No state_dict found in checkpoint file {filename}') + # get state_dict from checkpoint + if 'state_dict' in checkpoint: + state_dict_tmp = checkpoint['state_dict'] + else: + state_dict_tmp = checkpoint + + state_dict = OrderedDict() + # strip prefix of state_dict + for k, v in state_dict_tmp.items(): + if k.startswith('module.backbone.'): + state_dict[k[16:]] = v + elif k.startswith('module.'): + state_dict[k[7:]] = v + elif k.startswith('backbone.'): + state_dict[k[9:]] = v + else: + state_dict[k] = v + # load state_dict + load_state_dict(model, state_dict, strict, logger) + return checkpoint + + +def get_state_dict(filename, map_location='cpu'): + """Get state_dict from a file or URI. + + Args: + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. + map_location (str): Same as :func:`torch.load`. + + Returns: + OrderedDict: The state_dict. 
+ """ + checkpoint = _load_checkpoint(filename, map_location) + # OrderedDict is a subclass of dict + if not isinstance(checkpoint, dict): + raise RuntimeError( + f'No state_dict found in checkpoint file {filename}') + # get state_dict from checkpoint + if 'state_dict' in checkpoint: + state_dict_tmp = checkpoint['state_dict'] + else: + state_dict_tmp = checkpoint + + state_dict = OrderedDict() + # strip prefix of state_dict + for k, v in state_dict_tmp.items(): + if k.startswith('module.backbone.'): + state_dict[k[16:]] = v + elif k.startswith('module.'): + state_dict[k[7:]] = v + elif k.startswith('backbone.'): + state_dict[k[9:]] = v + else: + state_dict[k] = v + + return state_dict diff --git a/mmpose/models/backbones/v2v_net.py b/mmpose/models/backbones/v2v_net.py new file mode 100644 index 0000000..99462af --- /dev/null +++ b/mmpose/models/backbones/v2v_net.py @@ -0,0 +1,257 @@ +# ------------------------------------------------------------------------------ +# Copyright and License Information +# Adapted from +# https://github.com/microsoft/voxelpose-pytorch/blob/main/lib/models/v2v_net.py +# Original Licence: MIT License +# ------------------------------------------------------------------------------ + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +class Basic3DBlock(nn.Module): + """A basic 3D convolutional block. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + kernel_size (int): Kernel size of the convolution operation + conv_cfg (dict): Dictionary to construct and config conv layer. + Default: dict(type='Conv3d') + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN3d') + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d')): + super(Basic3DBlock, self).__init__() + self.block = ConvModule( + in_channels, + out_channels, + kernel_size, + stride=1, + padding=((kernel_size - 1) // 2), + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + bias=True) + + def forward(self, x): + """Forward function.""" + return self.block(x) + + +class Res3DBlock(nn.Module): + """A residual 3D convolutional block. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + kernel_size (int): Kernel size of the convolution operation + Default: 3 + conv_cfg (dict): Dictionary to construct and config conv layer. + Default: dict(type='Conv3d') + norm_cfg (dict): Dictionary to construct and config norm layer. 
+ Default: dict(type='BN3d') + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d')): + super(Res3DBlock, self).__init__() + self.res_branch = nn.Sequential( + ConvModule( + in_channels, + out_channels, + kernel_size, + stride=1, + padding=((kernel_size - 1) // 2), + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + bias=True), + ConvModule( + out_channels, + out_channels, + kernel_size, + stride=1, + padding=((kernel_size - 1) // 2), + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None, + bias=True)) + + if in_channels == out_channels: + self.skip_con = nn.Sequential() + else: + self.skip_con = ConvModule( + in_channels, + out_channels, + 1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None, + bias=True) + + def forward(self, x): + """Forward function.""" + res = self.res_branch(x) + skip = self.skip_con(x) + return F.relu(res + skip, True) + + +class Pool3DBlock(nn.Module): + """A 3D max-pool block. + + Args: + pool_size (int): Pool size of the 3D max-pool layer + """ + + def __init__(self, pool_size): + super(Pool3DBlock, self).__init__() + self.pool_size = pool_size + + def forward(self, x): + """Forward function.""" + return F.max_pool3d( + x, kernel_size=self.pool_size, stride=self.pool_size) + + +class Upsample3DBlock(nn.Module): + """A 3D upsample block. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + kernel_size (int): Kernel size of the transposed convolution operation. + Default: 2 + stride (int): Kernel size of the transposed convolution operation. + Default: 2 + """ + + def __init__(self, in_channels, out_channels, kernel_size=2, stride=2): + super(Upsample3DBlock, self).__init__() + assert kernel_size == 2 + assert stride == 2 + self.block = nn.Sequential( + nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=0, + output_padding=0), nn.BatchNorm3d(out_channels), nn.ReLU(True)) + + def forward(self, x): + """Forward function.""" + return self.block(x) + + +class EncoderDecorder(nn.Module): + """An encoder-decoder block. + + Args: + in_channels (int): Input channels of this block + """ + + def __init__(self, in_channels=32): + super(EncoderDecorder, self).__init__() + + self.encoder_pool1 = Pool3DBlock(2) + self.encoder_res1 = Res3DBlock(in_channels, in_channels * 2) + self.encoder_pool2 = Pool3DBlock(2) + self.encoder_res2 = Res3DBlock(in_channels * 2, in_channels * 4) + + self.mid_res = Res3DBlock(in_channels * 4, in_channels * 4) + + self.decoder_res2 = Res3DBlock(in_channels * 4, in_channels * 4) + self.decoder_upsample2 = Upsample3DBlock(in_channels * 4, + in_channels * 2, 2, 2) + self.decoder_res1 = Res3DBlock(in_channels * 2, in_channels * 2) + self.decoder_upsample1 = Upsample3DBlock(in_channels * 2, in_channels, + 2, 2) + + self.skip_res1 = Res3DBlock(in_channels, in_channels) + self.skip_res2 = Res3DBlock(in_channels * 2, in_channels * 2) + + def forward(self, x): + """Forward function.""" + skip_x1 = self.skip_res1(x) + x = self.encoder_pool1(x) + x = self.encoder_res1(x) + + skip_x2 = self.skip_res2(x) + x = self.encoder_pool2(x) + x = self.encoder_res2(x) + + x = self.mid_res(x) + + x = self.decoder_res2(x) + x = self.decoder_upsample2(x) + x = x + skip_x2 + + x = self.decoder_res1(x) + x = self.decoder_upsample1(x) + x = x + skip_x1 + + return x + + +@BACKBONES.register_module() +class V2VNet(BaseBackbone): + """V2VNet. 
+ + Please refer to the `paper ` + for details. + + Args: + input_channels (int): + Number of channels of the input feature volume. + output_channels (int): + Number of channels of the output volume. + mid_channels (int): + Input and output channels of the encoder-decoder block. + """ + + def __init__(self, input_channels, output_channels, mid_channels=32): + super(V2VNet, self).__init__() + + self.front_layers = nn.Sequential( + Basic3DBlock(input_channels, mid_channels // 2, 7), + Res3DBlock(mid_channels // 2, mid_channels), + ) + + self.encoder_decoder = EncoderDecorder(in_channels=mid_channels) + + self.output_layer = nn.Conv3d( + mid_channels, output_channels, kernel_size=1, stride=1, padding=0) + + self._initialize_weights() + + def forward(self, x): + """Forward function.""" + x = self.front_layers(x) + x = self.encoder_decoder(x) + x = self.output_layer(x) + + return x + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv3d): + nn.init.normal_(m.weight, 0, 0.001) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.ConvTranspose3d): + nn.init.normal_(m.weight, 0, 0.001) + nn.init.constant_(m.bias, 0) diff --git a/mmpose/models/backbones/vgg.py b/mmpose/models/backbones/vgg.py new file mode 100644 index 0000000..f7d4670 --- /dev/null +++ b/mmpose/models/backbones/vgg.py @@ -0,0 +1,193 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule, constant_init, kaiming_init, normal_init +from mmcv.utils.parrots_wrapper import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +def make_vgg_layer(in_channels, + out_channels, + num_blocks, + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + dilation=1, + with_norm=False, + ceil_mode=False): + layers = [] + for _ in range(num_blocks): + layer = ConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + dilation=dilation, + padding=dilation, + bias=True, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + layers.append(layer) + in_channels = out_channels + layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode)) + + return layers + + +@BACKBONES.register_module() +class VGG(BaseBackbone): + """VGG backbone. + + Args: + depth (int): Depth of vgg, from {11, 13, 16, 19}. + with_norm (bool): Use BatchNorm or not. + num_classes (int): number of classes for classification. + num_stages (int): VGG stages, normally 5. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. When it is None, the default behavior depends on + whether num_classes is specified. If num_classes <= 0, the default + value is (4, ), outputting the last feature map before classifier. + If num_classes > 0, the default value is (5, ), outputting the + classification score. Default: None. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + ceil_mode (bool): Whether to use ceil_mode of MaxPool. Default: False. + with_last_pool (bool): Whether to keep the last pooling before + classifier. Default: True. + """ + + # Parameters to build layers. 
Each element specifies the number of conv in + # each stage. For example, VGG11 contains 11 layers with learnable + # parameters. 11 is computed as 11 = (1 + 1 + 2 + 2 + 2) + 3, + # where 3 indicates the last three fully-connected layers. + arch_settings = { + 11: (1, 1, 2, 2, 2), + 13: (2, 2, 2, 2, 2), + 16: (2, 2, 3, 3, 3), + 19: (2, 2, 4, 4, 4) + } + + def __init__(self, + depth, + num_classes=-1, + num_stages=5, + dilations=(1, 1, 1, 1, 1), + out_indices=None, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + norm_eval=False, + ceil_mode=False, + with_last_pool=True): + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for vgg') + assert num_stages >= 1 and num_stages <= 5 + stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + assert len(dilations) == num_stages + + self.num_classes = num_classes + self.frozen_stages = frozen_stages + self.norm_eval = norm_eval + with_norm = norm_cfg is not None + + if out_indices is None: + out_indices = (5, ) if num_classes > 0 else (4, ) + assert max(out_indices) <= num_stages + self.out_indices = out_indices + + self.in_channels = 3 + start_idx = 0 + vgg_layers = [] + self.range_sub_modules = [] + for i, num_blocks in enumerate(self.stage_blocks): + num_modules = num_blocks + 1 + end_idx = start_idx + num_modules + dilation = dilations[i] + out_channels = 64 * 2**i if i < 4 else 512 + vgg_layer = make_vgg_layer( + self.in_channels, + out_channels, + num_blocks, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + dilation=dilation, + with_norm=with_norm, + ceil_mode=ceil_mode) + vgg_layers.extend(vgg_layer) + self.in_channels = out_channels + self.range_sub_modules.append([start_idx, end_idx]) + start_idx = end_idx + if not with_last_pool: + vgg_layers.pop(-1) + self.range_sub_modules[-1][1] -= 1 + self.module_name = 'features' + self.add_module(self.module_name, nn.Sequential(*vgg_layers)) + + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Linear(512 * 7 * 7, 4096), + nn.ReLU(True), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(True), + nn.Dropout(), + nn.Linear(4096, num_classes), + ) + + def init_weights(self, pretrained=None): + super().init_weights(pretrained) + if pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, _BatchNorm): + constant_init(m, 1) + elif isinstance(m, nn.Linear): + normal_init(m, std=0.01) + + def forward(self, x): + outs = [] + vgg_layers = getattr(self, self.module_name) + for i in range(len(self.stage_blocks)): + for j in range(*self.range_sub_modules[i]): + vgg_layer = vgg_layers[j] + x = vgg_layer(x) + if i in self.out_indices: + outs.append(x) + if self.num_classes > 0: + x = x.view(x.size(0), -1) + x = self.classifier(x) + outs.append(x) + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + def _freeze_stages(self): + vgg_layers = getattr(self, self.module_name) + for i in range(self.frozen_stages): + for j in range(*self.range_sub_modules[i]): + m = vgg_layers[j] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/backbones/vipnas_mbv3.py b/mmpose/models/backbones/vipnas_mbv3.py new file mode 100644 index 
0000000..ed990e3 --- /dev/null +++ b/mmpose/models/backbones/vipnas_mbv3.py @@ -0,0 +1,179 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging + +import torch.nn as nn +from mmcv.cnn import ConvModule +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import InvertedResidual, load_checkpoint + + +@BACKBONES.register_module() +class ViPNAS_MobileNetV3(BaseBackbone): + """ViPNAS_MobileNetV3 backbone. + + "ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search" + More details can be found in the `paper + `__ . + + Args: + wid (list(int)): Searched width config for each stage. + expan (list(int)): Searched expansion ratio config for each stage. + dep (list(int)): Searched depth config for each stage. + ks (list(int)): Searched kernel size config for each stage. + group (list(int)): Searched group number config for each stage. + att (list(bool)): Searched attention config for each stage. + stride (list(int)): Stride config for each stage. + act (list(dict)): Activation config for each stage. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save + some memory while slowing down the training speed. + Default: False. + """ + + def __init__(self, + wid=[16, 16, 24, 40, 80, 112, 160], + expan=[None, 1, 5, 4, 5, 5, 6], + dep=[None, 1, 4, 4, 4, 4, 4], + ks=[3, 3, 7, 7, 5, 7, 5], + group=[None, 8, 120, 20, 100, 280, 240], + att=[None, True, True, False, True, True, True], + stride=[2, 1, 2, 2, 2, 1, 2], + act=[ + 'HSwish', 'ReLU', 'ReLU', 'ReLU', 'HSwish', 'HSwish', + 'HSwish' + ], + conv_cfg=None, + norm_cfg=dict(type='BN'), + frozen_stages=-1, + norm_eval=False, + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self.wid = wid + self.expan = expan + self.dep = dep + self.ks = ks + self.group = group + self.att = att + self.stride = stride + self.act = act + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.frozen_stages = frozen_stages + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.wid[0], + kernel_size=self.ks[0], + stride=self.stride[0], + padding=self.ks[0] // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=dict(type=self.act[0])) + + self.layers = self._make_layer() + + def _make_layer(self): + layers = [] + layer_index = 0 + for i, dep in enumerate(self.dep[1:]): + mid_channels = self.wid[i + 1] * self.expan[i + 1] + + if self.att[i + 1]: + se_cfg = dict( + channels=mid_channels, + ratio=4, + act_cfg=(dict(type='ReLU'), dict(type='HSigmoid'))) + else: + se_cfg = None + + if self.expan[i + 1] == 1: + with_expand_conv = False + else: + with_expand_conv = True + + for j in range(dep): + if j == 0: + stride = self.stride[i + 1] + in_channels = self.wid[i] + else: + stride = 1 + in_channels = self.wid[i + 1] + + layer = InvertedResidual( + in_channels=in_channels, + out_channels=self.wid[i + 1], + mid_channels=mid_channels, + 
kernel_size=self.ks[i + 1], + groups=self.group[i + 1], + stride=stride, + se_cfg=se_cfg, + with_expand_conv=with_expand_conv, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type=self.act[i + 1]), + with_cp=self.with_cp) + layer_index += 1 + layer_name = f'layer{layer_index}' + self.add_module(layer_name, layer) + layers.append(layer_name) + return layers + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.001) + for name, _ in m.named_parameters(): + if name in ['bias']: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + x = self.conv1(x) + + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + + return x + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(1, self.frozen_stages + 1): + layer = getattr(self, f'layer{i}') + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/backbones/vipnas_resnet.py b/mmpose/models/backbones/vipnas_resnet.py new file mode 100644 index 0000000..81b028e --- /dev/null +++ b/mmpose/models/backbones/vipnas_resnet.py @@ -0,0 +1,589 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule, build_conv_layer, build_norm_layer +from mmcv.cnn.bricks import ContextBlock +from mmcv.utils.parrots_wrapper import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +class ViPNAS_Bottleneck(nn.Module): + """Bottleneck block for ViPNAS_ResNet. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + expansion (int): The ratio of ``out_channels/mid_channels`` where + ``mid_channels`` is the input/output channels of conv2. Default: 4. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None. + style (str): ``"pytorch"`` or ``"caffe"``. If set to "pytorch", the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Default: "pytorch". + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + kernel_size (int): kernel size of conv2 searched in ViPANS. + groups (int): group number of conv2 searched in ViPNAS. + attention (bool): whether to use attention module in the end of + the block. 
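+
+    Example (a minimal, hedged shape-check sketch; the channel numbers below are
+    illustrative and are not the searched ViPNAS configuration)::
+
+        >>> import torch
+        >>> block = ViPNAS_Bottleneck(64, 64, expansion=4, kernel_size=3, groups=1)
+        >>> out = block(torch.randn(1, 64, 56, 56))
+        >>> out.shape
+        torch.Size([1, 64, 56, 56])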
+ """ + + def __init__(self, + in_channels, + out_channels, + expansion=4, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + kernel_size=3, + groups=1, + attention=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + assert style in ['pytorch', 'caffe'] + + self.in_channels = in_channels + self.out_channels = out_channels + self.expansion = expansion + assert out_channels % expansion == 0 + self.mid_channels = out_channels // expansion + self.stride = stride + self.dilation = dilation + self.style = style + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + if self.style == 'pytorch': + self.conv1_stride = 1 + self.conv2_stride = stride + else: + self.conv1_stride = stride + self.conv2_stride = 1 + + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + norm_cfg, self.mid_channels, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + norm_cfg, out_channels, postfix=3) + + self.conv1 = build_conv_layer( + conv_cfg, + in_channels, + self.mid_channels, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, + self.mid_channels, + self.mid_channels, + kernel_size=kernel_size, + stride=self.conv2_stride, + padding=kernel_size // 2, + groups=groups, + dilation=dilation, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + conv_cfg, + self.mid_channels, + out_channels, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + if attention: + self.attention = ContextBlock(out_channels, + max(1.0 / 16, 16.0 / out_channels)) + else: + self.attention = None + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + @property + def norm3(self): + """nn.Module: the normalization layer named "norm3" """ + return getattr(self, self.norm3_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.norm3(out) + + if self.attention is not None: + out = self.attention(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +def get_expansion(block, expansion=None): + """Get the expansion of a residual block. + + The block expansion will be obtained by the following order: + + 1. If ``expansion`` is given, just return it. + 2. If ``block`` has the attribute ``expansion``, then return + ``block.expansion``. + 3. Return the default value according the the block type: + 4 for ``ViPNAS_Bottleneck``. + + Args: + block (class): The block class. + expansion (int | None): The given expansion ratio. + + Returns: + int: The expansion of the block. 
+ """ + if isinstance(expansion, int): + assert expansion > 0 + elif expansion is None: + if hasattr(block, 'expansion'): + expansion = block.expansion + elif issubclass(block, ViPNAS_Bottleneck): + expansion = 1 + else: + raise TypeError(f'expansion is not specified for {block.__name__}') + else: + raise TypeError('expansion must be an integer or None') + + return expansion + + +class ViPNAS_ResLayer(nn.Sequential): + """ViPNAS_ResLayer to build ResNet style backbone. + + Args: + block (nn.Module): Residual block used to build ViPNAS ResLayer. + num_blocks (int): Number of blocks. + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + expansion (int, optional): The expansion for BasicBlock/Bottleneck. + If not specified, it will firstly be obtained via + ``block.expansion``. If the block has no attribute "expansion", + the following default values will be used: 1 for BasicBlock and + 4 for Bottleneck. Default: None. + stride (int): stride of the first block. Default: 1. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + downsample_first (bool): Downsample at the first block or last block. + False for Hourglass, True for ResNet. Default: True + kernel_size (int): Kernel Size of the corresponding convolution layer + searched in the block. + groups (int): Group number of the corresponding convolution layer + searched in the block. + attention (bool): Whether to use attention module in the end of the + block. + """ + + def __init__(self, + block, + num_blocks, + in_channels, + out_channels, + expansion=None, + stride=1, + avg_down=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + downsample_first=True, + kernel_size=3, + groups=1, + attention=False, + **kwargs): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + self.block = block + self.expansion = get_expansion(block, expansion) + + downsample = None + if stride != 1 or in_channels != out_channels: + downsample = [] + conv_stride = stride + if avg_down and stride != 1: + conv_stride = 1 + downsample.append( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False)) + downsample.extend([ + build_conv_layer( + conv_cfg, + in_channels, + out_channels, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, out_channels)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + if downsample_first: + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + kernel_size=kernel_size, + groups=groups, + attention=attention, + **kwargs)) + in_channels = out_channels + for _ in range(1, num_blocks): + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + kernel_size=kernel_size, + groups=groups, + attention=attention, + **kwargs)) + else: # downsample_first=False is for HourglassModule + for i in range(0, num_blocks - 1): + layers.append( + block( + in_channels=in_channels, + out_channels=in_channels, + expansion=self.expansion, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + kernel_size=kernel_size, + 
groups=groups, + attention=attention, + **kwargs)) + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + kernel_size=kernel_size, + groups=groups, + attention=attention, + **kwargs)) + + super().__init__(*layers) + + +@BACKBONES.register_module() +class ViPNAS_ResNet(BaseBackbone): + """ViPNAS_ResNet backbone. + + "ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search" + More details can be found in the `paper + `__ . + + Args: + depth (int): Network depth, from {18, 34, 50, 101, 152}. + in_channels (int): Number of input image channels. Default: 3. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + wid (list(int)): Searched width config for each stage. + expan (list(int)): Searched expansion ratio config for each stage. + dep (list(int)): Searched depth config for each stage. + ks (list(int)): Searched kernel size config for each stage. + group (list(int)): Searched group number config for each stage. + att (list(bool)): Searched attention config for each stage. 
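+
+    Example (a hedged usage sketch with the default searched config; the 256x192
+    input is just an illustrative pose-style resolution)::
+
+        >>> import torch
+        >>> model = ViPNAS_ResNet(depth=50)
+        >>> feat = model(torch.randn(1, 3, 256, 192))
+        >>> feat.shape   # stride-32 feature map with wid[-1]=608 channels
+        torch.Size([1, 608, 8, 6])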
+ """ + + arch_settings = { + 50: ViPNAS_Bottleneck, + } + + def __init__(self, + depth, + in_channels=3, + num_stages=4, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(3, ), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + with_cp=False, + zero_init_residual=True, + wid=[48, 80, 160, 304, 608], + expan=[None, 1, 1, 1, 1], + dep=[None, 4, 6, 7, 3], + ks=[7, 3, 5, 5, 5], + group=[None, 16, 16, 16, 16], + att=[None, True, False, True, True]): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + self.depth = depth + self.stem_channels = dep[0] + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.zero_init_residual = zero_init_residual + self.block = self.arch_settings[depth] + self.stage_blocks = dep[1:1 + num_stages] + + self._make_stem_layer(in_channels, wid[0], ks[0]) + + self.res_layers = [] + _in_channels = wid[0] + for i, num_blocks in enumerate(self.stage_blocks): + expansion = get_expansion(self.block, expan[i + 1]) + _out_channels = wid[i + 1] * expansion + stride = strides[i] + dilation = dilations[i] + res_layer = self.make_res_layer( + block=self.block, + num_blocks=num_blocks, + in_channels=_in_channels, + out_channels=_out_channels, + expansion=expansion, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + kernel_size=ks[i + 1], + groups=group[i + 1], + attention=att[i + 1]) + _in_channels = _out_channels + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = res_layer[-1].out_channels + + def make_res_layer(self, **kwargs): + """Make a ViPNAS ResLayer.""" + return ViPNAS_ResLayer(**kwargs) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def _make_stem_layer(self, in_channels, stem_channels, kernel_size): + """Make stem layer.""" + if self.deep_stem: + self.stem = nn.Sequential( + ConvModule( + in_channels, + stem_channels // 2, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True), + ConvModule( + stem_channels // 2, + stem_channels // 2, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True), + ConvModule( + stem_channels // 2, + stem_channels, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True)) + else: + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels, + kernel_size=kernel_size, + stride=2, + padding=kernel_size // 2, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, stem_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + self.maxpool = 
nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + def _freeze_stages(self): + """Freeze parameters.""" + if self.frozen_stages >= 0: + if self.deep_stem: + self.stem.eval() + for param in self.stem.parameters(): + param.requires_grad = False + else: + self.norm1.eval() + for m in [self.conv1, self.norm1]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize model weights.""" + super().init_weights(pretrained) + if pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.001) + for name, _ in m.named_parameters(): + if name in ['bias']: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward(self, x): + """Forward function.""" + if self.deep_stem: + x = self.stem(x) + else: + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/backbones/vit.py b/mmpose/models/backbones/vit.py new file mode 100644 index 0000000..2719d1a --- /dev/null +++ b/mmpose/models/backbones/vit.py @@ -0,0 +1,341 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +from functools import partial +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint + +from timm.models.layers import drop_path, to_2tuple, trunc_normal_ + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + +def get_abs_pos(abs_pos, h, w, ori_h, ori_w, has_cls_token=True): + """ + Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token + dimension for the original embeddings. + Args: + abs_pos (Tensor): absolute positional embeddings with (1, num_position, C). + has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token. + hw (Tuple): size of input image tokens. + + Returns: + Absolute positional embeddings after processing with shape (1, H, W, C) + """ + cls_token = None + B, L, C = abs_pos.shape + if has_cls_token: + cls_token = abs_pos[:, 0:1] + abs_pos = abs_pos[:, 1:] + + if ori_h != h or ori_w != w: + new_abs_pos = F.interpolate( + abs_pos.reshape(1, ori_h, ori_w, -1).permute(0, 3, 1, 2), + size=(h, w), + mode="bicubic", + align_corners=False, + ).permute(0, 2, 3, 1).reshape(B, -1, C) + + else: + new_abs_pos = abs_pos + + if cls_token is not None: + new_abs_pos = torch.cat([cls_token, new_abs_pos], dim=1) + return new_abs_pos + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
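+
+    Example (hedged sketch: stochastic depth is disabled in eval mode)::
+
+        >>> import torch
+        >>> dp = DropPath(drop_prob=0.2).eval()
+        >>> x = torch.ones(2, 4)
+        >>> torch.equal(dp(x), x)
+        True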
+ """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self): + return 'p={}'.format(self.drop_prob) + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.fc2(x) + x = self.drop(x) + return x + +class Attention(nn.Module): + def __init__( + self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., + proj_drop=0., attn_head_dim=None,): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.dim = dim + + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=qkv_bias) + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + + return x + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, + drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, + norm_layer=nn.LayerNorm, attn_head_dim=None + ): + super().__init__() + + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim + ) + + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, ratio=1): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (ratio ** 2) + self.patch_shape = (int(img_size[0] // patch_size[0] * ratio), int(img_size[1] // patch_size[1] * ratio)) + self.origin_patch_shape = (int(img_size[0] // patch_size[0]), int(img_size[1] // patch_size[1])) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=(patch_size[0] // ratio), padding=4 + 2 * (ratio//2-1)) + + def forward(self, x, **kwargs): + B, C, H, W = x.shape + x = self.proj(x) + Hp, Wp = x.shape[2], x.shape[3] + + x = x.flatten(2).transpose(1, 2) + return x, (Hp, Wp) + + +class HybridEmbed(nn.Module): + """ CNN Feature Map Embedding + Extract feature map from CNN, flatten, project to embedding dim. + """ + def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768): + super().__init__() + assert isinstance(backbone, nn.Module) + img_size = to_2tuple(img_size) + self.img_size = img_size + self.backbone = backbone + if feature_size is None: + with torch.no_grad(): + training = backbone.training + if training: + backbone.eval() + o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1] + feature_size = o.shape[-2:] + feature_dim = o.shape[1] + backbone.train(training) + else: + feature_size = to_2tuple(feature_size) + feature_dim = self.backbone.feature_info.channels()[-1] + self.num_patches = feature_size[0] * feature_size[1] + self.proj = nn.Linear(feature_dim, embed_dim) + + def forward(self, x): + x = self.backbone(x)[-1] + x = x.flatten(2).transpose(1, 2) + x = self.proj(x) + return x + + +@BACKBONES.register_module() +class ViT(BaseBackbone): + + def __init__(self, + img_size=224, patch_size=16, in_chans=3, num_classes=80, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., hybrid_backbone=None, norm_layer=None, use_checkpoint=False, + frozen_stages=-1, ratio=1, last_norm=True, + patch_padding='pad', freeze_attn=False, freeze_ffn=False, + ): + # Protect mutable default arguments + super(ViT, self).__init__() + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.frozen_stages = frozen_stages + self.use_checkpoint = use_checkpoint + self.patch_padding = patch_padding + self.freeze_attn = freeze_attn + self.freeze_ffn = freeze_ffn + self.depth = depth + + if hybrid_backbone is not None: + self.patch_embed = HybridEmbed( + hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim) + else: + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, ratio=ratio) + num_patches = self.patch_embed.num_patches + + # since the pretraining model has class token + 
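# the "+ 1" slot below keeps the class-token position so checkpoints pretrained with a
+ # cls token still load; forward_features() adds pos_embed[:, 1:] to the patch tokens and
+ # broadcasts the cls slot pos_embed[:, :1] onto them instead of keeping a separate cls token
+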
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + ) + for i in range(depth)]) + + self.last_norm = norm_layer(embed_dim) if last_norm else nn.Identity() + + if self.pos_embed is not None: + trunc_normal_(self.pos_embed, std=.02) + + self._freeze_stages() + + def _freeze_stages(self): + """Freeze parameters.""" + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = self.blocks[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + if self.freeze_attn: + for i in range(0, self.depth): + m = self.blocks[i] + m.attn.eval() + m.norm1.eval() + for param in m.attn.parameters(): + param.requires_grad = False + for param in m.norm1.parameters(): + param.requires_grad = False + + if self.freeze_ffn: + self.pos_embed.requires_grad = False + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + for i in range(0, self.depth): + m = self.blocks[i] + m.mlp.eval() + m.norm2.eval() + for param in m.mlp.parameters(): + param.requires_grad = False + for param in m.norm2.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + super().init_weights(pretrained, patch_padding=self.patch_padding) + + if pretrained is None: + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + self.apply(_init_weights) + + def get_num_layers(self): + return len(self.blocks) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def forward_features(self, x): + B, C, H, W = x.shape + x, (Hp, Wp) = self.patch_embed(x) + + if self.pos_embed is not None: + # fit for multiple GPU training + # since the first element for pos embed (sin-cos manner) is zero, it will cause no difference + x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1] + + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + + x = self.last_norm(x) + + xp = x.permute(0, 2, 1).reshape(B, -1, Hp, Wp).contiguous() + + return xp + + def forward(self, x): + x = self.forward_features(x) + return x + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self._freeze_stages() diff --git a/mmpose/models/backbones/vit_moe.py b/mmpose/models/backbones/vit_moe.py new file mode 100644 index 0000000..880a58f --- /dev/null +++ b/mmpose/models/backbones/vit_moe.py @@ -0,0 +1,385 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
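+# Note: this module mirrors the plain ViT backbone in vit.py; the functional difference is
+# that each Block uses MoEMlp, whose second FC layer is split into a shared part and a set of
+# per-expert heads selected by the ``indices`` (dataset_source) argument, so samples from
+# different dataset sources can be routed to different FFN experts.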
+import math + +import torch +from functools import partial +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint + +from timm.models.layers import drop_path, to_2tuple, trunc_normal_ + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + +def get_abs_pos(abs_pos, h, w, ori_h, ori_w, has_cls_token=True): + """ + Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token + dimension for the original embeddings. + Args: + abs_pos (Tensor): absolute positional embeddings with (1, num_position, C). + has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token. + hw (Tuple): size of input image tokens. + + Returns: + Absolute positional embeddings after processing with shape (1, H, W, C) + """ + cls_token = None + B, L, C = abs_pos.shape + if has_cls_token: + cls_token = abs_pos[:, 0:1] + abs_pos = abs_pos[:, 1:] + + if ori_h != h or ori_w != w: + new_abs_pos = F.interpolate( + abs_pos.reshape(1, ori_h, ori_w, -1).permute(0, 3, 1, 2), + size=(h, w), + mode="bicubic", + align_corners=False, + ).permute(0, 2, 3, 1).reshape(B, -1, C) + + else: + new_abs_pos = abs_pos + + if cls_token is not None: + new_abs_pos = torch.cat([cls_token, new_abs_pos], dim=1) + return new_abs_pos + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self): + return 'p={}'.format(self.drop_prob) + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.fc2(x) + x = self.drop(x) + return x + +class MoEMlp(nn.Module): + def __init__(self, num_expert=1, in_features=1024, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., part_features=256): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.part_features = part_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features - part_features) + self.drop = nn.Dropout(drop) + + self.num_expert = num_expert + experts = [] + + for i in range(num_expert): + experts.append( + nn.Linear(hidden_features, part_features) + ) + self.experts = nn.ModuleList(experts) + + def forward(self, x, indices): + + expert_x = torch.zeros_like(x[:, :, -self.part_features:], device=x.device, dtype=x.dtype) + + x = self.fc1(x) + x = self.act(x) + shared_x = self.fc2(x) + indices = indices.view(-1, 1, 1) + + # to support ddp training + for i in range(self.num_expert): + selectedIndex = (indices == i) + current_x = self.experts[i](x) * selectedIndex + expert_x = expert_x + current_x + + x = torch.cat([shared_x, expert_x], dim=-1) + + return x + +class Attention(nn.Module): + def __init__( + self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., + proj_drop=0., attn_head_dim=None,): + super().__init__() + self.num_heads = 
num_heads + head_dim = dim // num_heads + self.dim = dim + + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=qkv_bias) + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + + return x + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, + drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, + norm_layer=nn.LayerNorm, attn_head_dim=None, num_expert=1, part_features=None + ): + super().__init__() + + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim + ) + + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = MoEMlp(num_expert=num_expert, in_features=dim, hidden_features=mlp_hidden_dim, + act_layer=act_layer, drop=drop, part_features=part_features) + + def forward(self, x, indices=None): + + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x), indices)) + return x + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, ratio=1): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (ratio ** 2) + self.patch_shape = (int(img_size[0] // patch_size[0] * ratio), int(img_size[1] // patch_size[1] * ratio)) + self.origin_patch_shape = (int(img_size[0] // patch_size[0]), int(img_size[1] // patch_size[1])) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=(patch_size[0] // ratio), padding=4 + 2 * (ratio//2-1)) + + def forward(self, x, **kwargs): + B, C, H, W = x.shape + x = self.proj(x) + Hp, Wp = x.shape[2], x.shape[3] + + x = x.flatten(2).transpose(1, 2) + return x, (Hp, Wp) + + +class HybridEmbed(nn.Module): + """ CNN Feature Map Embedding + Extract feature map from CNN, flatten, project to embedding dim. 
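+
+    Example (a hedged sketch with a toy CNN; ``ToyCNN`` is hypothetical and only needs to
+    return a list/tuple of feature maps, since ``forward`` uses the last one)::
+
+        >>> import torch
+        >>> import torch.nn as nn
+        >>> class ToyCNN(nn.Module):
+        ...     def __init__(self):
+        ...         super().__init__()
+        ...         self.conv = nn.Conv2d(3, 32, 3, stride=16, padding=1)
+        ...     def forward(self, x):
+        ...         return [self.conv(x)]
+        >>> embed = HybridEmbed(ToyCNN(), img_size=224, embed_dim=768)
+        >>> embed(torch.randn(1, 3, 224, 224)).shape
+        torch.Size([1, 196, 768])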
+ """ + def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768): + super().__init__() + assert isinstance(backbone, nn.Module) + img_size = to_2tuple(img_size) + self.img_size = img_size + self.backbone = backbone + if feature_size is None: + with torch.no_grad(): + training = backbone.training + if training: + backbone.eval() + o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1] + feature_size = o.shape[-2:] + feature_dim = o.shape[1] + backbone.train(training) + else: + feature_size = to_2tuple(feature_size) + feature_dim = self.backbone.feature_info.channels()[-1] + self.num_patches = feature_size[0] * feature_size[1] + self.proj = nn.Linear(feature_dim, embed_dim) + + def forward(self, x): + x = self.backbone(x)[-1] + x = x.flatten(2).transpose(1, 2) + x = self.proj(x) + return x + + +@BACKBONES.register_module() +class ViTMoE(BaseBackbone): + + def __init__(self, + img_size=224, patch_size=16, in_chans=3, num_classes=80, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., hybrid_backbone=None, norm_layer=None, use_checkpoint=False, + frozen_stages=-1, ratio=1, last_norm=True, + patch_padding='pad', freeze_attn=False, freeze_ffn=False, + num_expert=1, part_features=None + ): + # Protect mutable default arguments + super(ViTMoE, self).__init__() + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.frozen_stages = frozen_stages + self.use_checkpoint = use_checkpoint + self.patch_padding = patch_padding + self.freeze_attn = freeze_attn + self.freeze_ffn = freeze_ffn + self.depth = depth + + if hybrid_backbone is not None: + self.patch_embed = HybridEmbed( + hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim) + else: + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, ratio=ratio) + num_patches = self.patch_embed.num_patches + + self.part_features = part_features + + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + num_expert=num_expert, part_features=part_features + ) + for i in range(depth)]) + + self.last_norm = norm_layer(embed_dim) if last_norm else nn.Identity() + + if self.pos_embed is not None: + trunc_normal_(self.pos_embed, std=.02) + + self._freeze_stages() + + def _freeze_stages(self): + """Freeze parameters.""" + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = self.blocks[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + if self.freeze_attn: + for i in range(0, self.depth): + m = self.blocks[i] + m.attn.eval() + m.norm1.eval() + for param in m.attn.parameters(): + param.requires_grad = False + for param in m.norm1.parameters(): + param.requires_grad = False + + if self.freeze_ffn: + self.pos_embed.requires_grad = False + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + 
param.requires_grad = False + for i in range(0, self.depth): + m = self.blocks[i] + m.mlp.eval() + m.norm2.eval() + for param in m.mlp.parameters(): + param.requires_grad = False + for param in m.norm2.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + super().init_weights(pretrained, patch_padding=self.patch_padding, part_features=self.part_features) + + if pretrained is None: + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + self.apply(_init_weights) + + def get_num_layers(self): + return len(self.blocks) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def forward_features(self, x, dataset_source=None): + B, C, H, W = x.shape + x, (Hp, Wp) = self.patch_embed(x) + + if self.pos_embed is not None: + # fit for multiple GPU training + # since the first element for pos embed (sin-cos manner) is zero, it will cause no difference + x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1] + + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, dataset_source) + else: + x = blk(x, dataset_source) + + x = self.last_norm(x) + + xp = x.permute(0, 2, 1).reshape(B, -1, Hp, Wp).contiguous() + + return xp + + def forward(self, x, dataset_source=None): + x = self.forward_features(x, dataset_source) + return x + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self._freeze_stages() diff --git a/mmpose/models/builder.py b/mmpose/models/builder.py new file mode 100644 index 0000000..220839d --- /dev/null +++ b/mmpose/models/builder.py @@ -0,0 +1,44 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import MODELS as MMCV_MODELS +from mmcv.cnn import build_model_from_cfg +from mmcv.utils import Registry + +MODELS = Registry( + 'models', build_func=build_model_from_cfg, parent=MMCV_MODELS) + +BACKBONES = MODELS +NECKS = MODELS +HEADS = MODELS +LOSSES = MODELS +POSENETS = MODELS +MESH_MODELS = MODELS + + +def build_backbone(cfg): + """Build backbone.""" + return BACKBONES.build(cfg) + + +def build_neck(cfg): + """Build neck.""" + return NECKS.build(cfg) + + +def build_head(cfg): + """Build head.""" + return HEADS.build(cfg) + + +def build_loss(cfg): + """Build loss.""" + return LOSSES.build(cfg) + + +def build_posenet(cfg): + """Build posenet.""" + return POSENETS.build(cfg) + + +def build_mesh_model(cfg): + """Build mesh model.""" + return MESH_MODELS.build(cfg) diff --git a/mmpose/models/detectors/__init__.py b/mmpose/models/detectors/__init__.py new file mode 100644 index 0000000..e098209 --- /dev/null +++ b/mmpose/models/detectors/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .associative_embedding import AssociativeEmbedding +from .interhand_3d import Interhand3D +from .mesh import ParametricMesh +from .multi_task import MultiTask +from .multiview_pose import (DetectAndRegress, VoxelCenterDetector, + VoxelSinglePose) +from .pose_lifter import PoseLifter +from .posewarper import PoseWarper +from .top_down import TopDown +from .top_down_moe import TopDownMoE + +__all__ = [ + 'TopDown', 'AssociativeEmbedding', 'ParametricMesh', 'MultiTask', + 'PoseLifter', 'Interhand3D', 'PoseWarper', 'DetectAndRegress', + 'VoxelCenterDetector', 'VoxelSinglePose', 'TopDownMoE' +] diff --git a/mmpose/models/detectors/associative_embedding.py b/mmpose/models/detectors/associative_embedding.py new file mode 100644 index 0000000..100c780 --- /dev/null +++ b/mmpose/models/detectors/associative_embedding.py @@ -0,0 +1,420 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import mmcv +import torch +from mmcv.image import imwrite +from mmcv.utils.misc import deprecated_api_warning +from mmcv.visualization.image import imshow + +from mmpose.core.evaluation import (aggregate_scale, aggregate_stage_flip, + flip_feature_maps, get_group_preds, + split_ae_outputs) +from mmpose.core.post_processing.group import HeatmapParser +from mmpose.core.visualization import imshow_keypoints +from .. import builder +from ..builder import POSENETS +from .base import BasePose + +try: + from mmcv.runner import auto_fp16 +except ImportError: + warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0' + 'Please install mmcv>=1.1.4') + from mmpose.core import auto_fp16 + + +@POSENETS.register_module() +class AssociativeEmbedding(BasePose): + """Associative embedding pose detectors. + + Args: + backbone (dict): Backbone modules to extract feature. + keypoint_head (dict): Keypoint head to process feature. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + pretrained (str): Path to the pretrained models. + loss_pose (None): Deprecated arguments. Please use + ``loss_keypoint`` for heads instead. + """ + + def __init__(self, + backbone, + keypoint_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + loss_pose=None): + super().__init__() + self.fp16_enabled = False + + self.backbone = builder.build_backbone(backbone) + + if keypoint_head is not None: + if 'loss_keypoint' not in keypoint_head and loss_pose is not None: + warnings.warn( + '`loss_pose` for BottomUp is deprecated, ' + 'use `loss_keypoint` for heads instead. See ' + 'https://github.com/open-mmlab/mmpose/pull/382' + ' for more information.', DeprecationWarning) + keypoint_head['loss_keypoint'] = loss_pose + + self.keypoint_head = builder.build_head(keypoint_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.use_udp = test_cfg.get('use_udp', False) + self.parser = HeatmapParser(self.test_cfg) + self.init_weights(pretrained=pretrained) + + @property + def with_keypoint(self): + """Check if has keypoint_head.""" + return hasattr(self, 'keypoint_head') + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + self.backbone.init_weights(pretrained) + if self.with_keypoint: + self.keypoint_head.init_weights() + + @auto_fp16(apply_to=('img', )) + def forward(self, + img=None, + targets=None, + masks=None, + joints=None, + img_metas=None, + return_loss=True, + return_heatmap=False, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss is True. 
+ + Note: + - batch_size: N + - num_keypoints: K + - num_img_channel: C + - img_width: imgW + - img_height: imgH + - heatmaps weight: W + - heatmaps height: H + - max_num_people: M + + Args: + img (torch.Tensor[N,C,imgH,imgW]): Input image. + targets (list(torch.Tensor[N,K,H,W])): Multi-scale target heatmaps. + masks (list(torch.Tensor[N,H,W])): Masks of multi-scale target + heatmaps + joints (list(torch.Tensor[N,M,K,2])): Joints of multi-scale target + heatmaps for ae loss + img_metas (dict): Information about val & test. + By default it includes: + + - "image_file": image path + - "aug_data": input + - "test_scale_factor": test scale factor + - "base_size": base size of input + - "center": center of image + - "scale": scale of image + - "flip_index": flip index of keypoints + return loss (bool): ``return_loss=True`` for training, + ``return_loss=False`` for validation & test. + return_heatmap (bool) : Option to return heatmap. + + Returns: + dict|tuple: if 'return_loss' is true, then return losses. \ + Otherwise, return predicted poses, scores, image \ + paths and heatmaps. + """ + + if return_loss: + return self.forward_train(img, targets, masks, joints, img_metas, + **kwargs) + return self.forward_test( + img, img_metas, return_heatmap=return_heatmap, **kwargs) + + def forward_train(self, img, targets, masks, joints, img_metas, **kwargs): + """Forward the bottom-up model and calculate the loss. + + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + heatmaps weight: W + heatmaps height: H + max_num_people: M + + Args: + img (torch.Tensor[N,C,imgH,imgW]): Input image. + targets (List(torch.Tensor[N,K,H,W])): Multi-scale target heatmaps. + masks (List(torch.Tensor[N,H,W])): Masks of multi-scale target + heatmaps + joints (List(torch.Tensor[N,M,K,2])): Joints of multi-scale target + heatmaps for ae loss + img_metas (dict):Information about val&test + By default this includes: + - "image_file": image path + - "aug_data": input + - "test_scale_factor": test scale factor + - "base_size": base size of input + - "center": center of image + - "scale": scale of image + - "flip_index": flip index of keypoints + + Returns: + dict: The total loss for bottom-up + """ + + output = self.backbone(img) + + if self.with_keypoint: + output = self.keypoint_head(output) + + # if return loss + losses = dict() + if self.with_keypoint: + keypoint_losses = self.keypoint_head.get_loss( + output, targets, masks, joints) + losses.update(keypoint_losses) + + return losses + + def forward_dummy(self, img): + """Used for computing network FLOPs. + + See ``tools/get_flops.py``. + + Args: + img (torch.Tensor): Input image. + + Returns: + Tensor: Outputs. + """ + output = self.backbone(img) + if self.with_keypoint: + output = self.keypoint_head(output) + return output + + def forward_test(self, img, img_metas, return_heatmap=False, **kwargs): + """Inference the bottom-up model. 
+ + Note: + - Batchsize: N (currently support batchsize = 1) + - num_img_channel: C + - img_width: imgW + - img_height: imgH + + Args: + flip_index (List(int)): + aug_data (List(Tensor[NxCximgHximgW])): Multi-scale image + test_scale_factor (List(float)): Multi-scale factor + base_size (Tuple(int)): Base size of image when scale is 1 + center (np.ndarray): center of image + scale (np.ndarray): the scale of image + """ + assert img.size(0) == 1 + assert len(img_metas) == 1 + + img_metas = img_metas[0] + + aug_data = img_metas['aug_data'] + + test_scale_factor = img_metas['test_scale_factor'] + base_size = img_metas['base_size'] + center = img_metas['center'] + scale = img_metas['scale'] + + result = {} + + scale_heatmaps_list = [] + scale_tags_list = [] + + for idx, s in enumerate(sorted(test_scale_factor, reverse=True)): + image_resized = aug_data[idx].to(img.device) + + features = self.backbone(image_resized) + if self.with_keypoint: + outputs = self.keypoint_head(features) + + heatmaps, tags = split_ae_outputs( + outputs, self.test_cfg['num_joints'], + self.test_cfg['with_heatmaps'], self.test_cfg['with_ae'], + self.test_cfg.get('select_output_index', range(len(outputs)))) + + if self.test_cfg.get('flip_test', True): + # use flip test + features_flipped = self.backbone( + torch.flip(image_resized, [3])) + if self.with_keypoint: + outputs_flipped = self.keypoint_head(features_flipped) + + heatmaps_flipped, tags_flipped = split_ae_outputs( + outputs_flipped, self.test_cfg['num_joints'], + self.test_cfg['with_heatmaps'], self.test_cfg['with_ae'], + self.test_cfg.get('select_output_index', + range(len(outputs)))) + + heatmaps_flipped = flip_feature_maps( + heatmaps_flipped, flip_index=img_metas['flip_index']) + if self.test_cfg['tag_per_joint']: + tags_flipped = flip_feature_maps( + tags_flipped, flip_index=img_metas['flip_index']) + else: + tags_flipped = flip_feature_maps( + tags_flipped, flip_index=None, flip_output=True) + + else: + heatmaps_flipped = None + tags_flipped = None + + aggregated_heatmaps = aggregate_stage_flip( + heatmaps, + heatmaps_flipped, + index=-1, + project2image=self.test_cfg['project2image'], + size_projected=base_size, + align_corners=self.test_cfg.get('align_corners', True), + aggregate_stage='average', + aggregate_flip='average') + + aggregated_tags = aggregate_stage_flip( + tags, + tags_flipped, + index=-1, + project2image=self.test_cfg['project2image'], + size_projected=base_size, + align_corners=self.test_cfg.get('align_corners', True), + aggregate_stage='concat', + aggregate_flip='concat') + + if s == 1 or len(test_scale_factor) == 1: + if isinstance(aggregated_tags, list): + scale_tags_list.extend(aggregated_tags) + else: + scale_tags_list.append(aggregated_tags) + + if isinstance(aggregated_heatmaps, list): + scale_heatmaps_list.extend(aggregated_heatmaps) + else: + scale_heatmaps_list.append(aggregated_heatmaps) + + aggregated_heatmaps = aggregate_scale( + scale_heatmaps_list, + align_corners=self.test_cfg.get('align_corners', True), + aggregate_scale='average') + + aggregated_tags = aggregate_scale( + scale_tags_list, + align_corners=self.test_cfg.get('align_corners', True), + aggregate_scale='unsqueeze_concat') + + heatmap_size = aggregated_heatmaps.shape[2:4] + tag_size = aggregated_tags.shape[2:4] + if heatmap_size != tag_size: + tmp = [] + for idx in range(aggregated_tags.shape[-1]): + tmp.append( + torch.nn.functional.interpolate( + aggregated_tags[..., idx], + size=heatmap_size, + mode='bilinear', + align_corners=self.test_cfg.get('align_corners', 
+ True)).unsqueeze(-1)) + aggregated_tags = torch.cat(tmp, dim=-1) + + # perform grouping + grouped, scores = self.parser.parse(aggregated_heatmaps, + aggregated_tags, + self.test_cfg['adjust'], + self.test_cfg['refine']) + + preds = get_group_preds( + grouped, + center, + scale, [aggregated_heatmaps.size(3), + aggregated_heatmaps.size(2)], + use_udp=self.use_udp) + + image_paths = [] + image_paths.append(img_metas['image_file']) + + if return_heatmap: + output_heatmap = aggregated_heatmaps.detach().cpu().numpy() + else: + output_heatmap = None + + result['preds'] = preds + result['scores'] = scores + result['image_paths'] = image_paths + result['output_heatmap'] = output_heatmap + + return result + + @deprecated_api_warning({'pose_limb_color': 'pose_link_color'}, + cls_name='AssociativeEmbedding') + def show_result(self, + img, + result, + skeleton=None, + kpt_score_thr=0.3, + bbox_color=None, + pose_kpt_color=None, + pose_link_color=None, + radius=4, + thickness=1, + font_scale=0.5, + win_name='', + show=False, + show_keypoint_weight=False, + wait_time=0, + out_file=None): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (list[dict]): The results to draw over `img` + (bbox_result, pose_result). + skeleton (list[list]): The connection of keypoints. + skeleton is 0-based indexing. + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. + If None, do not draw keypoints. + pose_link_color (np.array[Mx3]): Color of M links. + If None, do not draw links. + radius (int): Radius of circles. + thickness (int): Thickness of lines. + font_scale (float): Font scales of texts. + win_name (str): The window name. + show (bool): Whether to show the image. Default: False. + show_keypoint_weight (bool): Whether to change the transparency + using the predicted confidence scores of keypoints. + wait_time (int): Value of waitKey param. + Default: 0. + out_file (str or None): The filename to write the image. + Default: None. + + Returns: + Tensor: Visualized image only if not `show` or `out_file` + """ + img = mmcv.imread(img) + img = img.copy() + img_h, img_w, _ = img.shape + + pose_result = [] + for res in result: + pose_result.append(res['keypoints']) + + imshow_keypoints(img, pose_result, skeleton, kpt_score_thr, + pose_kpt_color, pose_link_color, radius, thickness) + + if show: + imshow(img, win_name, wait_time) + + if out_file is not None: + imwrite(img, out_file) + + return img diff --git a/mmpose/models/detectors/base.py b/mmpose/models/detectors/base.py new file mode 100644 index 0000000..5d459b4 --- /dev/null +++ b/mmpose/models/detectors/base.py @@ -0,0 +1,131 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from collections import OrderedDict + +import torch +import torch.distributed as dist +import torch.nn as nn + + +class BasePose(nn.Module, metaclass=ABCMeta): + """Base class for pose detectors. + + All recognizers should subclass it. + All subclass should overwrite: + Methods:`forward_train`, supporting to forward when training. + Methods:`forward_test`, supporting to forward when testing. + + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Head modules to give output. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. 
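+
+    Example (a minimal, hedged sketch of a subclass; ``DummyPose`` is hypothetical and only
+    illustrates the methods a detector must implement and how ``train_step`` consumes them)::
+
+        >>> import torch
+        >>> class DummyPose(BasePose):
+        ...     def forward_train(self, img, img_metas, **kwargs):
+        ...         return dict(loss_dummy=img.float().mean() * 0)
+        ...     def forward_test(self, img, img_metas, **kwargs):
+        ...         return dict(preds=[])
+        ...     def forward(self, img, img_metas=None, return_loss=True, **kwargs):
+        ...         if return_loss:
+        ...             return self.forward_train(img, img_metas, **kwargs)
+        ...         return self.forward_test(img, img_metas, **kwargs)
+        ...     def show_result(self, **kwargs):
+        ...         pass
+        >>> out = DummyPose().train_step(
+        ...     dict(img=torch.zeros(2, 3, 8, 8), img_metas=[{}, {}]), optimizer=None)
+        >>> out['num_samples']
+        2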
+ """ + + @abstractmethod + def forward_train(self, img, img_metas, **kwargs): + """Defines the computation performed at training.""" + + @abstractmethod + def forward_test(self, img, img_metas, **kwargs): + """Defines the computation performed at testing.""" + + @abstractmethod + def forward(self, img, img_metas, return_loss=True, **kwargs): + """Forward function.""" + + @staticmethod + def _parse_losses(losses): + """Parse the raw outputs (losses) of the network. + + Args: + losses (dict): Raw output of the network, which usually contain + losses and other necessary information. + + Returns: + tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor \ + which may be a weighted sum of all losses, log_vars \ + contains all the variables to be sent to the logger. + """ + log_vars = OrderedDict() + for loss_name, loss_value in losses.items(): + if isinstance(loss_value, torch.Tensor): + log_vars[loss_name] = loss_value.mean() + elif isinstance(loss_value, float): + log_vars[loss_name] = loss_value + elif isinstance(loss_value, list): + log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) + else: + raise TypeError( + f'{loss_name} is not a tensor or list of tensors or float') + + loss = sum(_value for _key, _value in log_vars.items() + if 'loss' in _key) + + log_vars['loss'] = loss + for loss_name, loss_value in log_vars.items(): + # reduce loss when distributed training + if not isinstance(loss_value, float): + if dist.is_available() and dist.is_initialized(): + loss_value = loss_value.data.clone() + dist.all_reduce(loss_value.div_(dist.get_world_size())) + log_vars[loss_name] = loss_value.item() + else: + log_vars[loss_name] = loss_value + + return loss, log_vars + + def train_step(self, data_batch, optimizer, **kwargs): + """The iteration step during training. + + This method defines an iteration step during training, except for the + back propagation and optimizer updating, which are done in an optimizer + hook. Note that in some complicated cases or models, the whole process + including back propagation and optimizer updating is also defined in + this method, such as GAN. + + Args: + data_batch (dict): The output of dataloader. + optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of + runner is passed to ``train_step()``. This argument is unused + and reserved. + + Returns: + dict: It should contain at least 3 keys: ``loss``, ``log_vars``, + ``num_samples``. + ``loss`` is a tensor for back propagation, which can be a + weighted sum of multiple losses. + ``log_vars`` contains all the variables to be sent to the + logger. + ``num_samples`` indicates the batch size (when the model is + DDP, it means the batch size on each GPU), which is used for + averaging the logs. + """ + losses = self.forward(**data_batch) + + loss, log_vars = self._parse_losses(losses) + + outputs = dict( + loss=loss, + log_vars=log_vars, + num_samples=len(next(iter(data_batch.values())))) + + return outputs + + def val_step(self, data_batch, optimizer, **kwargs): + """The iteration step during validation. + + This method shares the same signature as :func:`train_step`, but used + during val epochs. Note that the evaluation after training epochs is + not implemented with this method, but an evaluation hook. 
+ """ + results = self.forward(return_loss=False, **data_batch) + + outputs = dict(results=results) + + return outputs + + @abstractmethod + def show_result(self, **kwargs): + """Visualize the results.""" + raise NotImplementedError diff --git a/mmpose/models/detectors/interhand_3d.py b/mmpose/models/detectors/interhand_3d.py new file mode 100644 index 0000000..5a4d6bd --- /dev/null +++ b/mmpose/models/detectors/interhand_3d.py @@ -0,0 +1,227 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +from mmcv.utils.misc import deprecated_api_warning + +from mmpose.core import imshow_keypoints, imshow_keypoints_3d +from ..builder import POSENETS +from .top_down import TopDown + + +@POSENETS.register_module() +class Interhand3D(TopDown): + """Top-down interhand 3D pose detector of paper ref: Gyeongsik Moon. + + "InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose + Estimation from a Single RGB Image". A child class of TopDown detector. + """ + + def forward(self, + img, + target=None, + target_weight=None, + img_metas=None, + return_loss=True, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. Note this setting will change the expected inputs. + When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. list[Tensor], list[list[dict]]), with + the outer list indicating test time augmentations. + + Note: + - batch_size: N + - num_keypoints: K + - num_img_channel: C (Default: 3) + - img height: imgH + - img width: imgW + - heatmaps height: H + - heatmaps weight: W + + Args: + img (torch.Tensor[NxCximgHximgW]): Input images. + target (list[torch.Tensor]): Target heatmaps, relative hand + root depth and hand type. + target_weight (list[torch.Tensor]): Weights for target + heatmaps, relative hand root depth and hand type. + img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + - "heatmap3d_depth_bound": depth bound of hand keypoint 3D + heatmap + - "root_depth_bound": depth bound of relative root depth 1D + heatmap + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + + Returns: + dict|tuple: if `return loss` is true, then return losses. \ + Otherwise, return predicted poses, boxes, image paths, \ + heatmaps, relative hand root depth and hand type. 
+ """ + if return_loss: + return self.forward_train(img, target, target_weight, img_metas, + **kwargs) + return self.forward_test(img, img_metas, **kwargs) + + def forward_test(self, img, img_metas, **kwargs): + """Defines the computation performed at every call when testing.""" + assert img.size(0) == len(img_metas) + batch_size, _, img_height, img_width = img.shape + if batch_size > 1: + assert 'bbox_id' in img_metas[0] + + features = self.backbone(img) + if self.with_neck: + features = self.neck(features) + if self.with_keypoint: + output = self.keypoint_head.inference_model( + features, flip_pairs=None) + + if self.test_cfg.get('flip_test', True): + img_flipped = img.flip(3) + features_flipped = self.backbone(img_flipped) + if self.with_neck: + features_flipped = self.neck(features_flipped) + if self.with_keypoint: + output_flipped = self.keypoint_head.inference_model( + features_flipped, img_metas[0]['flip_pairs']) + output = [(out + out_flipped) * 0.5 + for out, out_flipped in zip(output, output_flipped)] + + if self.with_keypoint: + result = self.keypoint_head.decode( + img_metas, output, img_size=[img_width, img_height]) + else: + result = {} + return result + + @deprecated_api_warning({'pose_limb_color': 'pose_link_color'}, + cls_name='Interhand3D') + def show_result(self, + result, + img=None, + skeleton=None, + kpt_score_thr=0.3, + radius=8, + bbox_color='green', + thickness=2, + pose_kpt_color=None, + pose_link_color=None, + vis_height=400, + num_instances=-1, + win_name='', + show=False, + wait_time=0, + out_file=None): + """Visualize 3D pose estimation results. + + Args: + result (list[dict]): The pose estimation results containing: + + - "keypoints_3d" ([K,4]): 3D keypoints + - "keypoints" ([K,3] or [T,K,3]): Optional for visualizing + 2D inputs. If a sequence is given, only the last frame + will be used for visualization + - "bbox" ([4,] or [T,4]): Optional for visualizing 2D inputs + - "title" (str): title for the subplot + img (str or Tensor): Optional. The image to visualize 2D inputs on. + skeleton (list of [idx_i,idx_j]): Skeleton described by a list of + links, each is a pair of joint indices. + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + radius (int): Radius of circles. + bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. + thickness (int): Thickness of lines. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. + If None, do not draw keypoints. + pose_link_color (np.array[Mx3]): Color of M limbs. + If None, do not draw limbs. + vis_height (int): The image height of the visualization. The width + will be N*vis_height depending on the number of visualized + items. + num_instances (int): Number of instances to be shown in 3D. If + smaller than 0, all the instances in the pose_result will be + shown. Otherwise, pad or truncate the pose_result to a length + of num_instances. + win_name (str): The window name. + show (bool): Whether to show the image. Default: False. + wait_time (int): Value of waitKey param. + Default: 0. + out_file (str or None): The filename to write the image. + Default: None. + + Returns: + Tensor: Visualized img, only if not `show` or `out_file`. 
+ """ + if num_instances < 0: + assert len(result) > 0 + result = sorted(result, key=lambda x: x.get('track_id', 0)) + + # draw image and 2d poses + if img is not None: + img = mmcv.imread(img) + + bbox_result = [] + pose_2d = [] + for res in result: + if 'bbox' in res: + bbox = np.array(res['bbox']) + if bbox.ndim != 1: + assert bbox.ndim == 2 + bbox = bbox[-1] # Get bbox from the last frame + bbox_result.append(bbox) + if 'keypoints' in res: + kpts = np.array(res['keypoints']) + if kpts.ndim != 2: + assert kpts.ndim == 3 + kpts = kpts[-1] # Get 2D keypoints from the last frame + pose_2d.append(kpts) + + if len(bbox_result) > 0: + bboxes = np.vstack(bbox_result) + mmcv.imshow_bboxes( + img, + bboxes, + colors=bbox_color, + top_k=-1, + thickness=2, + show=False) + if len(pose_2d) > 0: + imshow_keypoints( + img, + pose_2d, + skeleton, + kpt_score_thr=kpt_score_thr, + pose_kpt_color=pose_kpt_color, + pose_link_color=pose_link_color, + radius=radius, + thickness=thickness) + img = mmcv.imrescale(img, scale=vis_height / img.shape[0]) + + img_vis = imshow_keypoints_3d( + result, + img, + skeleton, + pose_kpt_color, + pose_link_color, + vis_height, + axis_limit=300, + axis_azimuth=-115, + axis_elev=15, + kpt_score_thr=kpt_score_thr, + num_instances=num_instances) + + if show: + mmcv.visualization.imshow(img_vis, win_name, wait_time) + + if out_file is not None: + mmcv.imwrite(img_vis, out_file) + + return img_vis diff --git a/mmpose/models/detectors/mesh.py b/mmpose/models/detectors/mesh.py new file mode 100644 index 0000000..0af18e3 --- /dev/null +++ b/mmpose/models/detectors/mesh.py @@ -0,0 +1,438 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import cv2 +import mmcv +import numpy as np +import torch + +from mmpose.core.visualization.image import imshow_mesh_3d +from mmpose.models.misc.discriminator import SMPLDiscriminator +from .. import builder +from ..builder import POSENETS +from .base import BasePose + + +def set_requires_grad(nets, requires_grad=False): + """Set requies_grad for all the networks. + + Args: + nets (nn.Module | list[nn.Module]): A list of networks or a single + network. + requires_grad (bool): Whether the networks require gradients or not + """ + if not isinstance(nets, list): + nets = [nets] + for net in nets: + if net is not None: + for param in net.parameters(): + param.requires_grad = requires_grad + + +@POSENETS.register_module() +class ParametricMesh(BasePose): + """Model-based 3D human mesh detector. Take a single color image as input + and output 3D joints, SMPL parameters and camera parameters. + + Args: + backbone (dict): Backbone modules to extract feature. + mesh_head (dict): Mesh head to process feature. + smpl (dict): Config for SMPL model. + disc (dict): Discriminator for SMPL parameters. Default: None. + loss_gan (dict): Config for adversarial loss. Default: None. + loss_mesh (dict): Config for mesh loss. Default: None. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + pretrained (str): Path to the pretrained models. 
+ """ + + def __init__(self, + backbone, + mesh_head, + smpl, + disc=None, + loss_gan=None, + loss_mesh=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super().__init__() + + self.backbone = builder.build_backbone(backbone) + self.mesh_head = builder.build_head(mesh_head) + self.generator = torch.nn.Sequential(self.backbone, self.mesh_head) + + self.smpl = builder.build_mesh_model(smpl) + + self.with_gan = disc is not None and loss_gan is not None + if self.with_gan: + self.discriminator = SMPLDiscriminator(**disc) + self.loss_gan = builder.build_loss(loss_gan) + self.disc_step_count = 0 + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + self.loss_mesh = builder.build_loss(loss_mesh) + self.init_weights(pretrained=pretrained) + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + self.backbone.init_weights(pretrained) + self.mesh_head.init_weights() + if self.with_gan: + self.discriminator.init_weights() + + def train_step(self, data_batch, optimizer, **kwargs): + """Train step function. + + In this function, the detector will finish the train step following + the pipeline: + + 1. get fake and real SMPL parameters + 2. optimize discriminator (if have) + 3. optimize generator + + If `self.train_cfg.disc_step > 1`, the train step will contain multiple + iterations for optimizing discriminator with different input data and + only one iteration for optimizing generator after `disc_step` + iterations for discriminator. + + Args: + data_batch (torch.Tensor): Batch of data as input. + optimizer (dict[torch.optim.Optimizer]): Dict with optimizers for + generator and discriminator (if have). + + Returns: + outputs (dict): Dict with loss, information for logger, + the number of samples. + """ + + img = data_batch['img'] + pred_smpl = self.generator(img) + pred_pose, pred_beta, pred_camera = pred_smpl + + # optimize discriminator (if have) + if self.train_cfg['disc_step'] > 0 and self.with_gan: + set_requires_grad(self.discriminator, True) + fake_data = (pred_camera.detach(), pred_pose.detach(), + pred_beta.detach()) + mosh_theta = data_batch['mosh_theta'] + real_data = (mosh_theta[:, :3], mosh_theta[:, + 3:75], mosh_theta[:, + 75:]) + fake_score = self.discriminator(fake_data) + real_score = self.discriminator(real_data) + + disc_losses = {} + disc_losses['real_loss'] = self.loss_gan( + real_score, target_is_real=True, is_disc=True) + disc_losses['fake_loss'] = self.loss_gan( + fake_score, target_is_real=False, is_disc=True) + loss_disc, log_vars_d = self._parse_losses(disc_losses) + + optimizer['discriminator'].zero_grad() + loss_disc.backward() + optimizer['discriminator'].step() + self.disc_step_count = \ + (self.disc_step_count + 1) % self.train_cfg['disc_step'] + + if self.disc_step_count != 0: + outputs = dict( + loss=loss_disc, + log_vars=log_vars_d, + num_samples=len(next(iter(data_batch.values())))) + return outputs + + # optimize generator + pred_out = self.smpl( + betas=pred_beta, + body_pose=pred_pose[:, 1:], + global_orient=pred_pose[:, :1]) + pred_vertices, pred_joints_3d = pred_out['vertices'], pred_out[ + 'joints'] + + gt_beta = data_batch['beta'] + gt_pose = data_batch['pose'] + gt_vertices = self.smpl( + betas=gt_beta, + body_pose=gt_pose[:, 3:], + global_orient=gt_pose[:, :3])['vertices'] + + pred = dict( + pose=pred_pose, + beta=pred_beta, + camera=pred_camera, + vertices=pred_vertices, + joints_3d=pred_joints_3d) + + target = { + key: data_batch[key] + for key in [ + 'pose', 'beta', 'has_smpl', 'joints_3d', 'joints_2d', 
+ 'joints_3d_visible', 'joints_2d_visible' + ] + } + target['vertices'] = gt_vertices + + losses = self.loss_mesh(pred, target) + + if self.with_gan: + set_requires_grad(self.discriminator, False) + pred_theta = (pred_camera, pred_pose, pred_beta) + pred_score = self.discriminator(pred_theta) + loss_adv = self.loss_gan( + pred_score, target_is_real=True, is_disc=False) + losses['adv_loss'] = loss_adv + + loss, log_vars = self._parse_losses(losses) + optimizer['generator'].zero_grad() + loss.backward() + optimizer['generator'].step() + + outputs = dict( + loss=loss, + log_vars=log_vars, + num_samples=len(next(iter(data_batch.values())))) + + return outputs + + def forward_train(self, *args, **kwargs): + """Forward function for training. + + For ParametricMesh, we do not use this interface. + """ + raise NotImplementedError('This interface should not be used in ' + 'current training schedule. Please use ' + '`train_step` for training.') + + def val_step(self, data_batch, **kwargs): + """Forward function for evaluation. + + Args: + data_batch (dict): Contain data for forward. + + Returns: + dict: Contain the results from model. + """ + output = self.forward_test(**data_batch, **kwargs) + return output + + def forward_dummy(self, img): + """Used for computing network FLOPs. + + See ``tools/get_flops.py``. + + Args: + img (torch.Tensor): Input image. + + Returns: + Tensor: Outputs. + """ + output = self.generator(img) + return output + + def forward_test(self, + img, + img_metas, + return_vertices=False, + return_faces=False, + **kwargs): + """Defines the computation performed at every call when testing.""" + + pred_smpl = self.generator(img) + pred_pose, pred_beta, pred_camera = pred_smpl + pred_out = self.smpl( + betas=pred_beta, + body_pose=pred_pose[:, 1:], + global_orient=pred_pose[:, :1]) + pred_vertices, pred_joints_3d = pred_out['vertices'], pred_out[ + 'joints'] + + all_preds = {} + all_preds['keypoints_3d'] = pred_joints_3d.detach().cpu().numpy() + all_preds['smpl_pose'] = pred_pose.detach().cpu().numpy() + all_preds['smpl_beta'] = pred_beta.detach().cpu().numpy() + all_preds['camera'] = pred_camera.detach().cpu().numpy() + + if return_vertices: + all_preds['vertices'] = pred_vertices.detach().cpu().numpy() + if return_faces: + all_preds['faces'] = self.smpl.get_faces() + + all_boxes = [] + image_path = [] + for img_meta in img_metas: + box = np.zeros(6, dtype=np.float32) + c = img_meta['center'] + s = img_meta['scale'] + if 'bbox_score' in img_metas: + score = np.array(img_metas['bbox_score']).reshape(-1) + else: + score = 1.0 + box[0:2] = c + box[2:4] = s + box[4] = np.prod(s * 200.0, axis=0) + box[5] = score + all_boxes.append(box) + image_path.append(img_meta['image_file']) + + all_preds['bboxes'] = np.stack(all_boxes, axis=0) + all_preds['image_path'] = image_path + return all_preds + + def get_3d_joints_from_mesh(self, vertices): + """Get 3D joints from 3D mesh using predefined joints regressor.""" + return torch.matmul( + self.joints_regressor.to(vertices.device), vertices) + + def forward(self, img, img_metas=None, return_loss=False, **kwargs): + """Forward function. + + Calls either forward_train or forward_test depending on whether + return_loss=True. + + Note: + - batch_size: N + - num_img_channel: C (Default: 3) + - img height: imgH + - img width: imgW + + Args: + img (torch.Tensor[N x C x imgH x imgW]): Input images. 
+ img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + + Returns: + Return predicted 3D joints, SMPL parameters, boxes and image paths. + """ + + if return_loss: + return self.forward_train(img, img_metas, **kwargs) + return self.forward_test(img, img_metas, **kwargs) + + def show_result(self, + result, + img, + show=False, + out_file=None, + win_name='', + wait_time=0, + bbox_color='green', + mesh_color=(76, 76, 204), + **kwargs): + """Visualize 3D mesh estimation results. + + Args: + result (list[dict]): The mesh estimation results containing: + + - "bbox" (ndarray[4]): instance bounding bbox + - "center" (ndarray[2]): bbox center + - "scale" (ndarray[2]): bbox scale + - "keypoints_3d" (ndarray[K,3]): predicted 3D keypoints + - "camera" (ndarray[3]): camera parameters + - "vertices" (ndarray[V, 3]): predicted 3D vertices + - "faces" (ndarray[F, 3]): mesh faces + img (str or Tensor): Optional. The image to visualize 2D inputs on. + win_name (str): The window name. + show (bool): Whether to show the image. Default: False. + wait_time (int): Value of waitKey param. Default: 0. + out_file (str or None): The filename to write the image. + Default: None. + bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. + mesh_color (str or tuple or :obj:`Color`): Color of mesh surface. + + Returns: + ndarray: Visualized img, only if not `show` or `out_file`. + """ + + if img is not None: + img = mmcv.imread(img) + + focal_length = self.loss_mesh.focal_length + H, W, C = img.shape + img_center = np.array([[0.5 * W], [0.5 * H]]) + + # show bounding boxes + bboxes = [res['bbox'] for res in result] + bboxes = np.vstack(bboxes) + mmcv.imshow_bboxes( + img, bboxes, colors=bbox_color, top_k=-1, thickness=2, show=False) + + vertex_list = [] + face_list = [] + for res in result: + vertices = res['vertices'] + faces = res['faces'] + camera = res['camera'] + camera_center = res['center'] + scale = res['scale'] + + # predicted vertices are in root-relative space, + # we need to translate them to camera space. 
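+            # the weak-perspective camera is [s, tx, ty]: the depth is
+            # recovered as t_z = 2 * f / (bbox_size_px * s), with
+            # bbox_size_px = scale * 200, and the x/y translation is then
+            # corrected for the offset between bbox centre and image centre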
+ translation = np.array([ + camera[1], camera[2], + 2 * focal_length / (scale[0] * 200.0 * camera[0] + 1e-9) + ]) + mean_depth = vertices[:, -1].mean() + translation[-1] + translation[:2] += (camera_center - + img_center[:, 0]) / focal_length * mean_depth + vertices += translation[None, :] + + vertex_list.append(vertices) + face_list.append(faces) + + # render from front view + img_vis = imshow_mesh_3d( + img, + vertex_list, + face_list, + img_center, [focal_length, focal_length], + colors=mesh_color) + + # render from side view + # rotate mesh vertices + R = cv2.Rodrigues(np.array([0, np.radians(90.), 0]))[0] + rot_vertex_list = [np.dot(vert, R) for vert in vertex_list] + + # get the 3D bbox containing all meshes + rot_vertices = np.concatenate(rot_vertex_list, axis=0) + min_corner = rot_vertices.min(0) + max_corner = rot_vertices.max(0) + + center_3d = 0.5 * (min_corner + max_corner) + ratio = 0.8 + bbox3d_size = max_corner - min_corner + + # set appropriate translation to make all meshes appear in the image + z_x = bbox3d_size[0] * focal_length / (ratio * W) - min_corner[2] + z_y = bbox3d_size[1] * focal_length / (ratio * H) - min_corner[2] + z = max(z_x, z_y) + translation = -center_3d + translation[2] = z + translation = translation[None, :] + rot_vertex_list = [ + rot_vert + translation for rot_vert in rot_vertex_list + ] + + # render from side view + img_side = imshow_mesh_3d( + np.ones_like(img) * 255, rot_vertex_list, face_list, img_center, + [focal_length, focal_length]) + + # merger images from front view and side view + img_vis = np.concatenate([img_vis, img_side], axis=1) + + if show: + mmcv.visualization.imshow(img_vis, win_name, wait_time) + + if out_file is not None: + mmcv.imwrite(img_vis, out_file) + + return img_vis diff --git a/mmpose/models/detectors/multi_task.py b/mmpose/models/detectors/multi_task.py new file mode 100644 index 0000000..1b6f317 --- /dev/null +++ b/mmpose/models/detectors/multi_task.py @@ -0,0 +1,187 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn + +from .. import builder +from ..builder import POSENETS + + +@POSENETS.register_module() +class MultiTask(nn.Module): + """Multi-task detectors. + + Args: + backbone (dict): Backbone modules to extract feature. + heads (list[dict]): heads to output predictions. + necks (list[dict] | None): necks to process feature. + head2neck (dict{int:int}): head index to neck index. + pretrained (str): Path to the pretrained models. 
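+
+    Example:
+        Head-to-neck wiring sketch (the ``type`` names below are
+        placeholders, not real registry entries)::
+
+            model = MultiTask(
+                backbone=dict(type='SomeBackbone'),
+                heads=[dict(type='HeadA'), dict(type='HeadB')],
+                necks=[dict(type='NeckA')],
+                # head 0 -> necks[0]; head 1 is unmapped, so it falls back
+                # to the trailing nn.Identity() (index -1)
+                head2neck={0: 0})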
+ """ + + def __init__(self, + backbone, + heads, + necks=None, + head2neck=None, + pretrained=None): + super().__init__() + + self.backbone = builder.build_backbone(backbone) + + if head2neck is None: + assert necks is None + head2neck = {} + + self.head2neck = {} + for i in range(len(heads)): + self.head2neck[i] = head2neck[i] if i in head2neck else -1 + + self.necks = nn.ModuleList([]) + if necks is not None: + for neck in necks: + self.necks.append(builder.build_neck(neck)) + self.necks.append(nn.Identity()) + + self.heads = nn.ModuleList([]) + assert heads is not None + for head in heads: + assert head is not None + self.heads.append(builder.build_head(head)) + + self.init_weights(pretrained=pretrained) + + @property + def with_necks(self): + """Check if has keypoint_head.""" + return hasattr(self, 'necks') + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + self.backbone.init_weights(pretrained) + if self.with_necks: + for neck in self.necks: + if hasattr(neck, 'init_weights'): + neck.init_weights() + + for head in self.heads: + if hasattr(head, 'init_weights'): + head.init_weights() + + def forward(self, + img, + target=None, + target_weight=None, + img_metas=None, + return_loss=True, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. Note this setting will change the expected inputs. + When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + + Note: + - batch_size: N + - num_keypoints: K + - num_img_channel: C (Default: 3) + - img height: imgH + - img weight: imgW + - heatmaps height: H + - heatmaps weight: W + + Args: + img (torch.Tensor[N,C,imgH,imgW]): Input images. + target (list[torch.Tensor]): Targets. + target_weight (List[torch.Tensor]): Weights. + img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + + Returns: + dict|tuple: if `return loss` is true, then return losses. \ + Otherwise, return predicted poses, boxes, image paths \ + and heatmaps. 
+ """ + if return_loss: + return self.forward_train(img, target, target_weight, img_metas, + **kwargs) + return self.forward_test(img, img_metas, **kwargs) + + def forward_train(self, img, target, target_weight, img_metas, **kwargs): + """Defines the computation performed at every call when training.""" + features = self.backbone(img) + outputs = [] + + for head_id, head in enumerate(self.heads): + neck_id = self.head2neck[head_id] + outputs.append(head(self.necks[neck_id](features))) + + # if return loss + losses = dict() + + for head, output, gt, gt_weight in zip(self.heads, outputs, target, + target_weight): + loss = head.get_loss(output, gt, gt_weight) + assert len(set(losses.keys()).intersection(set(loss.keys()))) == 0 + losses.update(loss) + + if hasattr(head, 'get_accuracy'): + acc = head.get_accuracy(output, gt, gt_weight) + assert len(set(losses.keys()).intersection(set( + acc.keys()))) == 0 + losses.update(acc) + + return losses + + def forward_test(self, img, img_metas, **kwargs): + """Defines the computation performed at every call when testing.""" + assert img.size(0) == len(img_metas) + batch_size, _, img_height, img_width = img.shape + if batch_size > 1: + assert 'bbox_id' in img_metas[0] + + results = {} + + features = self.backbone(img) + outputs = [] + + for head_id, head in enumerate(self.heads): + neck_id = self.head2neck[head_id] + if hasattr(head, 'inference_model'): + head_output = head.inference_model( + self.necks[neck_id](features), flip_pairs=None) + else: + head_output = head( + self.necks[neck_id](features)).detach().cpu().numpy() + outputs.append(head_output) + + for head, output in zip(self.heads, outputs): + result = head.decode( + img_metas, output, img_size=[img_width, img_height]) + results.update(result) + return results + + def forward_dummy(self, img): + """Used for computing network FLOPs. + + See ``tools/get_flops.py``. + + Args: + img (torch.Tensor): Input image. + + Returns: + list[Tensor]: Outputs. + """ + features = self.backbone(img) + outputs = [] + for head_id, head in enumerate(self.heads): + neck_id = self.head2neck[head_id] + outputs.append(head(self.necks[neck_id](features))) + return outputs diff --git a/mmpose/models/detectors/multiview_pose.py b/mmpose/models/detectors/multiview_pose.py new file mode 100644 index 0000000..c3d2221 --- /dev/null +++ b/mmpose/models/detectors/multiview_pose.py @@ -0,0 +1,889 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.runner import load_checkpoint + +from mmpose.core.camera import SimpleCameraTorch +from mmpose.core.post_processing.post_transforms import ( + affine_transform_torch, get_affine_transform) +from .. import builder +from ..builder import POSENETS +from .base import BasePose + + +class ProjectLayer(nn.Module): + + def __init__(self, image_size, heatmap_size): + """Project layer to get voxel feature. Adapted from + https://github.com/microsoft/voxelpose- + pytorch/blob/main/lib/models/project_layer.py. 
+ + Args: + image_size (int or list): input size of the 2D model + heatmap_size (int or list): output size of the 2D model + """ + super(ProjectLayer, self).__init__() + self.image_size = image_size + self.heatmap_size = heatmap_size + if isinstance(self.image_size, int): + self.image_size = [self.image_size, self.image_size] + if isinstance(self.heatmap_size, int): + self.heatmap_size = [self.heatmap_size, self.heatmap_size] + + def compute_grid(self, box_size, box_center, num_bins, device=None): + if isinstance(box_size, int) or isinstance(box_size, float): + box_size = [box_size, box_size, box_size] + if isinstance(num_bins, int): + num_bins = [num_bins, num_bins, num_bins] + + grid_1D_x = torch.linspace( + -box_size[0] / 2, box_size[0] / 2, num_bins[0], device=device) + grid_1D_y = torch.linspace( + -box_size[1] / 2, box_size[1] / 2, num_bins[1], device=device) + grid_1D_z = torch.linspace( + -box_size[2] / 2, box_size[2] / 2, num_bins[2], device=device) + grid_x, grid_y, grid_z = torch.meshgrid( + grid_1D_x + box_center[0], + grid_1D_y + box_center[1], + grid_1D_z + box_center[2], + ) + grid_x = grid_x.contiguous().view(-1, 1) + grid_y = grid_y.contiguous().view(-1, 1) + grid_z = grid_z.contiguous().view(-1, 1) + grid = torch.cat([grid_x, grid_y, grid_z], dim=1) + + return grid + + def get_voxel(self, feature_maps, meta, grid_size, grid_center, cube_size): + device = feature_maps[0].device + batch_size = feature_maps[0].shape[0] + num_channels = feature_maps[0].shape[1] + num_bins = cube_size[0] * cube_size[1] * cube_size[2] + n = len(feature_maps) + cubes = torch.zeros( + batch_size, num_channels, 1, num_bins, n, device=device) + w, h = self.heatmap_size + grids = torch.zeros(batch_size, num_bins, 3, device=device) + bounding = torch.zeros(batch_size, 1, 1, num_bins, n, device=device) + for i in range(batch_size): + if len(grid_center[0]) == 3 or grid_center[i][3] >= 0: + if len(grid_center) == 1: + grid = self.compute_grid( + grid_size, grid_center[0], cube_size, device=device) + else: + grid = self.compute_grid( + grid_size, grid_center[i], cube_size, device=device) + grids[i:i + 1] = grid + for c in range(n): + center = meta[i]['center'][c] + scale = meta[i]['scale'][c] + + width, height = center * 2 + trans = torch.as_tensor( + get_affine_transform(center, scale / 200.0, 0, + self.image_size), + dtype=torch.float, + device=device) + + cam_param = meta[i]['camera'][c].copy() + + single_view_camera = SimpleCameraTorch( + param=cam_param, device=device) + xy = single_view_camera.world_to_pixel(grid) + + bounding[i, 0, 0, :, c] = (xy[:, 0] >= 0) & ( + xy[:, 1] >= 0) & (xy[:, 0] < width) & ( + xy[:, 1] < height) + xy = torch.clamp(xy, -1.0, max(width, height)) + xy = affine_transform_torch(xy, trans) + xy = xy * torch.tensor( + [w, h], dtype=torch.float, + device=device) / torch.tensor( + self.image_size, dtype=torch.float, device=device) + sample_grid = xy / torch.tensor([w - 1, h - 1], + dtype=torch.float, + device=device) * 2.0 - 1.0 + sample_grid = torch.clamp( + sample_grid.view(1, 1, num_bins, 2), -1.1, 1.1) + + cubes[i:i + 1, :, :, :, c] += F.grid_sample( + feature_maps[c][i:i + 1, :, :, :], + sample_grid, + align_corners=True) + + cubes = torch.sum( + torch.mul(cubes, bounding), dim=-1) / ( + torch.sum(bounding, dim=-1) + 1e-6) + cubes[cubes != cubes] = 0.0 + cubes = cubes.clamp(0.0, 1.0) + + cubes = cubes.view(batch_size, num_channels, cube_size[0], + cube_size[1], cube_size[2]) + return cubes, grids + + def forward(self, feature_maps, meta, grid_size, grid_center, 
cube_size): + cubes, grids = self.get_voxel(feature_maps, meta, grid_size, + grid_center, cube_size) + return cubes, grids + + +@POSENETS.register_module() +class DetectAndRegress(BasePose): + """DetectAndRegress approach for multiview human pose detection. + + Args: + backbone (ConfigDict): Dictionary to construct the 2D pose detector + human_detector (ConfigDict): dictionary to construct human detector + pose_regressor (ConfigDict): dictionary to construct pose regressor + train_cfg (ConfigDict): Config for training. Default: None. + test_cfg (ConfigDict): Config for testing. Default: None. + pretrained (str): Path to the pretrained 2D model. Default: None. + freeze_2d (bool): Whether to freeze the 2D model in training. + Default: True. + """ + + def __init__(self, + backbone, + human_detector, + pose_regressor, + train_cfg=None, + test_cfg=None, + pretrained=None, + freeze_2d=True): + super(DetectAndRegress, self).__init__() + if backbone is not None: + self.backbone = builder.build_posenet(backbone) + if self.training and pretrained is not None: + load_checkpoint(self.backbone, pretrained) + else: + self.backbone = None + + self.freeze_2d = freeze_2d + self.human_detector = builder.MODELS.build(human_detector) + self.pose_regressor = builder.MODELS.build(pose_regressor) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + @staticmethod + def _freeze(model): + """Freeze parameters.""" + model.eval() + for param in model.parameters(): + param.requires_grad = False + + def train(self, mode=True): + """Sets the module in training mode. + Args: + mode (bool): whether to set training mode (``True``) + or evaluation mode (``False``). Default: ``True``. + + Returns: + Module: self + """ + super().train(mode) + if mode and self.freeze_2d and self.backbone is not None: + self._freeze(self.backbone) + + return self + + def forward(self, + img=None, + img_metas=None, + return_loss=True, + targets=None, + masks=None, + targets_3d=None, + input_heatmaps=None, + **kwargs): + """ + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + feature_maps width: W + feature_maps height: H + volume_length: cubeL + volume_width: cubeW + volume_height: cubeH + + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. + img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + return_loss: Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + targets (list(torch.Tensor[NxKxHxW])): + Multi-camera target feature_maps of the 2D model. + masks (list(torch.Tensor[NxHxW])): + Multi-camera masks of the input to the 2D model. + targets_3d (torch.Tensor[NxcubeLxcubeWxcubeH]): + Ground-truth 3D heatmap of human centers. + input_heatmaps (list(torch.Tensor[NxKxHxW])): + Multi-camera feature_maps when the 2D model is not available. + Default: None. + **kwargs: + + Returns: + dict: if 'return_loss' is true, then return losses. + Otherwise, return predicted poses, human centers and sample_id + + """ + if return_loss: + return self.forward_train(img, img_metas, targets, masks, + targets_3d, input_heatmaps) + else: + return self.forward_test(img, img_metas, input_heatmaps) + + def train_step(self, data_batch, optimizer, **kwargs): + """The iteration step during training. + + This method defines an iteration step during training, except for the + back propagation and optimizer updating, which are done in an optimizer + hook. 
Note that in some complicated cases or models, the whole process + including back propagation and optimizer updating is also defined in + this method, such as GAN. + + Args: + data_batch (dict): The output of dataloader. + optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of + runner is passed to ``train_step()``. This argument is unused + and reserved. + + Returns: + dict: It should contain at least 3 keys: ``loss``, ``log_vars``, + ``num_samples``. + ``loss`` is a tensor for back propagation, which can be a + weighted sum of multiple losses. + ``log_vars`` contains all the variables to be sent to the + logger. + ``num_samples`` indicates the batch size (when the model is + DDP, it means the batch size on each GPU), which is used for + averaging the logs. + """ + losses = self.forward(**data_batch) + + loss, log_vars = self._parse_losses(losses) + if 'img' in data_batch: + batch_size = data_batch['img'][0].shape[0] + else: + assert 'input_heatmaps' in data_batch + batch_size = data_batch['input_heatmaps'][0][0].shape[0] + + outputs = dict(loss=loss, log_vars=log_vars, num_samples=batch_size) + + return outputs + + def forward_train(self, + img, + img_metas, + targets=None, + masks=None, + targets_3d=None, + input_heatmaps=None): + """ + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + feature_maps width: W + feature_maps height: H + volume_length: cubeL + volume_width: cubeW + volume_height: cubeH + + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. + img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + targets (list(torch.Tensor[NxKxHxW])): + Multi-camera target feature_maps of the 2D model. + masks (list(torch.Tensor[NxHxW])): + Multi-camera masks of the input to the 2D model. + targets_3d (torch.Tensor[NxcubeLxcubeWxcubeH]): + Ground-truth 3D heatmap of human centers. + input_heatmaps (list(torch.Tensor[NxKxHxW])): + Multi-camera feature_maps when the 2D model is not available. + Default: None. + + Returns: + dict: losses. + + """ + if self.backbone is None: + assert input_heatmaps is not None + feature_maps = [] + for input_heatmap in input_heatmaps: + feature_maps.append(input_heatmap[0]) + else: + feature_maps = [] + assert isinstance(img, list) + for img_ in img: + feature_maps.append(self.backbone.forward_dummy(img_)[0]) + + losses = dict() + human_candidates, human_loss = self.human_detector.forward_train( + None, img_metas, feature_maps, targets_3d, return_preds=True) + losses.update(human_loss) + + pose_loss = self.pose_regressor( + None, + img_metas, + return_loss=True, + feature_maps=feature_maps, + human_candidates=human_candidates) + losses.update(pose_loss) + + if not self.freeze_2d: + losses_2d = {} + heatmaps_tensor = torch.cat(feature_maps, dim=0) + targets_tensor = torch.cat(targets, dim=0) + masks_tensor = torch.cat(masks, dim=0) + losses_2d_ = self.backbone.get_loss(heatmaps_tensor, + targets_tensor, masks_tensor) + for k, v in losses_2d_.items(): + losses_2d[k + '_2d'] = v + losses.update(losses_2d) + + return losses + + def forward_test( + self, + img, + img_metas, + input_heatmaps=None, + ): + """ + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + feature_maps width: W + feature_maps height: H + volume_length: cubeL + volume_width: cubeW + volume_height: cubeH + + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. 
+ img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + input_heatmaps (list(torch.Tensor[NxKxHxW])): + Multi-camera feature_maps when the 2D model is not available. + Default: None. + + Returns: + dict: predicted poses, human centers and sample_id + + """ + if self.backbone is None: + assert input_heatmaps is not None + feature_maps = [] + for input_heatmap in input_heatmaps: + feature_maps.append(input_heatmap[0]) + else: + feature_maps = [] + assert isinstance(img, list) + for img_ in img: + feature_maps.append(self.backbone.forward_dummy(img_)[0]) + + human_candidates = self.human_detector.forward_test( + None, img_metas, feature_maps) + + human_poses = self.pose_regressor( + None, + img_metas, + return_loss=False, + feature_maps=feature_maps, + human_candidates=human_candidates) + + result = {} + result['pose_3d'] = human_poses.cpu().numpy() + result['human_detection_3d'] = human_candidates.cpu().numpy() + result['sample_id'] = [img_meta['sample_id'] for img_meta in img_metas] + + return result + + def show_result(self, **kwargs): + """Visualize the results.""" + raise NotImplementedError + + def forward_dummy(self, img, input_heatmaps=None, num_candidates=5): + """Used for computing network FLOPs.""" + if self.backbone is None: + assert input_heatmaps is not None + feature_maps = [] + for input_heatmap in input_heatmaps: + feature_maps.append(input_heatmap[0]) + else: + feature_maps = [] + assert isinstance(img, list) + for img_ in img: + feature_maps.append(self.backbone.forward_dummy(img_)[0]) + + _ = self.human_detector.forward_dummy(feature_maps) + + _ = self.pose_regressor.forward_dummy(feature_maps, num_candidates) + + +@POSENETS.register_module() +class VoxelSinglePose(BasePose): + """VoxelPose Please refer to the `paper ` + for details. + + Args: + image_size (list): input size of the 2D model. + heatmap_size (list): output size of the 2D model. + sub_space_size (list): Size of the cuboid human proposal. + sub_cube_size (list): Size of the input volume to the pose net. + pose_net (ConfigDict): Dictionary to construct the pose net. + pose_head (ConfigDict): Dictionary to construct the pose head. + train_cfg (ConfigDict): Config for training. Default: None. + test_cfg (ConfigDict): Config for testing. Default: None. + """ + + def __init__( + self, + image_size, + heatmap_size, + sub_space_size, + sub_cube_size, + num_joints, + pose_net, + pose_head, + train_cfg=None, + test_cfg=None, + ): + super(VoxelSinglePose, self).__init__() + self.project_layer = ProjectLayer(image_size, heatmap_size) + self.pose_net = builder.build_backbone(pose_net) + self.pose_head = builder.build_head(pose_head) + + self.sub_space_size = sub_space_size + self.sub_cube_size = sub_cube_size + + self.num_joints = num_joints + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def forward(self, + img, + img_metas, + return_loss=True, + feature_maps=None, + human_candidates=None, + **kwargs): + """ + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + feature_maps width: W + feature_maps height: H + volume_length: cubeL + volume_width: cubeW + volume_height: cubeH + + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. + feature_maps (list(torch.Tensor[NxCxHxW])): + Multi-camera input feature_maps. + img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + human_candidates (torch.Tensor[NxPx5]): + Human candidates. 
+ return_loss: Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + + """ + if return_loss: + return self.forward_train(img, img_metas, feature_maps, + human_candidates) + else: + return self.forward_test(img, img_metas, feature_maps, + human_candidates) + + def forward_train(self, + img, + img_metas, + feature_maps=None, + human_candidates=None, + return_preds=False, + **kwargs): + """Defines the computation performed at training. + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + feature_maps width: W + feature_maps height: H + volume_length: cubeL + volume_width: cubeW + volume_height: cubeH + + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. + feature_maps (list(torch.Tensor[NxCxHxW])): + Multi-camera input feature_maps. + img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + human_candidates (torch.Tensor[NxPx5]): + Human candidates. + return_preds (bool): Whether to return prediction results + + Returns: + dict: losses. + + """ + batch_size, num_candidates, _ = human_candidates.shape + pred = human_candidates.new_zeros(batch_size, num_candidates, + self.num_joints, 5) + pred[:, :, :, 3:] = human_candidates[:, :, None, 3:] + + device = feature_maps[0].device + gt_3d = torch.stack([ + torch.tensor(img_meta['joints_3d'], device=device) + for img_meta in img_metas + ]) + gt_3d_vis = torch.stack([ + torch.tensor(img_meta['joints_3d_visible'], device=device) + for img_meta in img_metas + ]) + valid_preds = [] + valid_targets = [] + valid_weights = [] + + for n in range(num_candidates): + index = pred[:, n, 0, 3] >= 0 + num_valid = index.sum() + if num_valid > 0: + pose_input_cube, coordinates \ + = self.project_layer(feature_maps, + img_metas, + self.sub_space_size, + human_candidates[:, n, :3], + self.sub_cube_size) + pose_heatmaps_3d = self.pose_net(pose_input_cube) + pose_3d = self.pose_head(pose_heatmaps_3d[index], + coordinates[index]) + + pred[index, n, :, 0:3] = pose_3d.detach() + valid_targets.append(gt_3d[index, pred[index, n, 0, 3].long()]) + valid_weights.append(gt_3d_vis[index, pred[index, n, 0, + 3].long(), :, + 0:1].float()) + valid_preds.append(pose_3d) + + losses = dict() + if len(valid_preds) > 0: + valid_targets = torch.cat(valid_targets, dim=0) + valid_weights = torch.cat(valid_weights, dim=0) + valid_preds = torch.cat(valid_preds, dim=0) + losses.update( + self.pose_head.get_loss(valid_preds, valid_targets, + valid_weights)) + else: + pose_input_cube = feature_maps[0].new_zeros( + batch_size, self.num_joints, *self.sub_cube_size) + coordinates = feature_maps[0].new_zeros(batch_size, + *self.sub_cube_size, + 3).view(batch_size, -1, 3) + pseudo_targets = feature_maps[0].new_zeros(batch_size, + self.num_joints, 3) + pseudo_weights = feature_maps[0].new_zeros(batch_size, + self.num_joints, 1) + pose_heatmaps_3d = self.pose_net(pose_input_cube) + pose_3d = self.pose_head(pose_heatmaps_3d, coordinates) + losses.update( + self.pose_head.get_loss(pose_3d, pseudo_targets, + pseudo_weights)) + if return_preds: + return pred, losses + else: + return losses + + def forward_test(self, + img, + img_metas, + feature_maps=None, + human_candidates=None, + **kwargs): + """Defines the computation performed at training. 
+ Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + feature_maps width: W + feature_maps height: H + volume_length: cubeL + volume_width: cubeW + volume_height: cubeH + + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. + feature_maps (list(torch.Tensor[NxCxHxW])): + Multi-camera input feature_maps. + img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + human_candidates (torch.Tensor[NxPx5]): + Human candidates. + + Returns: + dict: predicted poses, human centers and sample_id + + """ + batch_size, num_candidates, _ = human_candidates.shape + pred = human_candidates.new_zeros(batch_size, num_candidates, + self.num_joints, 5) + pred[:, :, :, 3:] = human_candidates[:, :, None, 3:] + + for n in range(num_candidates): + index = pred[:, n, 0, 3] >= 0 + num_valid = index.sum() + if num_valid > 0: + pose_input_cube, coordinates \ + = self.project_layer(feature_maps, + img_metas, + self.sub_space_size, + human_candidates[:, n, :3], + self.sub_cube_size) + pose_heatmaps_3d = self.pose_net(pose_input_cube) + pose_3d = self.pose_head(pose_heatmaps_3d[index], + coordinates[index]) + + pred[index, n, :, 0:3] = pose_3d.detach() + + return pred + + def show_result(self, **kwargs): + """Visualize the results.""" + raise NotImplementedError + + def forward_dummy(self, feature_maps, num_candidates=5): + """Used for computing network FLOPs.""" + batch_size, num_channels = feature_maps[0].shape + pose_input_cube = feature_maps[0].new_zeros(batch_size, num_channels, + *self.sub_cube_size) + for n in range(num_candidates): + _ = self.pose_net(pose_input_cube) + + +@POSENETS.register_module() +class VoxelCenterDetector(BasePose): + """Detect human center by 3D CNN on voxels. + + Please refer to the + `paper ` for details. + Args: + image_size (list): input size of the 2D model. + heatmap_size (list): output size of the 2D model. + space_size (list): Size of the 3D space. + cube_size (list): Size of the input volume to the 3D CNN. + space_center (list): Coordinate of the center of the 3D space. + center_net (ConfigDict): Dictionary to construct the center net. + center_head (ConfigDict): Dictionary to construct the center head. + train_cfg (ConfigDict): Config for training. Default: None. + test_cfg (ConfigDict): Config for testing. Default: None. 
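+
+    Example:
+        Geometry arguments sketch (millimetres for the 3D space, voxel
+        counts for the cube; the numbers below are only indicative of the
+        CMU Panoptic setup and should be taken from the dataset config)::
+
+            space_size = [8000, 8000, 2000]   # x/y/z extent of the scene
+            space_center = [0, -500, 800]     # scene centre in world coords
+            cube_size = [80, 80, 20]          # voxel grid fed to center_net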
+ """ + + def __init__( + self, + image_size, + heatmap_size, + space_size, + cube_size, + space_center, + center_net, + center_head, + train_cfg=None, + test_cfg=None, + ): + super(VoxelCenterDetector, self).__init__() + self.project_layer = ProjectLayer(image_size, heatmap_size) + self.center_net = builder.build_backbone(center_net) + self.center_head = builder.build_head(center_head) + + self.space_size = space_size + self.cube_size = cube_size + self.space_center = space_center + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def assign2gt(self, center_candidates, gt_centers, gt_num_persons): + """"Assign gt id to each valid human center candidate.""" + det_centers = center_candidates[..., :3] + batch_size = center_candidates.shape[0] + cand_num = center_candidates.shape[1] + cand2gt = torch.zeros(batch_size, cand_num) + + for i in range(batch_size): + cand = det_centers[i].view(cand_num, 1, -1) + gt = gt_centers[None, i, :gt_num_persons[i]] + + dist = torch.sqrt(torch.sum((cand - gt)**2, dim=-1)) + min_dist, min_gt = torch.min(dist, dim=-1) + + cand2gt[i] = min_gt + cand2gt[i][min_dist > self.train_cfg['dist_threshold']] = -1.0 + + center_candidates[:, :, 3] = cand2gt + + return center_candidates + + def forward(self, + img, + img_metas, + return_loss=True, + feature_maps=None, + targets_3d=None): + """ + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + heatmaps width: W + heatmaps height: H + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. + img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + return_loss: Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + targets_3d (torch.Tensor[NxcubeLxcubeWxcubeH]): + Ground-truth 3D heatmap of human centers. + feature_maps (list(torch.Tensor[NxKxHxW])): + Multi-camera feature_maps. + Returns: + dict: if 'return_loss' is true, then return losses. + Otherwise, return predicted poses + """ + if return_loss: + return self.forward_train(img, img_metas, feature_maps, targets_3d) + else: + return self.forward_test(img, img_metas, feature_maps) + + def forward_train(self, + img, + img_metas, + feature_maps=None, + targets_3d=None, + return_preds=False): + """ + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + heatmaps width: W + heatmaps height: H + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. + img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + targets_3d (torch.Tensor[NxcubeLxcubeWxcubeH]): + Ground-truth 3D heatmap of human centers. + feature_maps (list(torch.Tensor[NxKxHxW])): + Multi-camera feature_maps. + return_preds (bool): Whether to return prediction results + Returns: + dict: if 'return_pred' is true, then return losses + and human centers. 
Otherwise, return losses only + """ + initial_cubes, _ = self.project_layer(feature_maps, img_metas, + self.space_size, + [self.space_center], + self.cube_size) + center_heatmaps_3d = self.center_net(initial_cubes) + center_heatmaps_3d = center_heatmaps_3d.squeeze(1) + center_candidates = self.center_head(center_heatmaps_3d) + + device = center_candidates.device + + gt_centers = torch.stack([ + torch.tensor(img_meta['roots_3d'], device=device) + for img_meta in img_metas + ]) + gt_num_persons = torch.stack([ + torch.tensor(img_meta['num_persons'], device=device) + for img_meta in img_metas + ]) + center_candidates = self.assign2gt(center_candidates, gt_centers, + gt_num_persons) + + losses = dict() + losses.update( + self.center_head.get_loss(center_heatmaps_3d, targets_3d)) + + if return_preds: + return center_candidates, losses + else: + return losses + + def forward_test(self, img, img_metas, feature_maps=None): + """ + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + heatmaps width: W + heatmaps height: H + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. + img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + feature_maps (list(torch.Tensor[NxKxHxW])): + Multi-camera feature_maps. + Returns: + human centers + """ + initial_cubes, _ = self.project_layer(feature_maps, img_metas, + self.space_size, + [self.space_center], + self.cube_size) + center_heatmaps_3d = self.center_net(initial_cubes) + center_heatmaps_3d = center_heatmaps_3d.squeeze(1) + center_candidates = self.center_head(center_heatmaps_3d) + center_candidates[..., 3] = \ + (center_candidates[..., 4] > + self.test_cfg['center_threshold']).float() - 1.0 + + return center_candidates + + def show_result(self, **kwargs): + """Visualize the results.""" + raise NotImplementedError + + def forward_dummy(self, feature_maps): + """Used for computing network FLOPs.""" + batch_size, num_channels, _, _ = feature_maps[0].shape + initial_cubes = feature_maps[0].new_zeros(batch_size, num_channels, + *self.cube_size) + _ = self.center_net(initial_cubes) diff --git a/mmpose/models/detectors/pose_lifter.py b/mmpose/models/detectors/pose_lifter.py new file mode 100644 index 0000000..ace6b9f --- /dev/null +++ b/mmpose/models/detectors/pose_lifter.py @@ -0,0 +1,392 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import mmcv +import numpy as np +from mmcv.utils.misc import deprecated_api_warning + +from mmpose.core import imshow_bboxes, imshow_keypoints, imshow_keypoints_3d +from .. import builder +from ..builder import POSENETS +from .base import BasePose + +try: + from mmcv.runner import auto_fp16 +except ImportError: + warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0' + 'Please install mmcv>=1.1.4') + from mmpose.core import auto_fp16 + + +@POSENETS.register_module() +class PoseLifter(BasePose): + """Pose lifter that lifts 2D pose to 3D pose. + + The basic model is a pose model that predicts root-relative pose. If + traj_head is not None, a trajectory model that predicts absolute root joint + position is also built. + + Args: + backbone (dict): Config for the backbone of pose model. + neck (dict|None): Config for the neck of pose model. + keypoint_head (dict|None): Config for the head of pose model. + traj_backbone (dict|None): Config for the backbone of trajectory model. 
+ If traj_backbone is None and traj_head is not None, trajectory + model will share backbone with pose model. + traj_neck (dict|None): Config for the neck of trajectory model. + traj_head (dict|None): Config for the head of trajectory model. + loss_semi (dict|None): Config for semi-supervision loss. + train_cfg (dict|None): Config for keypoint head during training. + test_cfg (dict|None): Config for keypoint head during testing. + pretrained (str|None): Path to pretrained weights. + """ + + def __init__(self, + backbone, + neck=None, + keypoint_head=None, + traj_backbone=None, + traj_neck=None, + traj_head=None, + loss_semi=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super().__init__() + self.fp16_enabled = False + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + # pose model + self.backbone = builder.build_backbone(backbone) + + if neck is not None: + self.neck = builder.build_neck(neck) + + if keypoint_head is not None: + keypoint_head['train_cfg'] = train_cfg + keypoint_head['test_cfg'] = test_cfg + self.keypoint_head = builder.build_head(keypoint_head) + + # trajectory model + if traj_head is not None: + self.traj_head = builder.build_head(traj_head) + + if traj_backbone is not None: + self.traj_backbone = builder.build_backbone(traj_backbone) + else: + self.traj_backbone = self.backbone + + if traj_neck is not None: + self.traj_neck = builder.build_neck(traj_neck) + + # semi-supervised learning + self.semi = loss_semi is not None + if self.semi: + assert keypoint_head is not None and traj_head is not None + self.loss_semi = builder.build_loss(loss_semi) + + self.init_weights(pretrained=pretrained) + + @property + def with_neck(self): + """Check if has keypoint_neck.""" + return hasattr(self, 'neck') + + @property + def with_keypoint(self): + """Check if has keypoint_head.""" + return hasattr(self, 'keypoint_head') + + @property + def with_traj_backbone(self): + """Check if has trajectory_backbone.""" + return hasattr(self, 'traj_backbone') + + @property + def with_traj_neck(self): + """Check if has trajectory_neck.""" + return hasattr(self, 'traj_neck') + + @property + def with_traj(self): + """Check if has trajectory_head.""" + return hasattr(self, 'traj_head') + + @property + def causal(self): + if hasattr(self.backbone, 'causal'): + return self.backbone.causal + else: + raise AttributeError('A PoseLifter\'s backbone should have ' + 'the bool attribute "causal" to indicate if' + 'it performs causal inference.') + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + self.backbone.init_weights(pretrained) + if self.with_neck: + self.neck.init_weights() + if self.with_keypoint: + self.keypoint_head.init_weights() + if self.with_traj_backbone: + self.traj_backbone.init_weights(pretrained) + if self.with_traj_neck: + self.traj_neck.init_weights() + if self.with_traj: + self.traj_head.init_weights() + + @auto_fp16(apply_to=('input', )) + def forward(self, + input, + target=None, + target_weight=None, + metas=None, + return_loss=True, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. + + Note: + - batch_size: N + - num_input_keypoints: Ki + - input_keypoint_dim: Ci + - input_sequence_len: Ti + - num_output_keypoints: Ko + - output_keypoint_dim: Co + - input_sequence_len: To + + Args: + input (torch.Tensor[NxKixCixTi]): Input keypoint coordinates. + target (torch.Tensor[NxKoxCoxTo]): Output keypoint coordinates. + Defaults to None. 
+ target_weight (torch.Tensor[NxKox1]): Weights across different + joint types. Defaults to None. + metas (list(dict)): Information about data augmentation + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + + Returns: + dict|Tensor: If `reutrn_loss` is true, return losses. \ + Otherwise return predicted poses. + """ + if return_loss: + return self.forward_train(input, target, target_weight, metas, + **kwargs) + else: + return self.forward_test(input, metas, **kwargs) + + def forward_train(self, input, target, target_weight, metas, **kwargs): + """Defines the computation performed at every call when training.""" + assert input.size(0) == len(metas) + + # supervised learning + # pose model + features = self.backbone(input) + if self.with_neck: + features = self.neck(features) + if self.with_keypoint: + output = self.keypoint_head(features) + + losses = dict() + if self.with_keypoint: + keypoint_losses = self.keypoint_head.get_loss( + output, target, target_weight) + keypoint_accuracy = self.keypoint_head.get_accuracy( + output, target, target_weight, metas) + losses.update(keypoint_losses) + losses.update(keypoint_accuracy) + + # trajectory model + if self.with_traj: + traj_features = self.traj_backbone(input) + if self.with_traj_neck: + traj_features = self.traj_neck(traj_features) + traj_output = self.traj_head(traj_features) + + traj_losses = self.traj_head.get_loss(traj_output, + kwargs['traj_target'], None) + losses.update(traj_losses) + + # semi-supervised learning + if self.semi: + ul_input = kwargs['unlabeled_input'] + ul_features = self.backbone(ul_input) + if self.with_neck: + ul_features = self.neck(ul_features) + ul_output = self.keypoint_head(ul_features) + + ul_traj_features = self.traj_backbone(ul_input) + if self.with_traj_neck: + ul_traj_features = self.traj_neck(ul_traj_features) + ul_traj_output = self.traj_head(ul_traj_features) + + output_semi = dict( + labeled_pose=output, + unlabeled_pose=ul_output, + unlabeled_traj=ul_traj_output) + target_semi = dict( + unlabeled_target_2d=kwargs['unlabeled_target_2d'], + intrinsics=kwargs['intrinsics']) + + semi_losses = self.loss_semi(output_semi, target_semi) + losses.update(semi_losses) + + return losses + + def forward_test(self, input, metas, **kwargs): + """Defines the computation performed at every call when training.""" + assert input.size(0) == len(metas) + + results = {} + + features = self.backbone(input) + if self.with_neck: + features = self.neck(features) + if self.with_keypoint: + output = self.keypoint_head.inference_model(features) + keypoint_result = self.keypoint_head.decode(metas, output) + results.update(keypoint_result) + + if self.with_traj: + traj_features = self.traj_backbone(input) + if self.with_traj_neck: + traj_features = self.traj_neck(traj_features) + traj_output = self.traj_head.inference_model(traj_features) + results['traj_preds'] = traj_output + + return results + + def forward_dummy(self, input): + """Used for computing network FLOPs. See ``tools/get_flops.py``. 
+
+        Args:
+            input (torch.Tensor): Input pose
+
+        Returns:
+            Tensor: Model output
+        """
+        output = self.backbone(input)
+        if self.with_neck:
+            output = self.neck(output)
+        if self.with_keypoint:
+            output = self.keypoint_head(output)
+
+        if self.with_traj:
+            traj_features = self.traj_backbone(input)
+            if self.with_traj_neck:
+                traj_features = self.traj_neck(traj_features)
+            traj_output = self.traj_head(traj_features)
+            output = output + traj_output
+
+        return output
+
+    @deprecated_api_warning({'pose_limb_color': 'pose_link_color'},
+                            cls_name='PoseLifter')
+    def show_result(self,
+                    result,
+                    img=None,
+                    skeleton=None,
+                    pose_kpt_color=None,
+                    pose_link_color=None,
+                    radius=8,
+                    thickness=2,
+                    vis_height=400,
+                    num_instances=-1,
+                    win_name='',
+                    show=False,
+                    wait_time=0,
+                    out_file=None):
+        """Visualize 3D pose estimation results.
+
+        Args:
+            result (list[dict]): The pose estimation results containing:
+
+                - "keypoints_3d" ([K,4]): 3D keypoints
+                - "keypoints" ([K,3] or [T,K,3]): Optional for visualizing
+                    2D inputs. If a sequence is given, only the last frame
+                    will be used for visualization
+                - "bbox" ([4,] or [T,4]): Optional for visualizing 2D inputs
+                - "title" (str): title for the subplot
+            img (str or Tensor): Optional. The image to visualize 2D inputs on.
+            skeleton (list of [idx_i,idx_j]): Skeleton described by a list of
+                links, each is a pair of joint indices.
+            pose_kpt_color (np.array[Nx3]): Color of N keypoints.
+                If None, do not draw keypoints.
+            pose_link_color (np.array[Mx3]): Color of M links.
+                If None, do not draw links.
+            radius (int): Radius of circles.
+            thickness (int): Thickness of lines.
+            vis_height (int): The image height of the visualization. The width
+                will be N*vis_height depending on the number of visualized
+                items.
+            num_instances (int): Number of instances to be shown in 3D. If
+                smaller than 0, all detected instances will be shown.
+                Otherwise, the results are padded or truncated to this length.
+            win_name (str): The window name.
+            show (bool): Whether to show the image. Default: False.
+            wait_time (int): Value of waitKey param.
+                Default: 0.
+            out_file (str or None): The filename to write the image.
+                Default: None.
+
+        Returns:
+            Tensor: Visualized img, only if not `show` or `out_file`.
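+
+        A minimal usage sketch follows; the keypoint values, the ``model``
+        name and the output path are illustrative placeholders rather than
+        anything produced by this repository.
+
+        Example::
+
+            import numpy as np
+            # one instance with 17 3D joints; the 4th column is a score
+            result = [dict(
+                keypoints_3d=np.random.rand(17, 4).astype(np.float32),
+                keypoints=np.random.rand(17, 3).astype(np.float32),
+                title='example instance')]
+            # ``model`` is assumed to be an already built PoseLifter
+            # model.show_result(result, num_instances=1, out_file='vis.jpg')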
+ """ + if num_instances < 0: + assert len(result) > 0 + result = sorted(result, key=lambda x: x.get('track_id', 1e4)) + + # draw image and input 2d poses + if img is not None: + img = mmcv.imread(img) + + bbox_result = [] + pose_input_2d = [] + for res in result: + if 'bbox' in res: + bbox = np.array(res['bbox']) + if bbox.ndim != 1: + assert bbox.ndim == 2 + bbox = bbox[-1] # Get bbox from the last frame + bbox_result.append(bbox) + if 'keypoints' in res: + kpts = np.array(res['keypoints']) + if kpts.ndim != 2: + assert kpts.ndim == 3 + kpts = kpts[-1] # Get 2D keypoints from the last frame + pose_input_2d.append(kpts) + + if len(bbox_result) > 0: + bboxes = np.vstack(bbox_result) + imshow_bboxes( + img, + bboxes, + colors='green', + thickness=thickness, + show=False) + if len(pose_input_2d) > 0: + imshow_keypoints( + img, + pose_input_2d, + skeleton, + kpt_score_thr=0.3, + pose_kpt_color=pose_kpt_color, + pose_link_color=pose_link_color, + radius=radius, + thickness=thickness) + img = mmcv.imrescale(img, scale=vis_height / img.shape[0]) + + img_vis = imshow_keypoints_3d( + result, + img, + skeleton, + pose_kpt_color, + pose_link_color, + vis_height, + num_instances=num_instances) + + if show: + mmcv.visualization.imshow(img_vis, win_name, wait_time) + + if out_file is not None: + mmcv.imwrite(img_vis, out_file) + + return img_vis diff --git a/mmpose/models/detectors/posewarper.py b/mmpose/models/detectors/posewarper.py new file mode 100644 index 0000000..aa1d05f --- /dev/null +++ b/mmpose/models/detectors/posewarper.py @@ -0,0 +1,244 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np +import torch + +from ..builder import POSENETS +from .top_down import TopDown + +try: + from mmcv.runner import auto_fp16 +except ImportError: + warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0' + 'Please install mmcv>=1.1.4') + from mmpose.core import auto_fp16 + + +@POSENETS.register_module() +class PoseWarper(TopDown): + """Top-down pose detectors for multi-frame settings for video inputs. + + `"Learning temporal pose estimation from sparsely-labeled videos" + `_. + + A child class of TopDown detector. The main difference between PoseWarper + and TopDown lies in that the former takes a list of tensors as input image + while the latter takes a single tensor as input image in forward method. + + Args: + backbone (dict): Backbone modules to extract features. + neck (dict): intermediate modules to transform features. + keypoint_head (dict): Keypoint head to process feature. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + pretrained (str): Path to the pretrained models. + loss_pose (None): Deprecated arguments. Please use + `loss_keypoint` for heads instead. 
+ concat_tensors (bool): Whether to concat the tensors on the batch dim, + which can speed up, Default: True + """ + + def __init__(self, + backbone, + neck=None, + keypoint_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + loss_pose=None, + concat_tensors=True): + super().__init__( + backbone=backbone, + neck=neck, + keypoint_head=keypoint_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + loss_pose=loss_pose) + self.concat_tensors = concat_tensors + + @auto_fp16(apply_to=('img', )) + def forward(self, + img, + target=None, + target_weight=None, + img_metas=None, + return_loss=True, + return_heatmap=False, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. Note this setting will change the expected inputs. + When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + + Note: + - number of frames: F + - batch_size: N + - num_keypoints: K + - num_img_channel: C (Default: 3) + - img height: imgH + - img width: imgW + - heatmaps height: H + - heatmaps weight: W + + Args: + imgs (list[F,torch.Tensor[N,C,imgH,imgW]]): multiple input frames + target (torch.Tensor[N,K,H,W]): Target heatmaps for one frame. + target_weight (torch.Tensor[N,K,1]): Weights across + different joint types. + img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: paths to multiple video frames + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + return_heatmap (bool) : Option to return heatmap. + + Returns: + dict|tuple: if `return loss` is true, then return losses. \ + Otherwise, return predicted poses, boxes, image paths \ + and heatmaps. 
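+
+        A small sketch of the multi-frame input convention follows; the
+        sizes are illustrative and only show how ``concat_tensors=True``
+        stacks the frames on the batch dimension.
+
+        Example::
+
+            import torch
+            num_frames, N, C, H, W = 3, 2, 3, 256, 192
+            imgs = [torch.randn(N, C, H, W) for _ in range(num_frames)]
+            # frames are concatenated before the shared backbone is applied
+            batched = torch.cat(imgs, 0)
+            assert batched.shape == (num_frames * N, C, H, W)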
+ """ + if return_loss: + return self.forward_train(img, target, target_weight, img_metas, + **kwargs) + return self.forward_test( + img, img_metas, return_heatmap=return_heatmap, **kwargs) + + def forward_train(self, imgs, target, target_weight, img_metas, **kwargs): + """Defines the computation performed at every call when training.""" + # imgs (list[Fxtorch.Tensor[NxCximgHximgW]]): multiple input frames + assert imgs[0].size(0) == len(img_metas) + num_frames = len(imgs) + frame_weight = img_metas[0]['frame_weight'] + + assert num_frames == len(frame_weight), f'The number of frames ' \ + f'({num_frames}) and the length of weights for each frame ' \ + f'({len(frame_weight)}) must match' + + if self.concat_tensors: + features = [self.backbone(torch.cat(imgs, 0))] + else: + features = [self.backbone(img) for img in imgs] + + if self.with_neck: + features = self.neck(features, frame_weight=frame_weight) + + if self.with_keypoint: + output = self.keypoint_head(features) + + # if return loss + losses = dict() + if self.with_keypoint: + keypoint_losses = self.keypoint_head.get_loss( + output, target, target_weight) + losses.update(keypoint_losses) + keypoint_accuracy = self.keypoint_head.get_accuracy( + output, target, target_weight) + losses.update(keypoint_accuracy) + + return losses + + def forward_test(self, imgs, img_metas, return_heatmap=False, **kwargs): + """Defines the computation performed at every call when testing.""" + # imgs (list[Fxtorch.Tensor[NxCximgHximgW]]): multiple input frames + assert imgs[0].size(0) == len(img_metas) + num_frames = len(imgs) + frame_weight = img_metas[0]['frame_weight'] + + assert num_frames == len(frame_weight), f'The number of frames ' \ + f'({num_frames}) and the length of weights for each frame ' \ + f'({len(frame_weight)}) must match' + + batch_size, _, img_height, img_width = imgs[0].shape + + if batch_size > 1: + assert 'bbox_id' in img_metas[0] + + result = {} + + if self.concat_tensors: + features = [self.backbone(torch.cat(imgs, 0))] + else: + features = [self.backbone(img) for img in imgs] + + if self.with_neck: + features = self.neck(features, frame_weight=frame_weight) + + if self.with_keypoint: + output_heatmap = self.keypoint_head.inference_model( + features, flip_pairs=None) + + if self.test_cfg.get('flip_test', True): + imgs_flipped = [img.flip(3) for img in imgs] + + if self.concat_tensors: + features_flipped = [self.backbone(torch.cat(imgs_flipped, 0))] + else: + features_flipped = [ + self.backbone(img_flipped) for img_flipped in imgs_flipped + ] + + if self.with_neck: + features_flipped = self.neck( + features_flipped, frame_weight=frame_weight) + + if self.with_keypoint: + output_flipped_heatmap = self.keypoint_head.inference_model( + features_flipped, img_metas[0]['flip_pairs']) + output_heatmap = (output_heatmap + + output_flipped_heatmap) * 0.5 + + if self.with_keypoint: + keypoint_result = self.keypoint_head.decode( + img_metas, output_heatmap, img_size=[img_width, img_height]) + result.update(keypoint_result) + + if not return_heatmap: + output_heatmap = None + + result['output_heatmap'] = output_heatmap + + return result + + def forward_dummy(self, img): + """Used for computing network FLOPs. + + See ``tools/get_flops.py``. + + Args: + img (torch.Tensor[N,C,imgH,imgW], or list|tuple of tensors): + multiple input frames, N >= 2. + + Returns: + Tensor: Output heatmaps. 
+ """ + # concat tensors if they are in a list + if isinstance(img, (list, tuple)): + img = torch.cat(img, 0) + + batch_size = img.size(0) + assert batch_size > 1, 'Input batch size to PoseWarper ' \ + 'should be larger than 1.' + if batch_size == 2: + warnings.warn('Current batch size: 2, for pytorch2onnx and ' + 'getting flops both.') + else: + warnings.warn( + f'Current batch size: {batch_size}, for getting flops only.') + + frame_weight = np.random.uniform(0, 1, batch_size) + output = [self.backbone(img)] + + if self.with_neck: + output = self.neck(output, frame_weight=frame_weight) + if self.with_keypoint: + output = self.keypoint_head(output) + return output diff --git a/mmpose/models/detectors/top_down.py b/mmpose/models/detectors/top_down.py new file mode 100644 index 0000000..af0ab51 --- /dev/null +++ b/mmpose/models/detectors/top_down.py @@ -0,0 +1,307 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import mmcv +import numpy as np +from mmcv.image import imwrite +from mmcv.utils.misc import deprecated_api_warning +from mmcv.visualization.image import imshow + +from mmpose.core import imshow_bboxes, imshow_keypoints +from .. import builder +from ..builder import POSENETS +from .base import BasePose + +try: + from mmcv.runner import auto_fp16 +except ImportError: + warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0' + 'Please install mmcv>=1.1.4') + from mmpose.core import auto_fp16 + + +@POSENETS.register_module() +class TopDown(BasePose): + """Top-down pose detectors. + + Args: + backbone (dict): Backbone modules to extract feature. + keypoint_head (dict): Keypoint head to process feature. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + pretrained (str): Path to the pretrained models. + loss_pose (None): Deprecated arguments. Please use + `loss_keypoint` for heads instead. + """ + + def __init__(self, + backbone, + neck=None, + keypoint_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + loss_pose=None): + super().__init__() + self.fp16_enabled = False + + self.backbone = builder.build_backbone(backbone) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if neck is not None: + self.neck = builder.build_neck(neck) + + if keypoint_head is not None: + keypoint_head['train_cfg'] = train_cfg + keypoint_head['test_cfg'] = test_cfg + + if 'loss_keypoint' not in keypoint_head and loss_pose is not None: + warnings.warn( + '`loss_pose` for TopDown is deprecated, ' + 'use `loss_keypoint` for heads instead. See ' + 'https://github.com/open-mmlab/mmpose/pull/382' + ' for more information.', DeprecationWarning) + keypoint_head['loss_keypoint'] = loss_pose + + self.keypoint_head = builder.build_head(keypoint_head) + + self.init_weights(pretrained=pretrained) + + @property + def with_neck(self): + """Check if has neck.""" + return hasattr(self, 'neck') + + @property + def with_keypoint(self): + """Check if has keypoint_head.""" + return hasattr(self, 'keypoint_head') + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + self.backbone.init_weights(pretrained) + if self.with_neck: + self.neck.init_weights() + if self.with_keypoint: + self.keypoint_head.init_weights() + + @auto_fp16(apply_to=('img', )) + def forward(self, + img, + target=None, + target_weight=None, + img_metas=None, + return_loss=True, + return_heatmap=False, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. 
Note this setting will change the expected inputs. + When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + + Note: + - batch_size: N + - num_keypoints: K + - num_img_channel: C (Default: 3) + - img height: imgH + - img width: imgW + - heatmaps height: H + - heatmaps weight: W + + Args: + img (torch.Tensor[NxCximgHximgW]): Input images. + target (torch.Tensor[NxKxHxW]): Target heatmaps. + target_weight (torch.Tensor[NxKx1]): Weights across + different joint types. + img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + return_heatmap (bool) : Option to return heatmap. + + Returns: + dict|tuple: if `return loss` is true, then return losses. \ + Otherwise, return predicted poses, boxes, image paths \ + and heatmaps. + """ + if return_loss: + return self.forward_train(img, target, target_weight, img_metas, + **kwargs) + return self.forward_test( + img, img_metas, return_heatmap=return_heatmap, **kwargs) + + def forward_train(self, img, target, target_weight, img_metas, **kwargs): + """Defines the computation performed at every call when training.""" + output = self.backbone(img) + if self.with_neck: + output = self.neck(output) + if self.with_keypoint: + output = self.keypoint_head(output) + + # if return loss + losses = dict() + if self.with_keypoint: + keypoint_losses = self.keypoint_head.get_loss( + output, target, target_weight) + losses.update(keypoint_losses) + keypoint_accuracy = self.keypoint_head.get_accuracy( + output, target, target_weight) + losses.update(keypoint_accuracy) + + return losses + + def forward_test(self, img, img_metas, return_heatmap=False, **kwargs): + """Defines the computation performed at every call when testing.""" + assert img.size(0) == len(img_metas) + batch_size, _, img_height, img_width = img.shape + if batch_size > 1: + assert 'bbox_id' in img_metas[0] + + result = {} + + features = self.backbone(img) + if self.with_neck: + features = self.neck(features) + if self.with_keypoint: + output_heatmap = self.keypoint_head.inference_model( + features, flip_pairs=None) + + if self.test_cfg.get('flip_test', True): + img_flipped = img.flip(3) + features_flipped = self.backbone(img_flipped) + if self.with_neck: + features_flipped = self.neck(features_flipped) + if self.with_keypoint: + output_flipped_heatmap = self.keypoint_head.inference_model( + features_flipped, img_metas[0]['flip_pairs']) + output_heatmap = (output_heatmap + + output_flipped_heatmap) * 0.5 + + if self.with_keypoint: + keypoint_result = self.keypoint_head.decode( + img_metas, output_heatmap, img_size=[img_width, img_height]) + result.update(keypoint_result) + + if not return_heatmap: + output_heatmap = None + + result['output_heatmap'] = output_heatmap + + return result + + def forward_dummy(self, img): + """Used for computing network FLOPs. + + See ``tools/get_flops.py``. + + Args: + img (torch.Tensor): Input image. + + Returns: + Tensor: Output heatmaps. 
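+
+        A minimal FLOPs-style sketch, assuming a plain ResNet-50 top-down
+        configuration; the config values below are illustrative and not
+        taken from this repository, and no pretrained weights are loaded.
+
+        Example::
+
+            import torch
+            from mmpose.models import build_posenet
+            model = build_posenet(dict(
+                type='TopDown',
+                backbone=dict(type='ResNet', depth=50),
+                keypoint_head=dict(
+                    type='TopdownHeatmapSimpleHead',
+                    in_channels=2048,
+                    out_channels=17,
+                    loss_keypoint=dict(
+                        type='JointsMSELoss', use_target_weight=True)),
+                test_cfg=dict(flip_test=True)))
+            heatmaps = model.forward_dummy(torch.randn(1, 3, 256, 192))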
+ """ + output = self.backbone(img) + if self.with_neck: + output = self.neck(output) + if self.with_keypoint: + output = self.keypoint_head(output) + return output + + @deprecated_api_warning({'pose_limb_color': 'pose_link_color'}, + cls_name='TopDown') + def show_result(self, + img, + result, + skeleton=None, + kpt_score_thr=0.3, + bbox_color='green', + pose_kpt_color=None, + pose_link_color=None, + text_color='white', + radius=4, + thickness=1, + font_scale=0.5, + bbox_thickness=1, + win_name='', + show=False, + show_keypoint_weight=False, + wait_time=0, + out_file=None): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (list[dict]): The results to draw over `img` + (bbox_result, pose_result). + skeleton (list[list]): The connection of keypoints. + skeleton is 0-based indexing. + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. + If None, do not draw keypoints. + pose_link_color (np.array[Mx3]): Color of M links. + If None, do not draw links. + text_color (str or tuple or :obj:`Color`): Color of texts. + radius (int): Radius of circles. + thickness (int): Thickness of lines. + font_scale (float): Font scales of texts. + win_name (str): The window name. + show (bool): Whether to show the image. Default: False. + show_keypoint_weight (bool): Whether to change the transparency + using the predicted confidence scores of keypoints. + wait_time (int): Value of waitKey param. + Default: 0. + out_file (str or None): The filename to write the image. + Default: None. + + Returns: + Tensor: Visualized img, only if not `show` or `out_file`. + """ + img = mmcv.imread(img) + img = img.copy() + + bbox_result = [] + bbox_labels = [] + pose_result = [] + for res in result: + if 'bbox' in res: + bbox_result.append(res['bbox']) + bbox_labels.append(res.get('label', None)) + pose_result.append(res['keypoints']) + + if bbox_result: + bboxes = np.vstack(bbox_result) + # draw bounding boxes + imshow_bboxes( + img, + bboxes, + labels=bbox_labels, + colors=bbox_color, + text_color=text_color, + thickness=bbox_thickness, + font_scale=font_scale, + show=False) + + if pose_result: + imshow_keypoints(img, pose_result, skeleton, kpt_score_thr, + pose_kpt_color, pose_link_color, radius, + thickness) + + if show: + imshow(img, win_name, wait_time) + + if out_file is not None: + imwrite(img, out_file) + + return img diff --git a/mmpose/models/detectors/top_down_moe.py b/mmpose/models/detectors/top_down_moe.py new file mode 100644 index 0000000..7d499b7 --- /dev/null +++ b/mmpose/models/detectors/top_down_moe.py @@ -0,0 +1,351 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch +import torch.nn as nn + +import mmcv +import numpy as np +from mmcv.image import imwrite +from mmcv.utils.misc import deprecated_api_warning +from mmcv.visualization.image import imshow + +from mmpose.core import imshow_bboxes, imshow_keypoints +from .. import builder +from ..builder import POSENETS +from .base import BasePose + +try: + from mmcv.runner import auto_fp16 +except ImportError: + warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0' + 'Please install mmcv>=1.1.4') + from mmpose.core import auto_fp16 + + +@POSENETS.register_module() +class TopDownMoE(BasePose): + """Top-down pose detectors. + + Args: + backbone (dict): Backbone modules to extract feature. 
+ keypoint_head (dict): Keypoint head to process feature. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + pretrained (str): Path to the pretrained models. + loss_pose (None): Deprecated arguments. Please use + `loss_keypoint` for heads instead. + """ + + def __init__(self, + backbone, + neck=None, + keypoint_head=None, + associate_keypoint_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + loss_pose=None): + super().__init__() + self.fp16_enabled = False + + self.backbone = builder.build_backbone(backbone) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if neck is not None: + self.neck = builder.build_neck(neck) + + if keypoint_head is not None: + keypoint_head['train_cfg'] = train_cfg + keypoint_head['test_cfg'] = test_cfg + + if 'loss_keypoint' not in keypoint_head and loss_pose is not None: + warnings.warn( + '`loss_pose` for TopDown is deprecated, ' + 'use `loss_keypoint` for heads instead. See ' + 'https://github.com/open-mmlab/mmpose/pull/382' + ' for more information.', DeprecationWarning) + keypoint_head['loss_keypoint'] = loss_pose + + self.keypoint_head = builder.build_head(keypoint_head) + + + associate_keypoint_heads = [] + keypoint_heads_cnt = 1 + + if associate_keypoint_head is not None: + if not isinstance(associate_keypoint_head, list): + associate_keypoint_head = [associate_keypoint_head] + for single_keypoint_head in associate_keypoint_head: + single_keypoint_head['train_cfg'] = train_cfg + single_keypoint_head['test_cfg'] = test_cfg + associate_keypoint_heads.append(builder.build_head(single_keypoint_head)) + keypoint_heads_cnt += 1 + + self.associate_keypoint_heads = nn.ModuleList(associate_keypoint_heads) + + self.keypoint_heads_cnt = keypoint_heads_cnt + + self.init_weights(pretrained=pretrained) + + @property + def with_neck(self): + """Check if has neck.""" + return hasattr(self, 'neck') + + @property + def with_keypoint(self): + """Check if has keypoint_head.""" + return hasattr(self, 'keypoint_head') + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + self.backbone.init_weights(pretrained) + if self.with_neck: + self.neck.init_weights() + if self.with_keypoint: + self.keypoint_head.init_weights() + for item in self.associate_keypoint_heads: + item.init_weights() + + @auto_fp16(apply_to=('img', )) + def forward(self, + img, + target=None, + target_weight=None, + img_metas=None, + return_loss=True, + return_heatmap=False, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. Note this setting will change the expected inputs. + When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + + Note: + - batch_size: N + - num_keypoints: K + - num_img_channel: C (Default: 3) + - img height: imgH + - img width: imgW + - heatmaps height: H + - heatmaps weight: W + + Args: + img (torch.Tensor[NxCximgHximgW]): Input images. + target (torch.Tensor[NxKxHxW]): Target heatmaps. + target_weight (torch.Tensor[NxKx1]): Weights across + different joint types. 
+ img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + return_heatmap (bool) : Option to return heatmap. + + Returns: + dict|tuple: if `return loss` is true, then return losses. \ + Otherwise, return predicted poses, boxes, image paths \ + and heatmaps. + """ + if return_loss: + return self.forward_train(img, target, target_weight, img_metas, + **kwargs) + return self.forward_test( + img, img_metas, return_heatmap=return_heatmap, **kwargs) + + def forward_train(self, img, target, target_weight, img_metas, **kwargs): + """Defines the computation performed at every call when training.""" + + img_sources = torch.from_numpy(np.array([ele['dataset_idx'] for ele in img_metas])).to(img.device) + + output = self.backbone(img, img_sources) + if self.with_neck: + output = self.neck(output) + # if return loss + losses = dict() + + main_stream_select = (img_sources == 0) + # if torch.sum(main_stream_select) > 0: + output_select = self.keypoint_head(output) + + target_select = target * main_stream_select.view(-1, 1, 1, 1) + target_weight_select = target_weight * main_stream_select.view(-1, 1, 1) + + keypoint_losses = self.keypoint_head.get_loss( + output_select, target_select, target_weight_select) + losses['main_stream_loss'] = keypoint_losses['heatmap_loss'] + keypoint_accuracy = self.keypoint_head.get_accuracy( + output_select, target_select, target_weight_select) + losses['main_stream_acc'] = keypoint_accuracy['acc_pose'] + + for idx in range(1, self.keypoint_heads_cnt): + idx_select = (img_sources == idx) + target_select = target * idx_select.view(-1, 1, 1, 1) + target_weight_select = target_weight * idx_select.view(-1, 1, 1) + output_select = self.associate_keypoint_heads[idx - 1](output) + keypoint_losses = self.associate_keypoint_heads[idx - 1].get_loss( + output_select, target_select, target_weight_select) + losses[f'{idx}_loss'] = keypoint_losses['heatmap_loss'] + keypoint_accuracy = self.associate_keypoint_heads[idx - 1].get_accuracy( + output_select, target_select, target_weight_select) + losses[f'{idx}_acc'] = keypoint_accuracy['acc_pose'] + + return losses + + def forward_test(self, img, img_metas, return_heatmap=False, **kwargs): + """Defines the computation performed at every call when testing.""" + assert img.size(0) == len(img_metas) + batch_size, _, img_height, img_width = img.shape + if batch_size > 1: + assert 'bbox_id' in img_metas[0] + + result = {} + img_sources = torch.from_numpy(np.array([ele['dataset_idx'] for ele in img_metas])).to(img.device) + + features = self.backbone(img, img_sources) + + if self.with_neck: + features = self.neck(features) + if self.with_keypoint: + output_heatmap = self.keypoint_head.inference_model( + features, flip_pairs=None) + + if self.test_cfg.get('flip_test', True): + img_flipped = img.flip(3) + features_flipped = self.backbone(img_flipped, img_sources) + if self.with_neck: + features_flipped = self.neck(features_flipped) + if self.with_keypoint: + output_flipped_heatmap = self.keypoint_head.inference_model( + features_flipped, img_metas[0]['flip_pairs']) + output_heatmap = (output_heatmap + + output_flipped_heatmap) * 0.5 + + if self.with_keypoint: + keypoint_result = self.keypoint_head.decode( + img_metas, 
output_heatmap, img_size=[img_width, img_height]) + result.update(keypoint_result) + + if not return_heatmap: + output_heatmap = None + + result['output_heatmap'] = output_heatmap + + return result + + def forward_dummy(self, img): + """Used for computing network FLOPs. + + See ``tools/get_flops.py``. + + Args: + img (torch.Tensor): Input image. + + Returns: + Tensor: Output heatmaps. + """ + output = self.backbone(img) + if self.with_neck: + output = self.neck(output) + if self.with_keypoint: + output = self.keypoint_head(output) + return output + + @deprecated_api_warning({'pose_limb_color': 'pose_link_color'}, + cls_name='TopDown') + def show_result(self, + img, + result, + skeleton=None, + kpt_score_thr=0.3, + bbox_color='green', + pose_kpt_color=None, + pose_link_color=None, + text_color='white', + radius=4, + thickness=1, + font_scale=0.5, + bbox_thickness=1, + win_name='', + show=False, + show_keypoint_weight=False, + wait_time=0, + out_file=None): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (list[dict]): The results to draw over `img` + (bbox_result, pose_result). + skeleton (list[list]): The connection of keypoints. + skeleton is 0-based indexing. + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. + If None, do not draw keypoints. + pose_link_color (np.array[Mx3]): Color of M links. + If None, do not draw links. + text_color (str or tuple or :obj:`Color`): Color of texts. + radius (int): Radius of circles. + thickness (int): Thickness of lines. + font_scale (float): Font scales of texts. + win_name (str): The window name. + show (bool): Whether to show the image. Default: False. + show_keypoint_weight (bool): Whether to change the transparency + using the predicted confidence scores of keypoints. + wait_time (int): Value of waitKey param. + Default: 0. + out_file (str or None): The filename to write the image. + Default: None. + + Returns: + Tensor: Visualized img, only if not `show` or `out_file`. + """ + img = mmcv.imread(img) + img = img.copy() + + bbox_result = [] + bbox_labels = [] + pose_result = [] + for res in result: + if 'bbox' in res: + bbox_result.append(res['bbox']) + bbox_labels.append(res.get('label', None)) + pose_result.append(res['keypoints']) + + if bbox_result: + bboxes = np.vstack(bbox_result) + # draw bounding boxes + imshow_bboxes( + img, + bboxes, + labels=bbox_labels, + colors=bbox_color, + text_color=text_color, + thickness=bbox_thickness, + font_scale=font_scale, + show=False) + + if pose_result: + imshow_keypoints(img, pose_result, skeleton, kpt_score_thr, + pose_kpt_color, pose_link_color, radius, + thickness) + + if show: + imshow(img, win_name, wait_time) + + if out_file is not None: + imwrite(img, out_file) + + return img diff --git a/mmpose/models/heads/__init__.py b/mmpose/models/heads/__init__.py new file mode 100644 index 0000000..a98e911 --- /dev/null +++ b/mmpose/models/heads/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
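+# The heads below are registered in the ``HEADS`` registry and are normally
+# built from config dicts rather than constructed directly. A small
+# illustrative sketch (the channel numbers and the loss settings are
+# placeholders, not values used by this repository):
+#
+#     from mmpose.models import build_head
+#     head = build_head(dict(
+#         type='TopdownHeatmapSimpleHead',
+#         in_channels=32,
+#         out_channels=17,
+#         loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)))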
+from .ae_higher_resolution_head import AEHigherResolutionHead +from .ae_multi_stage_head import AEMultiStageHead +from .ae_simple_head import AESimpleHead +from .deconv_head import DeconvHead +from .deeppose_regression_head import DeepposeRegressionHead +from .hmr_head import HMRMeshHead +from .interhand_3d_head import Interhand3DHead +from .temporal_regression_head import TemporalRegressionHead +from .topdown_heatmap_base_head import TopdownHeatmapBaseHead +from .topdown_heatmap_multi_stage_head import (TopdownHeatmapMSMUHead, + TopdownHeatmapMultiStageHead) +from .topdown_heatmap_simple_head import TopdownHeatmapSimpleHead +from .vipnas_heatmap_simple_head import ViPNASHeatmapSimpleHead +from .voxelpose_head import CuboidCenterHead, CuboidPoseHead + +__all__ = [ + 'TopdownHeatmapSimpleHead', 'TopdownHeatmapMultiStageHead', + 'TopdownHeatmapMSMUHead', 'TopdownHeatmapBaseHead', + 'AEHigherResolutionHead', 'AESimpleHead', 'AEMultiStageHead', + 'DeepposeRegressionHead', 'TemporalRegressionHead', 'Interhand3DHead', + 'HMRMeshHead', 'DeconvHead', 'ViPNASHeatmapSimpleHead', 'CuboidCenterHead', + 'CuboidPoseHead' +] diff --git a/mmpose/models/heads/ae_higher_resolution_head.py b/mmpose/models/heads/ae_higher_resolution_head.py new file mode 100644 index 0000000..9bf3399 --- /dev/null +++ b/mmpose/models/heads/ae_higher_resolution_head.py @@ -0,0 +1,249 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_upsample_layer, constant_init, + normal_init) + +from mmpose.models.builder import build_loss +from ..backbones.resnet import BasicBlock +from ..builder import HEADS + + +@HEADS.register_module() +class AEHigherResolutionHead(nn.Module): + """Associative embedding with higher resolution head. paper ref: Bowen + Cheng et al. "HigherHRNet: Scale-Aware Representation Learning for Bottom- + Up Human Pose Estimation". + + Args: + in_channels (int): Number of input channels. + num_joints (int): Number of joints + tag_per_joint (bool): If tag_per_joint is True, + the dimension of tags equals to num_joints, + else the dimension of tags is 1. Default: True + extra (dict): Configs for extra conv layers. Default: None + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + If num_deconv_layers > 0, the length of + num_deconv_kernels (list|tuple): Kernel sizes. + cat_output (list[bool]): Option to concat outputs. + with_ae_loss (list[bool]): Option to use ae loss. + loss_keypoint (dict): Config for loss. Default: None. 
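+
+    The construction below is an illustrative sketch loosely following the
+    usual HigherHRNet-W32 settings; the channel numbers and the
+    ``MultiLossFactory`` hyper-parameters are placeholders.
+
+    Example::
+
+        import torch
+        from mmpose.models import AEHigherResolutionHead
+        head = AEHigherResolutionHead(
+            in_channels=32,
+            num_joints=17,
+            num_deconv_layers=1,
+            num_deconv_filters=(32, ),
+            num_deconv_kernels=(4, ),
+            cat_output=[True],
+            with_ae_loss=[True, False],
+            loss_keypoint=dict(
+                type='MultiLossFactory',
+                num_joints=17,
+                num_stages=2,
+                ae_loss_type='exp',
+                with_ae_loss=[True, False],
+                push_loss_factor=[0.001, 0.001],
+                pull_loss_factor=[0.001, 0.001],
+                with_heatmaps_loss=[True, True],
+                heatmaps_loss_factor=[1.0, 1.0]))
+        outs = head(torch.randn(1, 32, 128, 128))
+        # outs[0]: 17 heatmaps + 17 tag maps at the input resolution,
+        # outs[1]: 17 heatmaps at twice the input resolution
+        assert outs[0].shape == (1, 34, 128, 128)
+        assert outs[1].shape == (1, 17, 256, 256)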
+ """ + + def __init__(self, + in_channels, + num_joints, + tag_per_joint=True, + extra=None, + num_deconv_layers=1, + num_deconv_filters=(32, ), + num_deconv_kernels=(4, ), + num_basic_blocks=4, + cat_output=None, + with_ae_loss=None, + loss_keypoint=None): + super().__init__() + + self.loss = build_loss(loss_keypoint) + dim_tag = num_joints if tag_per_joint else 1 + + self.num_deconvs = num_deconv_layers + self.cat_output = cat_output + + final_layer_output_channels = [] + + if with_ae_loss[0]: + out_channels = num_joints + dim_tag + else: + out_channels = num_joints + + final_layer_output_channels.append(out_channels) + for i in range(num_deconv_layers): + if with_ae_loss[i + 1]: + out_channels = num_joints + dim_tag + else: + out_channels = num_joints + final_layer_output_channels.append(out_channels) + + deconv_layer_output_channels = [] + for i in range(num_deconv_layers): + if with_ae_loss[i]: + out_channels = num_joints + dim_tag + else: + out_channels = num_joints + deconv_layer_output_channels.append(out_channels) + + self.final_layers = self._make_final_layers( + in_channels, final_layer_output_channels, extra, num_deconv_layers, + num_deconv_filters) + self.deconv_layers = self._make_deconv_layers( + in_channels, deconv_layer_output_channels, num_deconv_layers, + num_deconv_filters, num_deconv_kernels, num_basic_blocks, + cat_output) + + @staticmethod + def _make_final_layers(in_channels, final_layer_output_channels, extra, + num_deconv_layers, num_deconv_filters): + """Make final layers.""" + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + else: + padding = 0 + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + final_layers = [] + final_layers.append( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=in_channels, + out_channels=final_layer_output_channels[0], + kernel_size=kernel_size, + stride=1, + padding=padding)) + + for i in range(num_deconv_layers): + in_channels = num_deconv_filters[i] + final_layers.append( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=in_channels, + out_channels=final_layer_output_channels[i + 1], + kernel_size=kernel_size, + stride=1, + padding=padding)) + + return nn.ModuleList(final_layers) + + def _make_deconv_layers(self, in_channels, deconv_layer_output_channels, + num_deconv_layers, num_deconv_filters, + num_deconv_kernels, num_basic_blocks, cat_output): + """Make deconv layers.""" + deconv_layers = [] + for i in range(num_deconv_layers): + if cat_output[i]: + in_channels += deconv_layer_output_channels[i] + + planes = num_deconv_filters[i] + deconv_kernel, padding, output_padding = \ + self._get_deconv_cfg(num_deconv_kernels[i]) + + layers = [] + layers.append( + nn.Sequential( + build_upsample_layer( + dict(type='deconv'), + in_channels=in_channels, + out_channels=planes, + kernel_size=deconv_kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False), nn.BatchNorm2d(planes, momentum=0.1), + nn.ReLU(inplace=True))) + for _ in range(num_basic_blocks): + layers.append(nn.Sequential(BasicBlock(planes, planes), )) + deconv_layers.append(nn.Sequential(*layers)) + in_channels = planes + + return nn.ModuleList(deconv_layers) + + @staticmethod + def _get_deconv_cfg(deconv_kernel): + """Get configurations for deconv layers.""" + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif 
deconv_kernel == 2: + padding = 0 + output_padding = 0 + else: + raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') + + return deconv_kernel, padding, output_padding + + def get_loss(self, outputs, targets, masks, joints): + """Calculate bottom-up keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - num_outputs: O + - heatmaps height: H + - heatmaps weight: W + + Args: + outputs (list(torch.Tensor[N,K,H,W])): Multi-scale output heatmaps. + targets (List(torch.Tensor[N,K,H,W])): Multi-scale target heatmaps. + masks (List(torch.Tensor[N,H,W])): Masks of multi-scale target + heatmaps + joints (List(torch.Tensor[N,M,K,2])): Joints of multi-scale target + heatmaps for ae loss + """ + + losses = dict() + + heatmaps_losses, push_losses, pull_losses = self.loss( + outputs, targets, masks, joints) + + for idx in range(len(targets)): + if heatmaps_losses[idx] is not None: + heatmaps_loss = heatmaps_losses[idx].mean(dim=0) + if 'heatmap_loss' not in losses: + losses['heatmap_loss'] = heatmaps_loss + else: + losses['heatmap_loss'] += heatmaps_loss + if push_losses[idx] is not None: + push_loss = push_losses[idx].mean(dim=0) + if 'push_loss' not in losses: + losses['push_loss'] = push_loss + else: + losses['push_loss'] += push_loss + if pull_losses[idx] is not None: + pull_loss = pull_losses[idx].mean(dim=0) + if 'pull_loss' not in losses: + losses['pull_loss'] = pull_loss + else: + losses['pull_loss'] += pull_loss + + return losses + + def forward(self, x): + """Forward function.""" + if isinstance(x, list): + x = x[0] + + final_outputs = [] + y = self.final_layers[0](x) + final_outputs.append(y) + + for i in range(self.num_deconvs): + if self.cat_output[i]: + x = torch.cat((x, y), 1) + + x = self.deconv_layers[i](x) + y = self.final_layers[i + 1](x) + final_outputs.append(y) + + return final_outputs + + def init_weights(self): + """Initialize model weights.""" + for _, m in self.deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for _, m in self.final_layers.named_modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) diff --git a/mmpose/models/heads/ae_multi_stage_head.py b/mmpose/models/heads/ae_multi_stage_head.py new file mode 100644 index 0000000..195666b --- /dev/null +++ b/mmpose/models/heads/ae_multi_stage_head.py @@ -0,0 +1,222 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_upsample_layer, constant_init, + normal_init) + +from mmpose.models.builder import build_loss +from ..builder import HEADS + + +@HEADS.register_module() +class AEMultiStageHead(nn.Module): + """Associative embedding multi-stage head. + paper ref: Alejandro Newell et al. "Associative + Embedding: End-to-end Learning for Joint Detection + and Grouping" + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + If num_deconv_layers > 0, the length of + num_deconv_kernels (list|tuple): Kernel sizes. + loss_keypoint (dict): Config for loss. Default: None. 
+ """ + + def __init__(self, + in_channels, + out_channels, + num_stages=1, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + extra=None, + loss_keypoint=None): + super().__init__() + + self.loss = build_loss(loss_keypoint) + + self.in_channels = in_channels + self.num_stages = num_stages + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + # build multi-stage deconv layers + self.multi_deconv_layers = nn.ModuleList([]) + for _ in range(self.num_stages): + if num_deconv_layers > 0: + deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + elif num_deconv_layers == 0: + deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should >= 0.') + self.multi_deconv_layers.append(deconv_layers) + + identity_final_layer = False + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [0, 1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + elif extra['final_conv_kernel'] == 1: + padding = 0 + else: + # 0 for Identity mapping. + identity_final_layer = True + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + # build multi-stage final layers + self.multi_final_layers = nn.ModuleList([]) + for i in range(self.num_stages): + if identity_final_layer: + final_layer = nn.Identity() + else: + final_layer = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=num_deconv_filters[-1] + if num_deconv_layers > 0 else in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding) + self.multi_final_layers.append(final_layer) + + def get_loss(self, output, targets, masks, joints): + """Calculate bottom-up keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (List(torch.Tensor[NxKxHxW])): Output heatmaps. + targets(List(List(torch.Tensor[NxKxHxW]))): + Multi-stage and multi-scale target heatmaps. + masks(List(List(torch.Tensor[NxHxW]))): + Masks of multi-stage and multi-scale target heatmaps + joints(List(List(torch.Tensor[NxMxKx2]))): + Joints of multi-stage multi-scale target heatmaps for ae loss + """ + + losses = dict() + + # Flatten list: + # [stage_1_scale_1, stage_1_scale_2, ... , stage_1_scale_m, + # ... + # stage_n_scale_1, stage_n_scale_2, ... , stage_n_scale_m] + targets = [target for _targets in targets for target in _targets] + masks = [mask for _masks in masks for mask in _masks] + joints = [joint for _joints in joints for joint in _joints] + + heatmaps_losses, push_losses, pull_losses = self.loss( + output, targets, masks, joints) + + for idx in range(len(targets)): + if heatmaps_losses[idx] is not None: + heatmaps_loss = heatmaps_losses[idx].mean(dim=0) + if 'heatmap_loss' not in losses: + losses['heatmap_loss'] = heatmaps_loss + else: + losses['heatmap_loss'] += heatmaps_loss + if push_losses[idx] is not None: + push_loss = push_losses[idx].mean(dim=0) + if 'push_loss' not in losses: + losses['push_loss'] = push_loss + else: + losses['push_loss'] += push_loss + if pull_losses[idx] is not None: + pull_loss = pull_losses[idx].mean(dim=0) + if 'pull_loss' not in losses: + losses['pull_loss'] = pull_loss + else: + losses['pull_loss'] += pull_loss + + return losses + + def forward(self, x): + """Forward function. + + Returns: + out (list[Tensor]): a list of heatmaps from multiple stages. 
+ """ + out = [] + assert isinstance(x, list) + for i in range(self.num_stages): + y = self.multi_deconv_layers[i](x[i]) + y = self.multi_final_layers[i](y) + out.append(y) + return out + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + """Make deconv layers.""" + if num_layers != len(num_filters): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_filters({len(num_filters)})' + raise ValueError(error_msg) + if num_layers != len(num_kernels): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_kernels({len(num_kernels)})' + raise ValueError(error_msg) + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=self.in_channels, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False)) + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + self.in_channels = planes + + return nn.Sequential(*layers) + + @staticmethod + def _get_deconv_cfg(deconv_kernel): + """Get configurations for deconv layers.""" + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + else: + raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') + + return deconv_kernel, padding, output_padding + + def init_weights(self): + """Initialize model weights.""" + for _, m in self.multi_deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for m in self.multi_final_layers.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) diff --git a/mmpose/models/heads/ae_simple_head.py b/mmpose/models/heads/ae_simple_head.py new file mode 100644 index 0000000..9297f71 --- /dev/null +++ b/mmpose/models/heads/ae_simple_head.py @@ -0,0 +1,99 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ..builder import HEADS +from .deconv_head import DeconvHead + + +@HEADS.register_module() +class AESimpleHead(DeconvHead): + """Associative embedding simple head. + paper ref: Alejandro Newell et al. "Associative + Embedding: End-to-end Learning for Joint Detection + and Grouping" + + Args: + in_channels (int): Number of input channels. + num_joints (int): Number of joints. + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + If num_deconv_layers > 0, the length of + num_deconv_kernels (list|tuple): Kernel sizes. + tag_per_joint (bool): If tag_per_joint is True, + the dimension of tags equals to num_joints, + else the dimension of tags is 1. Default: True + with_ae_loss (list[bool]): Option to use ae loss or not. + loss_keypoint (dict): Config for loss. Default: None. 
+ """ + + def __init__(self, + in_channels, + num_joints, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + tag_per_joint=True, + with_ae_loss=None, + extra=None, + loss_keypoint=None): + + dim_tag = num_joints if tag_per_joint else 1 + if with_ae_loss[0]: + out_channels = num_joints + dim_tag + else: + out_channels = num_joints + + super().__init__( + in_channels, + out_channels, + num_deconv_layers=num_deconv_layers, + num_deconv_filters=num_deconv_filters, + num_deconv_kernels=num_deconv_kernels, + extra=extra, + loss_keypoint=loss_keypoint) + + def get_loss(self, outputs, targets, masks, joints): + """Calculate bottom-up keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - num_outputs: O + - heatmaps height: H + - heatmaps weight: W + + Args: + outputs (list(torch.Tensor[N,K,H,W])): Multi-scale output heatmaps. + targets (List(torch.Tensor[N,K,H,W])): Multi-scale target heatmaps. + masks (List(torch.Tensor[N,H,W])): Masks of multi-scale target + heatmaps + joints(List(torch.Tensor[N,M,K,2])): Joints of multi-scale target + heatmaps for ae loss + """ + + losses = dict() + + heatmaps_losses, push_losses, pull_losses = self.loss( + outputs, targets, masks, joints) + + for idx in range(len(targets)): + if heatmaps_losses[idx] is not None: + heatmaps_loss = heatmaps_losses[idx].mean(dim=0) + if 'heatmap_loss' not in losses: + losses['heatmap_loss'] = heatmaps_loss + else: + losses['heatmap_loss'] += heatmaps_loss + if push_losses[idx] is not None: + push_loss = push_losses[idx].mean(dim=0) + if 'push_loss' not in losses: + losses['push_loss'] = push_loss + else: + losses['push_loss'] += push_loss + if pull_losses[idx] is not None: + pull_loss = pull_losses[idx].mean(dim=0) + if 'pull_loss' not in losses: + losses['pull_loss'] = pull_loss + else: + losses['pull_loss'] += pull_loss + + return losses diff --git a/mmpose/models/heads/deconv_head.py b/mmpose/models/heads/deconv_head.py new file mode 100644 index 0000000..90846d2 --- /dev/null +++ b/mmpose/models/heads/deconv_head.py @@ -0,0 +1,295 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer, + constant_init, normal_init) + +from mmpose.models.builder import HEADS, build_loss +from mmpose.models.utils.ops import resize + + +@HEADS.register_module() +class DeconvHead(nn.Module): + """Simple deconv head. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + If num_deconv_layers > 0, the length of + num_deconv_kernels (list|tuple): Kernel sizes. + in_index (int|Sequence[int]): Input feature index. Default: 0 + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + Default: None. + + - 'resize_concat': Multiple feature maps will be resized to the + same size as the first one and then concat together. + Usually used in FCN head of HRNet. + - 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + - None: Only one select feature map is allowed. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + loss_keypoint (dict): Config for loss. Default: None. 
+ """ + + def __init__(self, + in_channels=3, + out_channels=17, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + extra=None, + in_index=0, + input_transform=None, + align_corners=False, + loss_keypoint=None): + super().__init__() + + self.in_channels = in_channels + self.loss = build_loss(loss_keypoint) + + self._init_inputs(in_channels, in_index, input_transform) + self.in_index = in_index + self.align_corners = align_corners + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + if num_deconv_layers > 0: + self.deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + elif num_deconv_layers == 0: + self.deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should >= 0.') + + identity_final_layer = False + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [0, 1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + elif extra['final_conv_kernel'] == 1: + padding = 0 + else: + # 0 for Identity mapping. + identity_final_layer = True + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + if identity_final_layer: + self.final_layer = nn.Identity() + else: + conv_channels = num_deconv_filters[ + -1] if num_deconv_layers > 0 else self.in_channels + + layers = [] + if extra is not None: + num_conv_layers = extra.get('num_conv_layers', 0) + num_conv_kernels = extra.get('num_conv_kernels', + [1] * num_conv_layers) + + for i in range(num_conv_layers): + layers.append( + build_conv_layer( + dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=conv_channels, + kernel_size=num_conv_kernels[i], + stride=1, + padding=(num_conv_kernels[i] - 1) // 2)) + layers.append( + build_norm_layer(dict(type='BN'), conv_channels)[1]) + layers.append(nn.ReLU(inplace=True)) + + layers.append( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding)) + + if len(layers) > 1: + self.final_layer = nn.Sequential(*layers) + else: + self.final_layer = layers[0] + + def _init_inputs(self, in_channels, in_index, input_transform): + """Check and initialize input transforms. + + The in_channels, in_index and input_transform must match. + Specifically, when input_transform is None, only single feature map + will be selected. So in_channels and in_index must be of type int. + When input_transform is not None, in_channels and in_index must be + list or tuple, with the same length. + + Args: + in_channels (int|Sequence[int]): Input channels. + in_index (int|Sequence[int]): Input feature index. + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + + - 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + - 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + - None: Only one select feature map is allowed. 
+ """ + + if input_transform is not None: + assert input_transform in ['resize_concat', 'multiple_select'] + self.input_transform = input_transform + self.in_index = in_index + if input_transform is not None: + assert isinstance(in_channels, (list, tuple)) + assert isinstance(in_index, (list, tuple)) + assert len(in_channels) == len(in_index) + if input_transform == 'resize_concat': + self.in_channels = sum(in_channels) + else: + self.in_channels = in_channels + else: + assert isinstance(in_channels, int) + assert isinstance(in_index, int) + self.in_channels = in_channels + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + + Args: + inputs (list[Tensor] | Tensor): multi-level img features. + + Returns: + Tensor: The transformed inputs + """ + if not isinstance(inputs, list): + return inputs + + if self.input_transform == 'resize_concat': + inputs = [inputs[i] for i in self.in_index] + upsampled_inputs = [ + resize( + input=x, + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) for x in inputs + ] + inputs = torch.cat(upsampled_inputs, dim=1) + elif self.input_transform == 'multiple_select': + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index] + + return inputs + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + """Make deconv layers.""" + if num_layers != len(num_filters): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_filters({len(num_filters)})' + raise ValueError(error_msg) + if num_layers != len(num_kernels): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_kernels({len(num_kernels)})' + raise ValueError(error_msg) + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=self.in_channels, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False)) + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + self.in_channels = planes + + return nn.Sequential(*layers) + + @staticmethod + def _get_deconv_cfg(deconv_kernel): + """Get configurations for deconv layers.""" + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + else: + raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') + + return deconv_kernel, padding, output_padding + + def get_loss(self, outputs, targets, masks): + """Calculate bottom-up masked mse loss. + + Note: + - batch_size: N + - num_channels: C + - heatmaps height: H + - heatmaps weight: W + + Args: + outputs (List(torch.Tensor[N,C,H,W])): Multi-scale outputs. + targets (List(torch.Tensor[N,C,H,W])): Multi-scale targets. + masks (List(torch.Tensor[N,H,W])): Masks of multi-scale targets. 
+ """ + + losses = dict() + + for idx in range(len(targets)): + if 'loss' not in losses: + losses['loss'] = self.loss(outputs[idx], targets[idx], + masks[idx]) + else: + losses['loss'] += self.loss(outputs[idx], targets[idx], + masks[idx]) + + return losses + + def forward(self, x): + """Forward function.""" + x = self._transform_inputs(x) + final_outputs = [] + x = self.deconv_layers(x) + y = self.final_layer(x) + final_outputs.append(y) + return final_outputs + + def init_weights(self): + """Initialize model weights.""" + for _, m in self.deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for m in self.final_layer.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) diff --git a/mmpose/models/heads/deeppose_regression_head.py b/mmpose/models/heads/deeppose_regression_head.py new file mode 100644 index 0000000..f326e26 --- /dev/null +++ b/mmpose/models/heads/deeppose_regression_head.py @@ -0,0 +1,176 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch.nn as nn +from mmcv.cnn import normal_init + +from mmpose.core.evaluation import (keypoint_pck_accuracy, + keypoints_from_regression) +from mmpose.core.post_processing import fliplr_regression +from mmpose.models.builder import HEADS, build_loss + + +@HEADS.register_module() +class DeepposeRegressionHead(nn.Module): + """Deeppose regression head with fully connected layers. + + "DeepPose: Human Pose Estimation via Deep Neural Networks". + + Args: + in_channels (int): Number of input channels + num_joints (int): Number of joints + loss_keypoint (dict): Config for keypoint loss. Default: None. + """ + + def __init__(self, + in_channels, + num_joints, + loss_keypoint=None, + train_cfg=None, + test_cfg=None): + super().__init__() + + self.in_channels = in_channels + self.num_joints = num_joints + + self.loss = build_loss(loss_keypoint) + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + + self.fc = nn.Linear(self.in_channels, self.num_joints * 2) + + def forward(self, x): + """Forward function.""" + output = self.fc(x) + N, C = output.shape + return output.reshape([N, C // 2, 2]) + + def get_loss(self, output, target, target_weight): + """Calculate top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + output (torch.Tensor[N, K, 2]): Output keypoints. + target (torch.Tensor[N, K, 2]): Target keypoints. + target_weight (torch.Tensor[N, K, 2]): + Weights across different joint types. + """ + + losses = dict() + assert not isinstance(self.loss, nn.Sequential) + assert target.dim() == 3 and target_weight.dim() == 3 + losses['reg_loss'] = self.loss(output, target, target_weight) + + return losses + + def get_accuracy(self, output, target, target_weight): + """Calculate accuracy for top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + output (torch.Tensor[N, K, 2]): Output keypoints. + target (torch.Tensor[N, K, 2]): Target keypoints. + target_weight (torch.Tensor[N, K, 2]): + Weights across different joint types. 
+ """ + + accuracy = dict() + + N = output.shape[0] + + _, avg_acc, cnt = keypoint_pck_accuracy( + output.detach().cpu().numpy(), + target.detach().cpu().numpy(), + target_weight[:, :, 0].detach().cpu().numpy() > 0, + thr=0.05, + normalize=np.ones((N, 2), dtype=np.float32)) + accuracy['acc_pose'] = avg_acc + + return accuracy + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_regression (np.ndarray): Output regression. + + Args: + x (torch.Tensor[N, K, 2]): Input features. + flip_pairs (None | list[tuple()): + Pairs of keypoints which are mirrored. + """ + output = self.forward(x) + + if flip_pairs is not None: + output_regression = fliplr_regression( + output.detach().cpu().numpy(), flip_pairs) + else: + output_regression = output.detach().cpu().numpy() + return output_regression + + def decode(self, img_metas, output, **kwargs): + """Decode the keypoints from output regression. + + Args: + img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + output (np.ndarray[N, K, 2]): predicted regression vector. + kwargs: dict contains 'img_size'. + img_size (tuple(img_width, img_height)): input image size. + """ + batch_size = len(img_metas) + + if 'bbox_id' in img_metas[0]: + bbox_ids = [] + else: + bbox_ids = None + + c = np.zeros((batch_size, 2), dtype=np.float32) + s = np.zeros((batch_size, 2), dtype=np.float32) + image_paths = [] + score = np.ones(batch_size) + for i in range(batch_size): + c[i, :] = img_metas[i]['center'] + s[i, :] = img_metas[i]['scale'] + image_paths.append(img_metas[i]['image_file']) + + if 'bbox_score' in img_metas[i]: + score[i] = np.array(img_metas[i]['bbox_score']).reshape(-1) + if bbox_ids is not None: + bbox_ids.append(img_metas[i]['bbox_id']) + + preds, maxvals = keypoints_from_regression(output, c, s, + kwargs['img_size']) + + all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) + all_boxes = np.zeros((batch_size, 6), dtype=np.float32) + all_preds[:, :, 0:2] = preds[:, :, 0:2] + all_preds[:, :, 2:3] = maxvals + all_boxes[:, 0:2] = c[:, 0:2] + all_boxes[:, 2:4] = s[:, 0:2] + all_boxes[:, 4] = np.prod(s * 200.0, axis=1) + all_boxes[:, 5] = score + + result = {} + + result['preds'] = all_preds + result['boxes'] = all_boxes + result['image_paths'] = image_paths + result['bbox_ids'] = bbox_ids + + return result + + def init_weights(self): + normal_init(self.fc, mean=0, std=0.01, bias=0) diff --git a/mmpose/models/heads/hmr_head.py b/mmpose/models/heads/hmr_head.py new file mode 100644 index 0000000..015a307 --- /dev/null +++ b/mmpose/models/heads/hmr_head.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import xavier_init + +from ..builder import HEADS +from ..utils.geometry import rot6d_to_rotmat + + +@HEADS.register_module() +class HMRMeshHead(nn.Module): + """SMPL parameters regressor head of simple baseline. "End-to-end Recovery + of Human Shape and Pose", CVPR'2018. 
+ + Args: + in_channels (int): Number of input channels + smpl_mean_params (str): The file name of the mean SMPL parameters + n_iter (int): The iterations of estimating delta parameters + """ + + def __init__(self, in_channels, smpl_mean_params=None, n_iter=3): + super().__init__() + + self.in_channels = in_channels + self.n_iter = n_iter + + npose = 24 * 6 + nbeta = 10 + ncam = 3 + hidden_dim = 1024 + + self.fc1 = nn.Linear(in_channels + npose + nbeta + ncam, hidden_dim) + self.drop1 = nn.Dropout() + self.fc2 = nn.Linear(hidden_dim, hidden_dim) + self.drop2 = nn.Dropout() + self.decpose = nn.Linear(hidden_dim, npose) + self.decshape = nn.Linear(hidden_dim, nbeta) + self.deccam = nn.Linear(hidden_dim, ncam) + + # Load mean SMPL parameters + if smpl_mean_params is None: + init_pose = torch.zeros([1, npose]) + init_shape = torch.zeros([1, nbeta]) + init_cam = torch.FloatTensor([[1, 0, 0]]) + else: + mean_params = np.load(smpl_mean_params) + init_pose = torch.from_numpy( + mean_params['pose'][:]).unsqueeze(0).float() + init_shape = torch.from_numpy( + mean_params['shape'][:]).unsqueeze(0).float() + init_cam = torch.from_numpy( + mean_params['cam']).unsqueeze(0).float() + self.register_buffer('init_pose', init_pose) + self.register_buffer('init_shape', init_shape) + self.register_buffer('init_cam', init_cam) + + def forward(self, x): + """Forward function. + + x is the image feature map and is expected to be in shape (batch size x + channel number x height x width) + """ + batch_size = x.shape[0] + # extract the global feature vector by average along + # spatial dimension. + x = x.mean(dim=-1).mean(dim=-1) + + init_pose = self.init_pose.expand(batch_size, -1) + init_shape = self.init_shape.expand(batch_size, -1) + init_cam = self.init_cam.expand(batch_size, -1) + + pred_pose = init_pose + pred_shape = init_shape + pred_cam = init_cam + for _ in range(self.n_iter): + xc = torch.cat([x, pred_pose, pred_shape, pred_cam], 1) + xc = self.fc1(xc) + xc = self.drop1(xc) + xc = self.fc2(xc) + xc = self.drop2(xc) + pred_pose = self.decpose(xc) + pred_pose + pred_shape = self.decshape(xc) + pred_shape + pred_cam = self.deccam(xc) + pred_cam + + pred_rotmat = rot6d_to_rotmat(pred_pose).view(batch_size, 24, 3, 3) + out = (pred_rotmat, pred_shape, pred_cam) + return out + + def init_weights(self): + """Initialize model weights.""" + xavier_init(self.decpose, gain=0.01) + xavier_init(self.decshape, gain=0.01) + xavier_init(self.deccam, gain=0.01) diff --git a/mmpose/models/heads/interhand_3d_head.py b/mmpose/models/heads/interhand_3d_head.py new file mode 100644 index 0000000..aebe4a5 --- /dev/null +++ b/mmpose/models/heads/interhand_3d_head.py @@ -0,0 +1,521 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer, + constant_init, normal_init) + +from mmpose.core.evaluation.top_down_eval import ( + keypoints_from_heatmaps3d, multilabel_classification_accuracy) +from mmpose.core.post_processing import flip_back +from mmpose.models.builder import build_loss +from mmpose.models.necks import GlobalAveragePooling +from ..builder import HEADS + + +class Heatmap3DHead(nn.Module): + """Heatmap3DHead is a sub-module of Interhand3DHead, and outputs 3D + heatmaps. Heatmap3DHead is composed of (>=0) number of deconv layers and a + simple conv2d layer. 
+ + Args: + in_channels (int): Number of input channels + out_channels (int): Number of output channels + depth_size (int): Number of depth discretization size + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + num_deconv_kernels (list|tuple): Kernel sizes. + extra (dict): Configs for extra conv layers. Default: None + """ + + def __init__(self, + in_channels, + out_channels, + depth_size=64, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + extra=None): + + super().__init__() + + assert out_channels % depth_size == 0 + self.depth_size = depth_size + self.in_channels = in_channels + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + if num_deconv_layers > 0: + self.deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + elif num_deconv_layers == 0: + self.deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should >= 0.') + + identity_final_layer = False + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [0, 1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + elif extra['final_conv_kernel'] == 1: + padding = 0 + else: + # 0 for Identity mapping. + identity_final_layer = True + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + if identity_final_layer: + self.final_layer = nn.Identity() + else: + conv_channels = num_deconv_filters[ + -1] if num_deconv_layers > 0 else self.in_channels + + layers = [] + if extra is not None: + num_conv_layers = extra.get('num_conv_layers', 0) + num_conv_kernels = extra.get('num_conv_kernels', + [1] * num_conv_layers) + + for i in range(num_conv_layers): + layers.append( + build_conv_layer( + dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=conv_channels, + kernel_size=num_conv_kernels[i], + stride=1, + padding=(num_conv_kernels[i] - 1) // 2)) + layers.append( + build_norm_layer(dict(type='BN'), conv_channels)[1]) + layers.append(nn.ReLU(inplace=True)) + + layers.append( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding)) + + if len(layers) > 1: + self.final_layer = nn.Sequential(*layers) + else: + self.final_layer = layers[0] + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + """Make deconv layers.""" + if num_layers != len(num_filters): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_filters({len(num_filters)})' + raise ValueError(error_msg) + if num_layers != len(num_kernels): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_kernels({len(num_kernels)})' + raise ValueError(error_msg) + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=self.in_channels, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False)) + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + self.in_channels = planes + + return nn.Sequential(*layers) + + @staticmethod + def _get_deconv_cfg(deconv_kernel): + """Get 
configurations for deconv layers.""" + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + else: + raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') + + return deconv_kernel, padding, output_padding + + def forward(self, x): + """Forward function.""" + x = self.deconv_layers(x) + x = self.final_layer(x) + N, C, H, W = x.shape + # reshape the 2D heatmap to 3D heatmap + x = x.reshape(N, C // self.depth_size, self.depth_size, H, W) + return x + + def init_weights(self): + """Initialize model weights.""" + for _, m in self.deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for m in self.final_layer.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + + +class Heatmap1DHead(nn.Module): + """Heatmap1DHead is a sub-module of Interhand3DHead, and outputs 1D + heatmaps. + + Args: + in_channels (int): Number of input channels + heatmap_size (int): Heatmap size + hidden_dims (list|tuple): Number of feature dimension of FC layers. + """ + + def __init__(self, in_channels=2048, heatmap_size=64, hidden_dims=(512, )): + super().__init__() + + self.in_channels = in_channels + self.heatmap_size = heatmap_size + + feature_dims = [in_channels, *hidden_dims, heatmap_size] + self.fc = self._make_linear_layers(feature_dims, relu_final=False) + + def soft_argmax_1d(self, heatmap1d): + heatmap1d = F.softmax(heatmap1d, 1) + accu = heatmap1d * torch.arange( + self.heatmap_size, dtype=heatmap1d.dtype, + device=heatmap1d.device)[None, :] + coord = accu.sum(dim=1) + return coord + + def _make_linear_layers(self, feat_dims, relu_final=False): + """Make linear layers.""" + layers = [] + for i in range(len(feat_dims) - 1): + layers.append(nn.Linear(feat_dims[i], feat_dims[i + 1])) + if i < len(feat_dims) - 2 or \ + (i == len(feat_dims) - 2 and relu_final): + layers.append(nn.ReLU(inplace=True)) + return nn.Sequential(*layers) + + def forward(self, x): + """Forward function.""" + heatmap1d = self.fc(x) + value = self.soft_argmax_1d(heatmap1d).view(-1, 1) + return value + + def init_weights(self): + """Initialize model weights.""" + for m in self.fc.modules(): + if isinstance(m, nn.Linear): + normal_init(m, mean=0, std=0.01, bias=0) + + +class MultilabelClassificationHead(nn.Module): + """MultilabelClassificationHead is a sub-module of Interhand3DHead, and + outputs hand type classification. + + Args: + in_channels (int): Number of input channels + num_labels (int): Number of labels + hidden_dims (list|tuple): Number of hidden dimension of FC layers. 
+ """ + + def __init__(self, in_channels=2048, num_labels=2, hidden_dims=(512, )): + super().__init__() + + self.in_channels = in_channels + self.num_labesl = num_labels + + feature_dims = [in_channels, *hidden_dims, num_labels] + self.fc = self._make_linear_layers(feature_dims, relu_final=False) + + def _make_linear_layers(self, feat_dims, relu_final=False): + """Make linear layers.""" + layers = [] + for i in range(len(feat_dims) - 1): + layers.append(nn.Linear(feat_dims[i], feat_dims[i + 1])) + if i < len(feat_dims) - 2 or \ + (i == len(feat_dims) - 2 and relu_final): + layers.append(nn.ReLU(inplace=True)) + return nn.Sequential(*layers) + + def forward(self, x): + """Forward function.""" + labels = torch.sigmoid(self.fc(x)) + return labels + + def init_weights(self): + for m in self.fc.modules(): + if isinstance(m, nn.Linear): + normal_init(m, mean=0, std=0.01, bias=0) + + +@HEADS.register_module() +class Interhand3DHead(nn.Module): + """Interhand 3D head of paper ref: Gyeongsik Moon. "InterHand2.6M: A + Dataset and Baseline for 3D Interacting Hand Pose Estimation from a Single + RGB Image". + + Args: + keypoint_head_cfg (dict): Configs of Heatmap3DHead for hand + keypoint estimation. + root_head_cfg (dict): Configs of Heatmap1DHead for relative + hand root depth estimation. + hand_type_head_cfg (dict): Configs of MultilabelClassificationHead + for hand type classification. + loss_keypoint (dict): Config for keypoint loss. Default: None. + loss_root_depth (dict): Config for relative root depth loss. + Default: None. + loss_hand_type (dict): Config for hand type classification + loss. Default: None. + """ + + def __init__(self, + keypoint_head_cfg, + root_head_cfg, + hand_type_head_cfg, + loss_keypoint=None, + loss_root_depth=None, + loss_hand_type=None, + train_cfg=None, + test_cfg=None): + super().__init__() + + # build sub-module heads + self.right_hand_head = Heatmap3DHead(**keypoint_head_cfg) + self.left_hand_head = Heatmap3DHead(**keypoint_head_cfg) + self.root_head = Heatmap1DHead(**root_head_cfg) + self.hand_type_head = MultilabelClassificationHead( + **hand_type_head_cfg) + self.neck = GlobalAveragePooling() + + # build losses + self.keypoint_loss = build_loss(loss_keypoint) + self.root_depth_loss = build_loss(loss_root_depth) + self.hand_type_loss = build_loss(loss_hand_type) + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap') + + def init_weights(self): + self.left_hand_head.init_weights() + self.right_hand_head.init_weights() + self.root_head.init_weights() + self.hand_type_head.init_weights() + + def get_loss(self, output, target, target_weight): + """Calculate loss for hand keypoint heatmaps, relative root depth and + hand type. + + Args: + output (list[Tensor]): a list of outputs from multiple heads. + target (list[Tensor]): a list of targets for multiple heads. + target_weight (list[Tensor]): a list of targets weight for + multiple heads. 
+ """ + losses = dict() + + # hand keypoint loss + assert not isinstance(self.keypoint_loss, nn.Sequential) + out, tar, tar_weight = output[0], target[0], target_weight[0] + assert tar.dim() == 5 and tar_weight.dim() == 3 + losses['hand_loss'] = self.keypoint_loss(out, tar, tar_weight) + + # relative root depth loss + assert not isinstance(self.root_depth_loss, nn.Sequential) + out, tar, tar_weight = output[1], target[1], target_weight[1] + assert tar.dim() == 2 and tar_weight.dim() == 2 + losses['rel_root_loss'] = self.root_depth_loss(out, tar, tar_weight) + + # hand type loss + assert not isinstance(self.hand_type_loss, nn.Sequential) + out, tar, tar_weight = output[2], target[2], target_weight[2] + assert tar.dim() == 2 and tar_weight.dim() in [1, 2] + losses['hand_type_loss'] = self.hand_type_loss(out, tar, tar_weight) + + return losses + + def get_accuracy(self, output, target, target_weight): + """Calculate accuracy for hand type. + + Args: + output (list[Tensor]): a list of outputs from multiple heads. + target (list[Tensor]): a list of targets for multiple heads. + target_weight (list[Tensor]): a list of targets weight for + multiple heads. + """ + accuracy = dict() + avg_acc = multilabel_classification_accuracy( + output[2].detach().cpu().numpy(), + target[2].detach().cpu().numpy(), + target_weight[2].detach().cpu().numpy(), + ) + accuracy['acc_classification'] = float(avg_acc) + return accuracy + + def forward(self, x): + """Forward function.""" + outputs = [] + outputs.append( + torch.cat([self.right_hand_head(x), + self.left_hand_head(x)], dim=1)) + x = self.neck(x) + outputs.append(self.root_head(x)) + outputs.append(self.hand_type_head(x)) + return outputs + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output (list[np.ndarray]): list of output hand keypoint + heatmaps, relative root depth and hand type. + + Args: + x (torch.Tensor[N,K,H,W]): Input features. + flip_pairs (None | list[tuple()): + Pairs of keypoints which are mirrored. + """ + + output = self.forward(x) + + if flip_pairs is not None: + # flip 3D heatmap + heatmap_3d = output[0] + N, K, D, H, W = heatmap_3d.shape + # reshape 3D heatmap to 2D heatmap + heatmap_3d = heatmap_3d.reshape(N, K * D, H, W) + # 2D heatmap flip + heatmap_3d_flipped_back = flip_back( + heatmap_3d.detach().cpu().numpy(), + flip_pairs, + target_type=self.target_type) + # reshape back to 3D heatmap + heatmap_3d_flipped_back = heatmap_3d_flipped_back.reshape( + N, K, D, H, W) + # feature is not aligned, shift flipped heatmap for higher accuracy + if self.test_cfg.get('shift_heatmap', False): + heatmap_3d_flipped_back[..., + 1:] = heatmap_3d_flipped_back[..., :-1] + output[0] = heatmap_3d_flipped_back + + # flip relative hand root depth + output[1] = -output[1].detach().cpu().numpy() + + # flip hand type + hand_type = output[2].detach().cpu().numpy() + hand_type_flipped_back = hand_type.copy() + hand_type_flipped_back[:, 0] = hand_type[:, 1] + hand_type_flipped_back[:, 1] = hand_type[:, 0] + output[2] = hand_type_flipped_back + else: + output = [out.detach().cpu().numpy() for out in output] + + return output + + def decode(self, img_metas, output, **kwargs): + """Decode hand keypoint, relative root depth and hand type. 
+ + Args: + img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + - "heatmap3d_depth_bound": depth bound of hand keypoint + 3D heatmap + - "root_depth_bound": depth bound of relative root depth + 1D heatmap + output (list[np.ndarray]): model predicted 3D heatmaps, relative + root depth and hand type. + """ + + batch_size = len(img_metas) + result = {} + + heatmap3d_depth_bound = np.ones(batch_size, dtype=np.float32) + root_depth_bound = np.ones(batch_size, dtype=np.float32) + center = np.zeros((batch_size, 2), dtype=np.float32) + scale = np.zeros((batch_size, 2), dtype=np.float32) + image_paths = [] + score = np.ones(batch_size, dtype=np.float32) + if 'bbox_id' in img_metas[0]: + bbox_ids = [] + else: + bbox_ids = None + + for i in range(batch_size): + heatmap3d_depth_bound[i] = img_metas[i]['heatmap3d_depth_bound'] + root_depth_bound[i] = img_metas[i]['root_depth_bound'] + center[i, :] = img_metas[i]['center'] + scale[i, :] = img_metas[i]['scale'] + image_paths.append(img_metas[i]['image_file']) + + if 'bbox_score' in img_metas[i]: + score[i] = np.array(img_metas[i]['bbox_score']).reshape(-1) + if bbox_ids is not None: + bbox_ids.append(img_metas[i]['bbox_id']) + + all_boxes = np.zeros((batch_size, 6), dtype=np.float32) + all_boxes[:, 0:2] = center[:, 0:2] + all_boxes[:, 2:4] = scale[:, 0:2] + # scale is defined as: bbox_size / 200.0, so we + # need multiply 200.0 to get bbox size + all_boxes[:, 4] = np.prod(scale * 200.0, axis=1) + all_boxes[:, 5] = score + result['boxes'] = all_boxes + result['image_paths'] = image_paths + result['bbox_ids'] = bbox_ids + + # decode 3D heatmaps of hand keypoints + heatmap3d = output[0] + preds, maxvals = keypoints_from_heatmaps3d(heatmap3d, center, scale) + keypoints_3d = np.zeros((batch_size, preds.shape[1], 4), + dtype=np.float32) + keypoints_3d[:, :, 0:3] = preds[:, :, 0:3] + keypoints_3d[:, :, 3:4] = maxvals + # transform keypoint depth to camera space + keypoints_3d[:, :, 2] = \ + (keypoints_3d[:, :, 2] / self.right_hand_head.depth_size - 0.5) \ + * heatmap3d_depth_bound[:, np.newaxis] + + result['preds'] = keypoints_3d + + # decode relative hand root depth + # transform relative root depth to camera space + result['rel_root_depth'] = (output[1] / self.root_head.heatmap_size - + 0.5) * root_depth_bound + + # decode hand type + result['hand_type'] = output[2] > 0.5 + return result diff --git a/mmpose/models/heads/temporal_regression_head.py b/mmpose/models/heads/temporal_regression_head.py new file mode 100644 index 0000000..97a07f9 --- /dev/null +++ b/mmpose/models/heads/temporal_regression_head.py @@ -0,0 +1,319 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch.nn as nn +from mmcv.cnn import build_conv_layer, constant_init, kaiming_init +from mmcv.utils.parrots_wrapper import _BatchNorm + +from mmpose.core import (WeightNormClipHook, compute_similarity_transform, + fliplr_regression) +from mmpose.models.builder import HEADS, build_loss + + +@HEADS.register_module() +class TemporalRegressionHead(nn.Module): + """Regression head of VideoPose3D. + + "3D human pose estimation in video with temporal convolutions and + semi-supervised training", CVPR'2019. + + Args: + in_channels (int): Number of input channels + num_joints (int): Number of joints + loss_keypoint (dict): Config for keypoint loss. Default: None. 
+ max_norm (float|None): if not None, the weight of convolution layers + will be clipped to have a maximum norm of max_norm. + is_trajectory (bool): If the model only predicts root joint + position, then this arg should be set to True. In this case, + traj_loss will be calculated. Otherwise, it should be set to + False. Default: False. + """ + + def __init__(self, + in_channels, + num_joints, + max_norm=None, + loss_keypoint=None, + is_trajectory=False, + train_cfg=None, + test_cfg=None): + super().__init__() + + self.in_channels = in_channels + self.num_joints = num_joints + self.max_norm = max_norm + self.loss = build_loss(loss_keypoint) + self.is_trajectory = is_trajectory + if self.is_trajectory: + assert self.num_joints == 1 + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + + self.conv = build_conv_layer( + dict(type='Conv1d'), in_channels, num_joints * 3, 1) + + if self.max_norm is not None: + # Apply weight norm clip to conv layers + weight_clip = WeightNormClipHook(self.max_norm) + for module in self.modules(): + if isinstance(module, nn.modules.conv._ConvNd): + weight_clip.register(module) + + @staticmethod + def _transform_inputs(x): + """Transform inputs for decoder. + + Args: + inputs (tuple or list of Tensor | Tensor): multi-level features. + + Returns: + Tensor: The transformed inputs + """ + if not isinstance(x, (list, tuple)): + return x + + assert len(x) > 0 + + # return the top-level feature of the 1D feature pyramid + return x[-1] + + def forward(self, x): + """Forward function.""" + x = self._transform_inputs(x) + + assert x.ndim == 3 and x.shape[2] == 1, f'Invalid shape {x.shape}' + output = self.conv(x) + N = output.shape[0] + return output.reshape(N, self.num_joints, 3) + + def get_loss(self, output, target, target_weight): + """Calculate keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + output (torch.Tensor[N, K, 3]): Output keypoints. + target (torch.Tensor[N, K, 3]): Target keypoints. + target_weight (torch.Tensor[N, K, 3]): + Weights across different joint types. + If self.is_trajectory is True and target_weight is None, + target_weight will be set inversely proportional to joint + depth. + """ + losses = dict() + assert not isinstance(self.loss, nn.Sequential) + + # trajectory model + if self.is_trajectory: + if target.dim() == 2: + target.unsqueeze_(1) + + if target_weight is None: + target_weight = (1 / target[:, :, 2:]).expand(target.shape) + assert target.dim() == 3 and target_weight.dim() == 3 + + losses['traj_loss'] = self.loss(output, target, target_weight) + + # pose model + else: + if target_weight is None: + target_weight = target.new_ones(target.shape) + assert target.dim() == 3 and target_weight.dim() == 3 + losses['reg_loss'] = self.loss(output, target, target_weight) + + return losses + + def get_accuracy(self, output, target, target_weight, metas): + """Calculate accuracy for keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + output (torch.Tensor[N, K, 3]): Output keypoints. + target (torch.Tensor[N, K, 3]): Target keypoints. + target_weight (torch.Tensor[N, K, 3]): + Weights across different joint types. + metas (list(dict)): Information about data augmentation including: + + - target_image_path (str): Optional, path to the image file + - target_mean (float): Optional, normalization parameter of + the target pose. + - target_std (float): Optional, normalization parameter of the + target pose. 
+ - root_position (np.ndarray[3,1]): Optional, global + position of the root joint. + - root_index (torch.ndarray[1,]): Optional, original index of + the root joint before root-centering. + """ + + accuracy = dict() + + N = output.shape[0] + output_ = output.detach().cpu().numpy() + target_ = target.detach().cpu().numpy() + # Denormalize the predicted pose + if 'target_mean' in metas[0] and 'target_std' in metas[0]: + target_mean = np.stack([m['target_mean'] for m in metas]) + target_std = np.stack([m['target_std'] for m in metas]) + output_ = self._denormalize_joints(output_, target_mean, + target_std) + target_ = self._denormalize_joints(target_, target_mean, + target_std) + + # Restore global position + if self.test_cfg.get('restore_global_position', False): + root_pos = np.stack([m['root_position'] for m in metas]) + root_idx = metas[0].get('root_position_index', None) + output_ = self._restore_global_position(output_, root_pos, + root_idx) + target_ = self._restore_global_position(target_, root_pos, + root_idx) + # Get target weight + if target_weight is None: + target_weight_ = np.ones_like(target_) + else: + target_weight_ = target_weight.detach().cpu().numpy() + if self.test_cfg.get('restore_global_position', False): + root_idx = metas[0].get('root_position_index', None) + root_weight = metas[0].get('root_joint_weight', 1.0) + target_weight_ = self._restore_root_target_weight( + target_weight_, root_weight, root_idx) + + mpjpe = np.mean( + np.linalg.norm((output_ - target_) * target_weight_, axis=-1)) + + transformed_output = np.zeros_like(output_) + for i in range(N): + transformed_output[i, :, :] = compute_similarity_transform( + output_[i, :, :], target_[i, :, :]) + p_mpjpe = np.mean( + np.linalg.norm( + (transformed_output - target_) * target_weight_, axis=-1)) + + accuracy['mpjpe'] = output.new_tensor(mpjpe) + accuracy['p_mpjpe'] = output.new_tensor(p_mpjpe) + + return accuracy + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_regression (np.ndarray): Output regression. + + Args: + x (torch.Tensor[N, K, 2]): Input features. + flip_pairs (None | list[tuple()): + Pairs of keypoints which are mirrored. + """ + output = self.forward(x) + + if flip_pairs is not None: + output_regression = fliplr_regression( + output.detach().cpu().numpy(), + flip_pairs, + center_mode='static', + center_x=0) + else: + output_regression = output.detach().cpu().numpy() + return output_regression + + def decode(self, metas, output): + """Decode the keypoints from output regression. + + Args: + metas (list(dict)): Information about data augmentation. + By default this includes: + + - "target_image_path": path to the image file + output (np.ndarray[N, K, 3]): predicted regression vector. + metas (list(dict)): Information about data augmentation including: + + - target_image_path (str): Optional, path to the image file + - target_mean (float): Optional, normalization parameter of + the target pose. + - target_std (float): Optional, normalization parameter of the + target pose. + - root_position (np.ndarray[3,1]): Optional, global + position of the root joint. + - root_index (torch.ndarray[1,]): Optional, original index of + the root joint before root-centering. 
+ """ + + # Denormalize the predicted pose + if 'target_mean' in metas[0] and 'target_std' in metas[0]: + target_mean = np.stack([m['target_mean'] for m in metas]) + target_std = np.stack([m['target_std'] for m in metas]) + output = self._denormalize_joints(output, target_mean, target_std) + + # Restore global position + if self.test_cfg.get('restore_global_position', False): + root_pos = np.stack([m['root_position'] for m in metas]) + root_idx = metas[0].get('root_position_index', None) + output = self._restore_global_position(output, root_pos, root_idx) + + target_image_paths = [m.get('target_image_path', None) for m in metas] + result = {'preds': output, 'target_image_paths': target_image_paths} + + return result + + @staticmethod + def _denormalize_joints(x, mean, std): + """Denormalize joint coordinates with given statistics mean and std. + + Args: + x (np.ndarray[N, K, 3]): Normalized joint coordinates. + mean (np.ndarray[K, 3]): Mean value. + std (np.ndarray[K, 3]): Std value. + """ + assert x.ndim == 3 + assert x.shape == mean.shape == std.shape + + return x * std + mean + + @staticmethod + def _restore_global_position(x, root_pos, root_idx=None): + """Restore global position of the root-centered joints. + + Args: + x (np.ndarray[N, K, 3]): root-centered joint coordinates + root_pos (np.ndarray[N,1,3]): The global position of the + root joint. + root_idx (int|None): If not none, the root joint will be inserted + back to the pose at the given index. + """ + x = x + root_pos + if root_idx is not None: + x = np.insert(x, root_idx, root_pos.squeeze(1), axis=1) + return x + + @staticmethod + def _restore_root_target_weight(target_weight, root_weight, root_idx=None): + """Restore the target weight of the root joint after the restoration of + the global position. + + Args: + target_weight (np.ndarray[N, K, 1]): Target weight of relativized + joints. + root_weight (float): The target weight value of the root joint. + root_idx (int|None): If not none, the root joint weight will be + inserted back to the target weight at the given index. + """ + if root_idx is not None: + root_weight = np.full( + target_weight.shape[0], root_weight, dtype=target_weight.dtype) + target_weight = np.insert( + target_weight, root_idx, root_weight[:, None], axis=1) + return target_weight + + def init_weights(self): + """Initialize the weights.""" + for m in self.modules(): + if isinstance(m, nn.modules.conv._ConvNd): + kaiming_init(m, mode='fan_in', nonlinearity='relu') + elif isinstance(m, _BatchNorm): + constant_init(m, 1) diff --git a/mmpose/models/heads/topdown_heatmap_base_head.py b/mmpose/models/heads/topdown_heatmap_base_head.py new file mode 100644 index 0000000..09646ea --- /dev/null +++ b/mmpose/models/heads/topdown_heatmap_base_head.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + +import numpy as np +import torch.nn as nn + +from mmpose.core.evaluation.top_down_eval import keypoints_from_heatmaps + + +class TopdownHeatmapBaseHead(nn.Module): + """Base class for top-down heatmap heads. + + All top-down heatmap heads should subclass it. + All subclass should overwrite: + + Methods:`get_loss`, supporting to calculate loss. + Methods:`get_accuracy`, supporting to calculate accuracy. + Methods:`forward`, supporting to forward model. + Methods:`inference_model`, supporting to inference model. 
+ """ + + __metaclass__ = ABCMeta + + @abstractmethod + def get_loss(self, **kwargs): + """Gets the loss.""" + + @abstractmethod + def get_accuracy(self, **kwargs): + """Gets the accuracy.""" + + @abstractmethod + def forward(self, **kwargs): + """Forward function.""" + + @abstractmethod + def inference_model(self, **kwargs): + """Inference function.""" + + def decode(self, img_metas, output, **kwargs): + """Decode keypoints from heatmaps. + + Args: + img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + output (np.ndarray[N, K, H, W]): model predicted heatmaps. + """ + batch_size = len(img_metas) + + if 'bbox_id' in img_metas[0]: + bbox_ids = [] + else: + bbox_ids = None + + c = np.zeros((batch_size, 2), dtype=np.float32) + s = np.zeros((batch_size, 2), dtype=np.float32) + image_paths = [] + score = np.ones(batch_size) + for i in range(batch_size): + c[i, :] = img_metas[i]['center'] + s[i, :] = img_metas[i]['scale'] + image_paths.append(img_metas[i]['image_file']) + + if 'bbox_score' in img_metas[i]: + score[i] = np.array(img_metas[i]['bbox_score']).reshape(-1) + if bbox_ids is not None: + bbox_ids.append(img_metas[i]['bbox_id']) + + preds, maxvals = keypoints_from_heatmaps( + output, + c, + s, + unbiased=self.test_cfg.get('unbiased_decoding', False), + post_process=self.test_cfg.get('post_process', 'default'), + kernel=self.test_cfg.get('modulate_kernel', 11), + valid_radius_factor=self.test_cfg.get('valid_radius_factor', + 0.0546875), + use_udp=self.test_cfg.get('use_udp', False), + target_type=self.test_cfg.get('target_type', 'GaussianHeatmap')) + + all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) + all_boxes = np.zeros((batch_size, 6), dtype=np.float32) + all_preds[:, :, 0:2] = preds[:, :, 0:2] + all_preds[:, :, 2:3] = maxvals + all_boxes[:, 0:2] = c[:, 0:2] + all_boxes[:, 2:4] = s[:, 0:2] + all_boxes[:, 4] = np.prod(s * 200.0, axis=1) + all_boxes[:, 5] = score + + result = {} + + result['preds'] = all_preds + result['boxes'] = all_boxes + result['image_paths'] = image_paths + result['bbox_ids'] = bbox_ids + + return result + + @staticmethod + def _get_deconv_cfg(deconv_kernel): + """Get configurations for deconv layers.""" + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + else: + raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') + + return deconv_kernel, padding, output_padding diff --git a/mmpose/models/heads/topdown_heatmap_multi_stage_head.py b/mmpose/models/heads/topdown_heatmap_multi_stage_head.py new file mode 100644 index 0000000..c439f5b --- /dev/null +++ b/mmpose/models/heads/topdown_heatmap_multi_stage_head.py @@ -0,0 +1,572 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy as cp + +import torch.nn as nn +from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, Linear, + build_activation_layer, build_conv_layer, + build_norm_layer, build_upsample_layer, constant_init, + kaiming_init, normal_init) + +from mmpose.core.evaluation import pose_pck_accuracy +from mmpose.core.post_processing import flip_back +from mmpose.models.builder import build_loss +from ..builder import HEADS +from .topdown_heatmap_base_head import TopdownHeatmapBaseHead + + +@HEADS.register_module() +class TopdownHeatmapMultiStageHead(TopdownHeatmapBaseHead): + """Top-down heatmap multi-stage head. + + TopdownHeatmapMultiStageHead is consisted of multiple branches, + each of which has num_deconv_layers(>=0) number of deconv layers + and a simple conv2d layer. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + num_stages (int): Number of stages. + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + If num_deconv_layers > 0, the length of + num_deconv_kernels (list|tuple): Kernel sizes. + loss_keypoint (dict): Config for keypoint loss. Default: None. + """ + + def __init__(self, + in_channels=512, + out_channels=17, + num_stages=1, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + extra=None, + loss_keypoint=None, + train_cfg=None, + test_cfg=None): + super().__init__() + + self.in_channels = in_channels + self.num_stages = num_stages + self.loss = build_loss(loss_keypoint) + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap') + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + # build multi-stage deconv layers + self.multi_deconv_layers = nn.ModuleList([]) + for _ in range(self.num_stages): + if num_deconv_layers > 0: + deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + elif num_deconv_layers == 0: + deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should >= 0.') + self.multi_deconv_layers.append(deconv_layers) + + identity_final_layer = False + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [0, 1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + elif extra['final_conv_kernel'] == 1: + padding = 0 + else: + # 0 for Identity mapping. + identity_final_layer = True + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + # build multi-stage final layers + self.multi_final_layers = nn.ModuleList([]) + for i in range(self.num_stages): + if identity_final_layer: + final_layer = nn.Identity() + else: + final_layer = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=num_deconv_filters[-1] + if num_deconv_layers > 0 else in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding) + self.multi_final_layers.append(final_layer) + + def get_loss(self, output, target, target_weight): + """Calculate top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - num_outputs: O + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): + Output heatmaps. 
+ target (torch.Tensor[N,K,H,W]): + Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. + """ + + losses = dict() + + assert isinstance(output, list) + assert target.dim() == 4 and target_weight.dim() == 3 + + if isinstance(self.loss, nn.Sequential): + assert len(self.loss) == len(output) + for i in range(len(output)): + target_i = target + target_weight_i = target_weight + if isinstance(self.loss, nn.Sequential): + loss_func = self.loss[i] + else: + loss_func = self.loss + loss_i = loss_func(output[i], target_i, target_weight_i) + if 'heatmap_loss' not in losses: + losses['heatmap_loss'] = loss_i + else: + losses['heatmap_loss'] += loss_i + + return losses + + def get_accuracy(self, output, target, target_weight): + """Calculate accuracy for top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): Output heatmaps. + target (torch.Tensor[N,K,H,W]): Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. + """ + + accuracy = dict() + + if self.target_type == 'GaussianHeatmap': + _, avg_acc, _ = pose_pck_accuracy( + output[-1].detach().cpu().numpy(), + target.detach().cpu().numpy(), + target_weight.detach().cpu().numpy().squeeze(-1) > 0) + accuracy['acc_pose'] = float(avg_acc) + + return accuracy + + def forward(self, x): + """Forward function. + + Returns: + out (list[Tensor]): a list of heatmaps from multiple stages. + """ + out = [] + assert isinstance(x, list) + for i in range(self.num_stages): + y = self.multi_deconv_layers[i](x[i]) + y = self.multi_final_layers[i](y) + out.append(y) + return out + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_heatmap (np.ndarray): Output heatmaps. + + Args: + x (List[torch.Tensor[NxKxHxW]]): Input features. + flip_pairs (None | list[tuple()): + Pairs of keypoints which are mirrored. 
+ """ + output = self.forward(x) + assert isinstance(output, list) + output = output[-1] + + if flip_pairs is not None: + # perform flip + output_heatmap = flip_back( + output.detach().cpu().numpy(), + flip_pairs, + target_type=self.target_type) + # feature is not aligned, shift flipped heatmap for higher accuracy + if self.test_cfg.get('shift_heatmap', False): + output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] + else: + output_heatmap = output.detach().cpu().numpy() + + return output_heatmap + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + """Make deconv layers.""" + if num_layers != len(num_filters): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_filters({len(num_filters)})' + raise ValueError(error_msg) + if num_layers != len(num_kernels): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_kernels({len(num_kernels)})' + raise ValueError(error_msg) + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=self.in_channels, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False)) + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + self.in_channels = planes + + return nn.Sequential(*layers) + + def init_weights(self): + """Initialize model weights.""" + for _, m in self.multi_deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for m in self.multi_final_layers.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) + + +class PredictHeatmap(nn.Module): + """Predict the heat map for an input feature. + + Args: + unit_channels (int): Number of input channels. + out_channels (int): Number of output channels. + out_shape (tuple): Shape of the output heatmap. + use_prm (bool): Whether to use pose refine machine. Default: False. + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + """ + + def __init__(self, + unit_channels, + out_channels, + out_shape, + use_prm=False, + norm_cfg=dict(type='BN')): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.unit_channels = unit_channels + self.out_channels = out_channels + self.out_shape = out_shape + self.use_prm = use_prm + if use_prm: + self.prm = PRM(out_channels, norm_cfg=norm_cfg) + self.conv_layers = nn.Sequential( + ConvModule( + unit_channels, + unit_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=norm_cfg, + inplace=False), + ConvModule( + unit_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=None, + inplace=False)) + + def forward(self, feature): + feature = self.conv_layers(feature) + output = nn.functional.interpolate( + feature, size=self.out_shape, mode='bilinear', align_corners=True) + if self.use_prm: + output = self.prm(output) + return output + + +class PRM(nn.Module): + """Pose Refine Machine. + + Please refer to "Learning Delicate Local Representations + for Multi-Person Pose Estimation" (ECCV 2020). + + Args: + out_channels (int): Channel number of the output. Equals to + the number of key points. + norm_cfg (dict): dictionary to construct and config norm layer. 
+ Default: dict(type='BN') + """ + + def __init__(self, out_channels, norm_cfg=dict(type='BN')): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.out_channels = out_channels + self.global_pooling = nn.AdaptiveAvgPool2d((1, 1)) + self.middle_path = nn.Sequential( + Linear(self.out_channels, self.out_channels), + build_norm_layer(dict(type='BN1d'), out_channels)[1], + build_activation_layer(dict(type='ReLU')), + Linear(self.out_channels, self.out_channels), + build_norm_layer(dict(type='BN1d'), out_channels)[1], + build_activation_layer(dict(type='ReLU')), + build_activation_layer(dict(type='Sigmoid'))) + + self.bottom_path = nn.Sequential( + ConvModule( + self.out_channels, + self.out_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=norm_cfg, + inplace=False), + DepthwiseSeparableConvModule( + self.out_channels, + 1, + kernel_size=9, + stride=1, + padding=4, + norm_cfg=norm_cfg, + inplace=False), build_activation_layer(dict(type='Sigmoid'))) + self.conv_bn_relu_prm_1 = ConvModule( + self.out_channels, + self.out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + inplace=False) + + def forward(self, x): + out = self.conv_bn_relu_prm_1(x) + out_1 = out + + out_2 = self.global_pooling(out_1) + out_2 = out_2.view(out_2.size(0), -1) + out_2 = self.middle_path(out_2) + out_2 = out_2.unsqueeze(2) + out_2 = out_2.unsqueeze(3) + + out_3 = self.bottom_path(out_1) + out = out_1 * (1 + out_2 * out_3) + + return out + + +@HEADS.register_module() +class TopdownHeatmapMSMUHead(TopdownHeatmapBaseHead): + """Heads for multi-stage multi-unit heads used in Multi-Stage Pose + estimation Network (MSPN), and Residual Steps Networks (RSN). + + Args: + unit_channels (int): Number of input channels. + out_channels (int): Number of output channels. + out_shape (tuple): Shape of the output heatmap. + num_stages (int): Number of stages. + num_units (int): Number of units in each stage. + use_prm (bool): Whether to use pose refine machine (PRM). + Default: False. + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + loss_keypoint (dict): Config for keypoint loss. Default: None. + """ + + def __init__(self, + out_shape, + unit_channels=256, + out_channels=17, + num_stages=4, + num_units=4, + use_prm=False, + norm_cfg=dict(type='BN'), + loss_keypoint=None, + train_cfg=None, + test_cfg=None): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap') + + self.out_shape = out_shape + self.unit_channels = unit_channels + self.out_channels = out_channels + self.num_stages = num_stages + self.num_units = num_units + + self.loss = build_loss(loss_keypoint) + + self.predict_layers = nn.ModuleList([]) + for i in range(self.num_stages): + for j in range(self.num_units): + self.predict_layers.append( + PredictHeatmap( + unit_channels, + out_channels, + out_shape, + use_prm, + norm_cfg=norm_cfg)) + + def get_loss(self, output, target, target_weight): + """Calculate top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - num_outputs: O + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,O,K,H,W]): Output heatmaps. + target (torch.Tensor[N,O,K,H,W]): Target heatmaps. 
+ target_weight (torch.Tensor[N,O,K,1]): + Weights across different joint types. + """ + + losses = dict() + + assert isinstance(output, list) + assert target.dim() == 5 and target_weight.dim() == 4 + assert target.size(1) == len(output) + + if isinstance(self.loss, nn.Sequential): + assert len(self.loss) == len(output) + for i in range(len(output)): + target_i = target[:, i, :, :, :] + target_weight_i = target_weight[:, i, :, :] + + if isinstance(self.loss, nn.Sequential): + loss_func = self.loss[i] + else: + loss_func = self.loss + + loss_i = loss_func(output[i], target_i, target_weight_i) + if 'heatmap_loss' not in losses: + losses['heatmap_loss'] = loss_i + else: + losses['heatmap_loss'] += loss_i + + return losses + + def get_accuracy(self, output, target, target_weight): + """Calculate accuracy for top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): Output heatmaps. + target (torch.Tensor[N,K,H,W]): Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. + """ + + accuracy = dict() + + if self.target_type == 'GaussianHeatmap': + assert isinstance(output, list) + assert target.dim() == 5 and target_weight.dim() == 4 + _, avg_acc, _ = pose_pck_accuracy( + output[-1].detach().cpu().numpy(), + target[:, -1, ...].detach().cpu().numpy(), + target_weight[:, -1, + ...].detach().cpu().numpy().squeeze(-1) > 0) + accuracy['acc_pose'] = float(avg_acc) + + return accuracy + + def forward(self, x): + """Forward function. + + Returns: + out (list[Tensor]): a list of heatmaps from multiple stages + and units. + """ + out = [] + assert isinstance(x, list) + assert len(x) == self.num_stages + assert isinstance(x[0], list) + assert len(x[0]) == self.num_units + assert x[0][0].shape[1] == self.unit_channels + for i in range(self.num_stages): + for j in range(self.num_units): + y = self.predict_layers[i * self.num_units + j](x[i][j]) + out.append(y) + + return out + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_heatmap (np.ndarray): Output heatmaps. + + Args: + x (list[torch.Tensor[N,K,H,W]]): Input features. + flip_pairs (None | list[tuple]): + Pairs of keypoints which are mirrored. + """ + output = self.forward(x) + assert isinstance(output, list) + output = output[-1] + if flip_pairs is not None: + output_heatmap = flip_back( + output.detach().cpu().numpy(), + flip_pairs, + target_type=self.target_type) + # feature is not aligned, shift flipped heatmap for higher accuracy + if self.test_cfg.get('shift_heatmap', False): + output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] + else: + output_heatmap = output.detach().cpu().numpy() + return output_heatmap + + def init_weights(self): + """Initialize model weights.""" + for m in self.predict_layers.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + elif isinstance(m, nn.Linear): + normal_init(m, std=0.01) diff --git a/mmpose/models/heads/topdown_heatmap_simple_head.py b/mmpose/models/heads/topdown_heatmap_simple_head.py new file mode 100644 index 0000000..72f3348 --- /dev/null +++ b/mmpose/models/heads/topdown_heatmap_simple_head.py @@ -0,0 +1,350 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
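Several inference_model implementations in this patch share the same flip-test post-processing: mirror the prediction made on the horizontally flipped image back to the original frame, swap left/right keypoint channels, and, when shift_heatmap is enabled, shift the result one pixel to the right to compensate for feature misalignment. The following standalone NumPy sketch, not part of the patch, is a rough equivalent for Gaussian heatmaps; the batch shape and the single flip pair are illustrative assumptions, and the final flip-test averaging is normally done by the calling detector rather than by the head.

    import numpy as np

    heatmaps = np.random.rand(1, 2, 4, 4).astype(np.float32)          # prediction on the original image (N, K, H, W)
    heatmaps_flipped = np.random.rand(1, 2, 4, 4).astype(np.float32)  # prediction on the flipped image
    flip_pairs = [(0, 1)]  # toy left/right keypoint pair

    flipped_back = heatmaps_flipped[..., ::-1].copy()  # mirror back along the width axis
    for left, right in flip_pairs:
        flipped_back[:, [left, right]] = flipped_back[:, [right, left]]  # swap mirrored keypoints

    flipped_back[..., 1:] = flipped_back[..., :-1]  # shift_heatmap: shift one column right

    fused = 0.5 * (heatmaps + flipped_back)  # flip-test averaging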
+import torch +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer, + constant_init, normal_init) + +from mmpose.core.evaluation import pose_pck_accuracy +from mmpose.core.post_processing import flip_back +from mmpose.models.builder import build_loss +from mmpose.models.utils.ops import resize +from ..builder import HEADS +import torch.nn.functional as F +from .topdown_heatmap_base_head import TopdownHeatmapBaseHead + + +@HEADS.register_module() +class TopdownHeatmapSimpleHead(TopdownHeatmapBaseHead): + """Top-down heatmap simple head. paper ref: Bin Xiao et al. ``Simple + Baselines for Human Pose Estimation and Tracking``. + + TopdownHeatmapSimpleHead is consisted of (>=0) number of deconv layers + and a simple conv2d layer. + + Args: + in_channels (int): Number of input channels + out_channels (int): Number of output channels + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + If num_deconv_layers > 0, the length of + num_deconv_kernels (list|tuple): Kernel sizes. + in_index (int|Sequence[int]): Input feature index. Default: 0 + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + Default: None. + + - 'resize_concat': Multiple feature maps will be resized to the + same size as the first one and then concat together. + Usually used in FCN head of HRNet. + - 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + - None: Only one select feature map is allowed. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + loss_keypoint (dict): Config for keypoint loss. Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + extra=None, + in_index=0, + input_transform=None, + align_corners=False, + loss_keypoint=None, + train_cfg=None, + test_cfg=None, + upsample=0,): + super().__init__() + + self.in_channels = in_channels + self.loss = build_loss(loss_keypoint) + self.upsample = upsample + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap') + + self._init_inputs(in_channels, in_index, input_transform) + self.in_index = in_index + self.align_corners = align_corners + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + if num_deconv_layers > 0: + self.deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + elif num_deconv_layers == 0: + self.deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should >= 0.') + + identity_final_layer = False + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [0, 1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + elif extra['final_conv_kernel'] == 1: + padding = 0 + else: + # 0 for Identity mapping. 
+ identity_final_layer = True + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + if identity_final_layer: + self.final_layer = nn.Identity() + else: + conv_channels = num_deconv_filters[ + -1] if num_deconv_layers > 0 else self.in_channels + + layers = [] + if extra is not None: + num_conv_layers = extra.get('num_conv_layers', 0) + num_conv_kernels = extra.get('num_conv_kernels', + [1] * num_conv_layers) + + for i in range(num_conv_layers): + layers.append( + build_conv_layer( + dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=conv_channels, + kernel_size=num_conv_kernels[i], + stride=1, + padding=(num_conv_kernels[i] - 1) // 2)) + layers.append( + build_norm_layer(dict(type='BN'), conv_channels)[1]) + layers.append(nn.ReLU(inplace=True)) + + layers.append( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding)) + + if len(layers) > 1: + self.final_layer = nn.Sequential(*layers) + else: + self.final_layer = layers[0] + + def get_loss(self, output, target, target_weight): + """Calculate top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): Output heatmaps. + target (torch.Tensor[N,K,H,W]): Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. + """ + + losses = dict() + + assert not isinstance(self.loss, nn.Sequential) + assert target.dim() == 4 and target_weight.dim() == 3 + losses['heatmap_loss'] = self.loss(output, target, target_weight) + + return losses + + def get_accuracy(self, output, target, target_weight): + """Calculate accuracy for top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): Output heatmaps. + target (torch.Tensor[N,K,H,W]): Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. + """ + + accuracy = dict() + + if self.target_type == 'GaussianHeatmap': + _, avg_acc, _ = pose_pck_accuracy( + output.detach().cpu().numpy(), + target.detach().cpu().numpy(), + target_weight.detach().cpu().numpy().squeeze(-1) > 0) + accuracy['acc_pose'] = float(avg_acc) + + return accuracy + + def forward(self, x): + """Forward function.""" + x = self._transform_inputs(x) + x = self.deconv_layers(x) + x = self.final_layer(x) + return x + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_heatmap (np.ndarray): Output heatmaps. + + Args: + x (torch.Tensor[N,K,H,W]): Input features. + flip_pairs (None | list[tuple]): + Pairs of keypoints which are mirrored. + """ + output = self.forward(x) + + if flip_pairs is not None: + output_heatmap = flip_back( + output.detach().cpu().numpy(), + flip_pairs, + target_type=self.target_type) + # feature is not aligned, shift flipped heatmap for higher accuracy + if self.test_cfg.get('shift_heatmap', False): + output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] + else: + output_heatmap = output.detach().cpu().numpy() + return output_heatmap + + def _init_inputs(self, in_channels, in_index, input_transform): + """Check and initialize input transforms. + + The in_channels, in_index and input_transform must match. + Specifically, when input_transform is None, only single feature map + will be selected. So in_channels and in_index must be of type int. 
+        When input_transform is not None, in_channels and in_index must be
+        list or tuple, with the same length.
+
+        Args:
+            in_channels (int|Sequence[int]): Input channels.
+            in_index (int|Sequence[int]): Input feature index.
+            input_transform (str|None): Transformation type of input features.
+                Options: 'resize_concat', 'multiple_select', None.
+
+                - 'resize_concat': Multiple feature maps will be resized to
+                    the same size as the first one and then concatenated
+                    together. Usually used in FCN head of HRNet.
+                - 'multiple_select': Multiple feature maps will be bundled
+                    into a list and passed into the decode head.
+                - None: Only one select feature map is allowed.
+        """
+
+        if input_transform is not None:
+            assert input_transform in ['resize_concat', 'multiple_select']
+        self.input_transform = input_transform
+        self.in_index = in_index
+        if input_transform is not None:
+            assert isinstance(in_channels, (list, tuple))
+            assert isinstance(in_index, (list, tuple))
+            assert len(in_channels) == len(in_index)
+            if input_transform == 'resize_concat':
+                self.in_channels = sum(in_channels)
+            else:
+                self.in_channels = in_channels
+        else:
+            assert isinstance(in_channels, int)
+            assert isinstance(in_index, int)
+            self.in_channels = in_channels
+
+    def _transform_inputs(self, inputs):
+        """Transform inputs for decoder.
+
+        Args:
+            inputs (list[Tensor] | Tensor): multi-level img features.
+
+        Returns:
+            Tensor: The transformed inputs
+        """
+        if not isinstance(inputs, list):
+            # A single feature map: optionally upsample it before decoding.
+            if self.upsample > 0:
+                inputs = resize(
+                    input=F.relu(inputs),
+                    scale_factor=self.upsample,
+                    mode='bilinear',
+                    align_corners=self.align_corners)
+            return inputs
+
+        if self.input_transform == 'resize_concat':
+            inputs = [inputs[i] for i in self.in_index]
+            upsampled_inputs = [
+                resize(
+                    input=x,
+                    size=inputs[0].shape[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners) for x in inputs
+            ]
+            inputs = torch.cat(upsampled_inputs, dim=1)
+        elif self.input_transform == 'multiple_select':
+            inputs = [inputs[i] for i in self.in_index]
+        else:
+            inputs = inputs[self.in_index]
+
+        return inputs
+
+    def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
+        """Make deconv layers."""
+        if num_layers != len(num_filters):
+            error_msg = f'num_layers({num_layers}) ' \
+                f'!= length of num_filters({len(num_filters)})'
+            raise ValueError(error_msg)
+        if num_layers != len(num_kernels):
+            error_msg = f'num_layers({num_layers}) ' \
+                f'!= length of num_kernels({len(num_kernels)})'
+            raise ValueError(error_msg)
+
+        layers = []
+        for i in range(num_layers):
+            kernel, padding, output_padding = \
+                self._get_deconv_cfg(num_kernels[i])
+
+            planes = num_filters[i]
+            layers.append(
+                build_upsample_layer(
+                    dict(type='deconv'),
+                    in_channels=self.in_channels,
+                    out_channels=planes,
+                    kernel_size=kernel,
+                    stride=2,
+                    padding=padding,
+                    output_padding=output_padding,
+                    bias=False))
+            layers.append(nn.BatchNorm2d(planes))
+            layers.append(nn.ReLU(inplace=True))
+            self.in_channels = planes
+
+        return nn.Sequential(*layers)
+
+    def init_weights(self):
+        """Initialize model weights."""
+        for _, m in self.deconv_layers.named_modules():
+            if isinstance(m, nn.ConvTranspose2d):
+                normal_init(m, std=0.001)
+            elif isinstance(m, nn.BatchNorm2d):
+                constant_init(m, 1)
+        for m in self.final_layer.modules():
+            if isinstance(m, nn.Conv2d):
+                normal_init(m, std=0.001, bias=0)
+            elif isinstance(m, nn.BatchNorm2d):
+                constant_init(m, 1)
diff --git a/mmpose/models/heads/vipnas_heatmap_simple_head.py
b/mmpose/models/heads/vipnas_heatmap_simple_head.py new file mode 100644 index 0000000..4170312 --- /dev/null +++ b/mmpose/models/heads/vipnas_heatmap_simple_head.py @@ -0,0 +1,349 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer, + constant_init, normal_init) + +from mmpose.core.evaluation import pose_pck_accuracy +from mmpose.core.post_processing import flip_back +from mmpose.models.builder import build_loss +from mmpose.models.utils.ops import resize +from ..builder import HEADS +from .topdown_heatmap_base_head import TopdownHeatmapBaseHead + + +@HEADS.register_module() +class ViPNASHeatmapSimpleHead(TopdownHeatmapBaseHead): + """ViPNAS heatmap simple head. + + ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search. + More details can be found in the `paper + `__ . + + TopdownHeatmapSimpleHead is consisted of (>=0) number of deconv layers + and a simple conv2d layer. + + Args: + in_channels (int): Number of input channels + out_channels (int): Number of output channels + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + If num_deconv_layers > 0, the length of + num_deconv_kernels (list|tuple): Kernel sizes. + num_deconv_groups (list|tuple): Group number. + in_index (int|Sequence[int]): Input feature index. Default: -1 + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + Default: None. + + - 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + - 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + - None: Only one select feature map is allowed. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + loss_keypoint (dict): Config for keypoint loss. Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + num_deconv_layers=3, + num_deconv_filters=(144, 144, 144), + num_deconv_kernels=(4, 4, 4), + num_deconv_groups=(16, 16, 16), + extra=None, + in_index=0, + input_transform=None, + align_corners=False, + loss_keypoint=None, + train_cfg=None, + test_cfg=None): + super().__init__() + + self.in_channels = in_channels + self.loss = build_loss(loss_keypoint) + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap') + + self._init_inputs(in_channels, in_index, input_transform) + self.in_index = in_index + self.align_corners = align_corners + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + if num_deconv_layers > 0: + self.deconv_layers = self._make_deconv_layer( + num_deconv_layers, num_deconv_filters, num_deconv_kernels, + num_deconv_groups) + elif num_deconv_layers == 0: + self.deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should >= 0.') + + identity_final_layer = False + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [0, 1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + elif extra['final_conv_kernel'] == 1: + padding = 0 + else: + # 0 for Identity mapping. 
+ identity_final_layer = True + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + if identity_final_layer: + self.final_layer = nn.Identity() + else: + conv_channels = num_deconv_filters[ + -1] if num_deconv_layers > 0 else self.in_channels + + layers = [] + if extra is not None: + num_conv_layers = extra.get('num_conv_layers', 0) + num_conv_kernels = extra.get('num_conv_kernels', + [1] * num_conv_layers) + + for i in range(num_conv_layers): + layers.append( + build_conv_layer( + dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=conv_channels, + kernel_size=num_conv_kernels[i], + stride=1, + padding=(num_conv_kernels[i] - 1) // 2)) + layers.append( + build_norm_layer(dict(type='BN'), conv_channels)[1]) + layers.append(nn.ReLU(inplace=True)) + + layers.append( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding)) + + if len(layers) > 1: + self.final_layer = nn.Sequential(*layers) + else: + self.final_layer = layers[0] + + def get_loss(self, output, target, target_weight): + """Calculate top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): Output heatmaps. + target (torch.Tensor[N,K,H,W]): Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. + """ + + losses = dict() + + assert not isinstance(self.loss, nn.Sequential) + assert target.dim() == 4 and target_weight.dim() == 3 + losses['heatmap_loss'] = self.loss(output, target, target_weight) + + return losses + + def get_accuracy(self, output, target, target_weight): + """Calculate accuracy for top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): Output heatmaps. + target (torch.Tensor[N,K,H,W]): Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. + """ + + accuracy = dict() + + if self.target_type.lower() == 'GaussianHeatmap'.lower(): + _, avg_acc, _ = pose_pck_accuracy( + output.detach().cpu().numpy(), + target.detach().cpu().numpy(), + target_weight.detach().cpu().numpy().squeeze(-1) > 0) + accuracy['acc_pose'] = float(avg_acc) + + return accuracy + + def forward(self, x): + """Forward function.""" + x = self._transform_inputs(x) + x = self.deconv_layers(x) + x = self.final_layer(x) + return x + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_heatmap (np.ndarray): Output heatmaps. + + Args: + x (torch.Tensor[N,K,H,W]): Input features. + flip_pairs (None | list[tuple]): + Pairs of keypoints which are mirrored. + """ + output = self.forward(x) + + if flip_pairs is not None: + output_heatmap = flip_back( + output.detach().cpu().numpy(), + flip_pairs, + target_type=self.target_type) + # feature is not aligned, shift flipped heatmap for higher accuracy + if self.test_cfg.get('shift_heatmap', False): + output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] + else: + output_heatmap = output.detach().cpu().numpy() + return output_heatmap + + def _init_inputs(self, in_channels, in_index, input_transform): + """Check and initialize input transforms. + + The in_channels, in_index and input_transform must match. + Specifically, when input_transform is None, only single feature map + will be selected. 
So in_channels and in_index must be of type int. + When input_transform is not None, in_channels and in_index must be + list or tuple, with the same length. + + Args: + in_channels (int|Sequence[int]): Input channels. + in_index (int|Sequence[int]): Input feature index. + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + + - 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + - 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + - None: Only one select feature map is allowed. + """ + + if input_transform is not None: + assert input_transform in ['resize_concat', 'multiple_select'] + self.input_transform = input_transform + self.in_index = in_index + if input_transform is not None: + assert isinstance(in_channels, (list, tuple)) + assert isinstance(in_index, (list, tuple)) + assert len(in_channels) == len(in_index) + if input_transform == 'resize_concat': + self.in_channels = sum(in_channels) + else: + self.in_channels = in_channels + else: + assert isinstance(in_channels, int) + assert isinstance(in_index, int) + self.in_channels = in_channels + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + + Args: + inputs (list[Tensor] | Tensor): multi-level img features. + + Returns: + Tensor: The transformed inputs + """ + if not isinstance(inputs, list): + return inputs + + if self.input_transform == 'resize_concat': + inputs = [inputs[i] for i in self.in_index] + upsampled_inputs = [ + resize( + input=x, + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) for x in inputs + ] + inputs = torch.cat(upsampled_inputs, dim=1) + elif self.input_transform == 'multiple_select': + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index] + + return inputs + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels, + num_groups): + """Make deconv layers.""" + if num_layers != len(num_filters): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_filters({len(num_filters)})' + raise ValueError(error_msg) + if num_layers != len(num_kernels): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_kernels({len(num_kernels)})' + raise ValueError(error_msg) + if num_layers != len(num_groups): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_groups({len(num_groups)})' + raise ValueError(error_msg) + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + groups = num_groups[i] + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=self.in_channels, + out_channels=planes, + kernel_size=kernel, + groups=groups, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False)) + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + self.in_channels = planes + + return nn.Sequential(*layers) + + def init_weights(self): + """Initialize model weights.""" + for _, m in self.deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for m in self.final_layer.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) diff --git 
a/mmpose/models/heads/voxelpose_head.py b/mmpose/models/heads/voxelpose_head.py new file mode 100644 index 0000000..8799bdc --- /dev/null +++ b/mmpose/models/heads/voxelpose_head.py @@ -0,0 +1,167 @@ +# ------------------------------------------------------------------------------ +# Copyright and License Information +# https://github.com/microsoft/voxelpose-pytorch/blob/main/lib/models +# Original Licence: MIT License +# ------------------------------------------------------------------------------ + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import HEADS + + +@HEADS.register_module() +class CuboidCenterHead(nn.Module): + """Get results from the 3D human center heatmap. In this module, human 3D + centers are local maximums obtained from the 3D heatmap via NMS (max- + pooling). + + Args: + space_size (list[3]): The size of the 3D space. + cube_size (list[3]): The size of the heatmap volume. + space_center (list[3]): The coordinate of space center. + max_num (int): Maximum of human center detections. + max_pool_kernel (int): Kernel size of the max-pool kernel in nms. + """ + + def __init__(self, + space_size, + space_center, + cube_size, + max_num=10, + max_pool_kernel=3): + super(CuboidCenterHead, self).__init__() + # use register_buffer + self.register_buffer('grid_size', torch.tensor(space_size)) + self.register_buffer('cube_size', torch.tensor(cube_size)) + self.register_buffer('grid_center', torch.tensor(space_center)) + + self.num_candidates = max_num + self.max_pool_kernel = max_pool_kernel + self.loss = nn.MSELoss() + + def _get_real_locations(self, indices): + """ + Args: + indices (torch.Tensor(NXP)): Indices of points in the 3D tensor + + Returns: + real_locations (torch.Tensor(NXPx3)): Locations of points + in the world coordinate system + """ + real_locations = indices.float() / ( + self.cube_size - 1) * self.grid_size + \ + self.grid_center - self.grid_size / 2.0 + return real_locations + + def _nms_by_max_pool(self, heatmap_volumes): + max_num = self.num_candidates + batch_size = heatmap_volumes.shape[0] + root_cubes_nms = self._max_pool(heatmap_volumes) + root_cubes_nms_reshape = root_cubes_nms.reshape(batch_size, -1) + topk_values, topk_index = root_cubes_nms_reshape.topk(max_num) + topk_unravel_index = self._get_3d_indices(topk_index, + heatmap_volumes[0].shape) + + return topk_values, topk_unravel_index + + def _max_pool(self, inputs): + kernel = self.max_pool_kernel + padding = (kernel - 1) // 2 + max = F.max_pool3d( + inputs, kernel_size=kernel, stride=1, padding=padding) + keep = (inputs == max).float() + return keep * inputs + + @staticmethod + def _get_3d_indices(indices, shape): + """Get indices in the 3-D tensor. + + Args: + indices (torch.Tensor(NXp)): Indices of points in the 1D tensor + shape (torch.Size(3)): The shape of the original 3D tensor + + Returns: + indices: Indices of points in the original 3D tensor + """ + batch_size = indices.shape[0] + num_people = indices.shape[1] + indices_x = (indices // + (shape[1] * shape[2])).reshape(batch_size, num_people, -1) + indices_y = ((indices % (shape[1] * shape[2])) // + shape[2]).reshape(batch_size, num_people, -1) + indices_z = (indices % shape[2]).reshape(batch_size, num_people, -1) + indices = torch.cat([indices_x, indices_y, indices_z], dim=2) + return indices + + def forward(self, heatmap_volumes): + """ + + Args: + heatmap_volumes (torch.Tensor(NXLXWXH)): + 3D human center heatmaps predicted by the network. 
+ Returns: + human_centers (torch.Tensor(NXPX5)): + Coordinates of human centers. + """ + batch_size = heatmap_volumes.shape[0] + + topk_values, topk_unravel_index = self._nms_by_max_pool( + heatmap_volumes.detach()) + + topk_unravel_index = self._get_real_locations(topk_unravel_index) + + human_centers = torch.zeros( + batch_size, self.num_candidates, 5, device=heatmap_volumes.device) + human_centers[:, :, 0:3] = topk_unravel_index + human_centers[:, :, 4] = topk_values + + return human_centers + + def get_loss(self, pred_cubes, gt): + + return dict(loss_center=self.loss(pred_cubes, gt)) + + +@HEADS.register_module() +class CuboidPoseHead(nn.Module): + + def __init__(self, beta): + """Get results from the 3D human pose heatmap. Instead of obtaining + maximums on the heatmap, this module regresses the coordinates of + keypoints via integral pose regression. Refer to `paper. + + ` for more details. + + Args: + beta: Constant to adjust the magnification of soft-maxed heatmap. + """ + super(CuboidPoseHead, self).__init__() + self.beta = beta + self.loss = nn.L1Loss() + + def forward(self, heatmap_volumes, grid_coordinates): + """ + + Args: + heatmap_volumes (torch.Tensor(NxKxLxWxH)): + 3D human pose heatmaps predicted by the network. + grid_coordinates (torch.Tensor(Nx(LxWxH)x3)): + Coordinates of the grids in the heatmap volumes. + Returns: + human_poses (torch.Tensor(NxKx3)): Coordinates of human poses. + """ + batch_size = heatmap_volumes.size(0) + channel = heatmap_volumes.size(1) + x = heatmap_volumes.reshape(batch_size, channel, -1, 1) + x = F.softmax(self.beta * x, dim=2) + grid_coordinates = grid_coordinates.unsqueeze(1) + x = torch.mul(x, grid_coordinates) + human_poses = torch.sum(x, dim=2) + + return human_poses + + def get_loss(self, preds, targets, weights): + + return dict(loss_pose=self.loss(preds * weights, targets * weights)) diff --git a/mmpose/models/losses/__init__.py b/mmpose/models/losses/__init__.py new file mode 100644 index 0000000..d67973f --- /dev/null +++ b/mmpose/models/losses/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .classfication_loss import BCELoss +from .heatmap_loss import AdaptiveWingLoss +from .mesh_loss import GANLoss, MeshLoss +from .mse_loss import JointsMSELoss, JointsOHKMMSELoss +from .multi_loss_factory import AELoss, HeatmapLoss, MultiLossFactory +from .regression_loss import (BoneLoss, L1Loss, MPJPELoss, MSELoss, + SemiSupervisionLoss, SmoothL1Loss, SoftWingLoss, + WingLoss) + +__all__ = [ + 'JointsMSELoss', 'JointsOHKMMSELoss', 'HeatmapLoss', 'AELoss', + 'MultiLossFactory', 'MeshLoss', 'GANLoss', 'SmoothL1Loss', 'WingLoss', + 'MPJPELoss', 'MSELoss', 'L1Loss', 'BCELoss', 'BoneLoss', + 'SemiSupervisionLoss', 'SoftWingLoss', 'AdaptiveWingLoss' +] diff --git a/mmpose/models/losses/classfication_loss.py b/mmpose/models/losses/classfication_loss.py new file mode 100644 index 0000000..b79b69d --- /dev/null +++ b/mmpose/models/losses/classfication_loss.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import LOSSES + + +@LOSSES.register_module() +class BCELoss(nn.Module): + """Binary Cross Entropy loss.""" + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.criterion = F.binary_cross_entropy + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight=None): + """Forward function. 
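+        The underlying criterion is ``F.binary_cross_entropy``, so ``output``
+        is expected to contain probabilities in ``[0, 1]`` rather than raw
+        logits.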
+ + Note: + - batch_size: N + - num_labels: K + + Args: + output (torch.Tensor[N, K]): Output classification. + target (torch.Tensor[N, K]): Target classification. + target_weight (torch.Tensor[N, K] or torch.Tensor[N]): + Weights across different labels. + """ + + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output, target, reduction='none') + if target_weight.dim() == 1: + target_weight = target_weight[:, None] + loss = (loss * target_weight).mean() + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight diff --git a/mmpose/models/losses/heatmap_loss.py b/mmpose/models/losses/heatmap_loss.py new file mode 100644 index 0000000..9471457 --- /dev/null +++ b/mmpose/models/losses/heatmap_loss.py @@ -0,0 +1,86 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from ..builder import LOSSES + + +@LOSSES.register_module() +class AdaptiveWingLoss(nn.Module): + """Adaptive wing loss. paper ref: 'Adaptive Wing Loss for Robust Face + Alignment via Heatmap Regression' Wang et al. ICCV'2019. + + Args: + alpha (float), omega (float), epsilon (float), theta (float) + are hyper-parameters. + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, + alpha=2.1, + omega=14, + epsilon=1, + theta=0.5, + use_target_weight=False, + loss_weight=1.): + super().__init__() + self.alpha = float(alpha) + self.omega = float(omega) + self.epsilon = float(epsilon) + self.theta = float(theta) + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def criterion(self, pred, target): + """Criterion of wingloss. + + Note: + batch_size: N + num_keypoints: K + + Args: + pred (torch.Tensor[NxKxHxW]): Predicted heatmaps. + target (torch.Tensor[NxKxHxW]): Target heatmaps. + """ + H, W = pred.shape[2:4] + delta = (target - pred).abs() + + A = self.omega * ( + 1 / (1 + torch.pow(self.theta / self.epsilon, self.alpha - target)) + ) * (self.alpha - target) * (torch.pow( + self.theta / self.epsilon, + self.alpha - target - 1)) * (1 / self.epsilon) + C = self.theta * A - self.omega * torch.log( + 1 + torch.pow(self.theta / self.epsilon, self.alpha - target)) + + losses = torch.where( + delta < self.theta, + self.omega * + torch.log(1 + + torch.pow(delta / self.epsilon, self.alpha - target)), + A * delta - C) + + return torch.mean(losses) + + def forward(self, output, target, target_weight): + """Forward function. + + Note: + batch_size: N + num_keypoints: K + + Args: + output (torch.Tensor[NxKxHxW]): Output heatmaps. + target (torch.Tensor[NxKxHxW]): Target heatmaps. + target_weight (torch.Tensor[NxKx1]): + Weights across different joint types. + """ + if self.use_target_weight: + loss = self.criterion(output * target_weight.unsqueeze(-1), + target * target_weight.unsqueeze(-1)) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight diff --git a/mmpose/models/losses/mesh_loss.py b/mmpose/models/losses/mesh_loss.py new file mode 100644 index 0000000..f9d18bd --- /dev/null +++ b/mmpose/models/losses/mesh_loss.py @@ -0,0 +1,340 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +import torch.nn as nn + +from ..builder import LOSSES +from ..utils.geometry import batch_rodrigues + + +def perspective_projection(points, rotation, translation, focal_length, + camera_center): + """This function computes the perspective projection of a set of 3D points. + + Note: + - batch size: B + - point number: N + + Args: + points (Tensor([B, N, 3])): A set of 3D points + rotation (Tensor([B, 3, 3])): Camera rotation matrix + translation (Tensor([B, 3])): Camera translation + focal_length (Tensor([B,])): Focal length + camera_center (Tensor([B, 2])): Camera center + + Returns: + projected_points (Tensor([B, N, 2])): Projected 2D + points in image space. + """ + + batch_size = points.shape[0] + K = torch.zeros([batch_size, 3, 3], device=points.device) + K[:, 0, 0] = focal_length + K[:, 1, 1] = focal_length + K[:, 2, 2] = 1. + K[:, :-1, -1] = camera_center + + # Transform points + points = torch.einsum('bij,bkj->bki', rotation, points) + points = points + translation.unsqueeze(1) + + # Apply perspective distortion + projected_points = points / points[:, :, -1].unsqueeze(-1) + + # Apply camera intrinsics + projected_points = torch.einsum('bij,bkj->bki', K, projected_points) + projected_points = projected_points[:, :, :-1] + return projected_points + + +@LOSSES.register_module() +class MeshLoss(nn.Module): + """Mix loss for 3D human mesh. It is composed of loss on 2D joints, 3D + joints, mesh vertices and smpl parameters (if any). + + Args: + joints_2d_loss_weight (float): Weight for loss on 2D joints. + joints_3d_loss_weight (float): Weight for loss on 3D joints. + vertex_loss_weight (float): Weight for loss on 3D verteices. + smpl_pose_loss_weight (float): Weight for loss on SMPL + pose parameters. + smpl_beta_loss_weight (float): Weight for loss on SMPL + shape parameters. + img_res (int): Input image resolution. + focal_length (float): Focal length of camera model. Default=5000. + """ + + def __init__(self, + joints_2d_loss_weight, + joints_3d_loss_weight, + vertex_loss_weight, + smpl_pose_loss_weight, + smpl_beta_loss_weight, + img_res, + focal_length=5000): + + super().__init__() + # Per-vertex loss on the mesh + self.criterion_vertex = nn.L1Loss(reduction='none') + + # Joints (2D and 3D) loss + self.criterion_joints_2d = nn.SmoothL1Loss(reduction='none') + self.criterion_joints_3d = nn.SmoothL1Loss(reduction='none') + + # Loss for SMPL parameter regression + self.criterion_regr = nn.MSELoss(reduction='none') + + self.joints_2d_loss_weight = joints_2d_loss_weight + self.joints_3d_loss_weight = joints_3d_loss_weight + self.vertex_loss_weight = vertex_loss_weight + self.smpl_pose_loss_weight = smpl_pose_loss_weight + self.smpl_beta_loss_weight = smpl_beta_loss_weight + self.focal_length = focal_length + self.img_res = img_res + + def joints_2d_loss(self, pred_joints_2d, gt_joints_2d, joints_2d_visible): + """Compute 2D reprojection loss on the joints. + + The loss is weighted by joints_2d_visible. + """ + conf = joints_2d_visible.float() + loss = (conf * + self.criterion_joints_2d(pred_joints_2d, gt_joints_2d)).mean() + return loss + + def joints_3d_loss(self, pred_joints_3d, gt_joints_3d, joints_3d_visible): + """Compute 3D joints loss for the examples that 3D joint annotations + are available. + + The loss is weighted by joints_3d_visible. 
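+        Both predictions and targets are root-centered by subtracting the
+        pelvis position (the midpoint of joints 2 and 3) before the loss is
+        computed.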
+ """ + conf = joints_3d_visible.float() + if len(gt_joints_3d) > 0: + gt_pelvis = (gt_joints_3d[:, 2, :] + gt_joints_3d[:, 3, :]) / 2 + gt_joints_3d = gt_joints_3d - gt_pelvis[:, None, :] + pred_pelvis = (pred_joints_3d[:, 2, :] + + pred_joints_3d[:, 3, :]) / 2 + pred_joints_3d = pred_joints_3d - pred_pelvis[:, None, :] + return ( + conf * + self.criterion_joints_3d(pred_joints_3d, gt_joints_3d)).mean() + return pred_joints_3d.sum() * 0 + + def vertex_loss(self, pred_vertices, gt_vertices, has_smpl): + """Compute 3D vertex loss for the examples that 3D human mesh + annotations are available. + + The loss is weighted by the has_smpl. + """ + conf = has_smpl.float() + loss_vertex = self.criterion_vertex(pred_vertices, gt_vertices) + loss_vertex = (conf[:, None, None] * loss_vertex).mean() + return loss_vertex + + def smpl_losses(self, pred_rotmat, pred_betas, gt_pose, gt_betas, + has_smpl): + """Compute SMPL parameters loss for the examples that SMPL parameter + annotations are available. + + The loss is weighted by has_smpl. + """ + conf = has_smpl.float() + gt_rotmat = batch_rodrigues(gt_pose.view(-1, 3)).view(-1, 24, 3, 3) + loss_regr_pose = self.criterion_regr(pred_rotmat, gt_rotmat) + loss_regr_betas = self.criterion_regr(pred_betas, gt_betas) + loss_regr_pose = (conf[:, None, None, None] * loss_regr_pose).mean() + loss_regr_betas = (conf[:, None] * loss_regr_betas).mean() + return loss_regr_pose, loss_regr_betas + + def project_points(self, points_3d, camera): + """Perform orthographic projection of 3D points using the camera + parameters, return projected 2D points in image plane. + + Note: + - batch size: B + - point number: N + + Args: + points_3d (Tensor([B, N, 3])): 3D points. + camera (Tensor([B, 3])): camera parameters with the + 3 channel as (scale, translation_x, translation_y) + + Returns: + Tensor([B, N, 2]): projected 2D points \ + in image space. + """ + batch_size = points_3d.shape[0] + device = points_3d.device + cam_t = torch.stack([ + camera[:, 1], camera[:, 2], 2 * self.focal_length / + (self.img_res * camera[:, 0] + 1e-9) + ], + dim=-1) + camera_center = camera.new_zeros([batch_size, 2]) + rot_t = torch.eye( + 3, device=device, + dtype=points_3d.dtype).unsqueeze(0).expand(batch_size, -1, -1) + joints_2d = perspective_projection( + points_3d, + rotation=rot_t, + translation=cam_t, + focal_length=self.focal_length, + camera_center=camera_center) + return joints_2d + + def forward(self, output, target): + """Forward function. + + Args: + output (dict): dict of network predicted results. + Keys: 'vertices', 'joints_3d', 'camera', + 'pose'(optional), 'beta'(optional) + target (dict): dict of ground-truth labels. + Keys: 'vertices', 'joints_3d', 'joints_3d_visible', + 'joints_2d', 'joints_2d_visible', 'pose', 'beta', + 'has_smpl' + + Returns: + dict: dict of losses. 
+ """ + losses = {} + + # Per-vertex loss for the shape + pred_vertices = output['vertices'] + + gt_vertices = target['vertices'] + has_smpl = target['has_smpl'] + loss_vertex = self.vertex_loss(pred_vertices, gt_vertices, has_smpl) + losses['vertex_loss'] = loss_vertex * self.vertex_loss_weight + + # Compute loss on SMPL parameters, if available + if 'pose' in output.keys() and 'beta' in output.keys(): + pred_rotmat = output['pose'] + pred_betas = output['beta'] + gt_pose = target['pose'] + gt_betas = target['beta'] + loss_regr_pose, loss_regr_betas = self.smpl_losses( + pred_rotmat, pred_betas, gt_pose, gt_betas, has_smpl) + losses['smpl_pose_loss'] = \ + loss_regr_pose * self.smpl_pose_loss_weight + losses['smpl_beta_loss'] = \ + loss_regr_betas * self.smpl_beta_loss_weight + + # Compute 3D joints loss + pred_joints_3d = output['joints_3d'] + gt_joints_3d = target['joints_3d'] + joints_3d_visible = target['joints_3d_visible'] + loss_joints_3d = self.joints_3d_loss(pred_joints_3d, gt_joints_3d, + joints_3d_visible) + losses['joints_3d_loss'] = loss_joints_3d * self.joints_3d_loss_weight + + # Compute 2D reprojection loss for the 2D joints + pred_camera = output['camera'] + gt_joints_2d = target['joints_2d'] + joints_2d_visible = target['joints_2d_visible'] + pred_joints_2d = self.project_points(pred_joints_3d, pred_camera) + + # Normalize keypoints to [-1,1] + # The coordinate origin of pred_joints_2d is + # the center of the input image. + pred_joints_2d = 2 * pred_joints_2d / (self.img_res - 1) + # The coordinate origin of gt_joints_2d is + # the top left corner of the input image. + gt_joints_2d = 2 * gt_joints_2d / (self.img_res - 1) - 1 + loss_joints_2d = self.joints_2d_loss(pred_joints_2d, gt_joints_2d, + joints_2d_visible) + losses['joints_2d_loss'] = loss_joints_2d * self.joints_2d_loss_weight + + return losses + + +@LOSSES.register_module() +class GANLoss(nn.Module): + """Define GAN loss. + + Args: + gan_type (str): Support 'vanilla', 'lsgan', 'wgan', 'hinge'. + real_label_val (float): The value for real label. Default: 1.0. + fake_label_val (float): The value for fake label. Default: 0.0. + loss_weight (float): Loss weight. Default: 1.0. + Note that loss_weight is only for generators; and it is always 1.0 + for discriminators. + """ + + def __init__(self, + gan_type, + real_label_val=1.0, + fake_label_val=0.0, + loss_weight=1.0): + super().__init__() + self.gan_type = gan_type + self.loss_weight = loss_weight + self.real_label_val = real_label_val + self.fake_label_val = fake_label_val + + if self.gan_type == 'vanilla': + self.loss = nn.BCEWithLogitsLoss() + elif self.gan_type == 'lsgan': + self.loss = nn.MSELoss() + elif self.gan_type == 'wgan': + self.loss = self._wgan_loss + elif self.gan_type == 'hinge': + self.loss = nn.ReLU() + else: + raise NotImplementedError( + f'GAN type {self.gan_type} is not implemented.') + + @staticmethod + def _wgan_loss(input, target): + """wgan loss. + + Args: + input (Tensor): Input tensor. + target (bool): Target label. + + Returns: + Tensor: wgan loss. + """ + return -input.mean() if target else input.mean() + + def get_target_label(self, input, target_is_real): + """Get target label. + + Args: + input (Tensor): Input tensor. + target_is_real (bool): Whether the target is real or fake. + + Returns: + (bool | Tensor): Target tensor. Return bool for wgan, \ + otherwise, return Tensor. 
+ """ + + if self.gan_type == 'wgan': + return target_is_real + target_val = ( + self.real_label_val if target_is_real else self.fake_label_val) + return input.new_ones(input.size()) * target_val + + def forward(self, input, target_is_real, is_disc=False): + """ + Args: + input (Tensor): The input for the loss module, i.e., the network + prediction. + target_is_real (bool): Whether the targe is real or fake. + is_disc (bool): Whether the loss for discriminators or not. + Default: False. + + Returns: + Tensor: GAN loss value. + """ + target_label = self.get_target_label(input, target_is_real) + if self.gan_type == 'hinge': + if is_disc: # for discriminators in hinge-gan + input = -input if target_is_real else input + loss = self.loss(1 + input).mean() + else: # for generators in hinge-gan + loss = -input.mean() + else: # other gan types + loss = self.loss(input, target_label) + + # loss_weight is always 1.0 for discriminators + return loss if is_disc else loss * self.loss_weight diff --git a/mmpose/models/losses/mse_loss.py b/mmpose/models/losses/mse_loss.py new file mode 100644 index 0000000..f972efa --- /dev/null +++ b/mmpose/models/losses/mse_loss.py @@ -0,0 +1,153 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from ..builder import LOSSES + + +@LOSSES.register_module() +class JointsMSELoss(nn.Module): + """MSE loss for heatmaps. + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.criterion = nn.MSELoss() + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight): + """Forward function.""" + batch_size = output.size(0) + num_joints = output.size(1) + + heatmaps_pred = output.reshape( + (batch_size, num_joints, -1)).split(1, 1) + heatmaps_gt = target.reshape((batch_size, num_joints, -1)).split(1, 1) + + loss = 0. + + for idx in range(num_joints): + heatmap_pred = heatmaps_pred[idx].squeeze(1) + heatmap_gt = heatmaps_gt[idx].squeeze(1) + if self.use_target_weight: + loss += self.criterion(heatmap_pred * target_weight[:, idx], + heatmap_gt * target_weight[:, idx]) + else: + loss += self.criterion(heatmap_pred, heatmap_gt) + + return loss / num_joints * self.loss_weight + + +@LOSSES.register_module() +class CombinedTargetMSELoss(nn.Module): + """MSE loss for combined target. + CombinedTarget: The combination of classification target + (response map) and regression target (offset map). + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, use_target_weight, loss_weight=1.): + super().__init__() + self.criterion = nn.MSELoss(reduction='mean') + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight): + batch_size = output.size(0) + num_channels = output.size(1) + heatmaps_pred = output.reshape( + (batch_size, num_channels, -1)).split(1, 1) + heatmaps_gt = target.reshape( + (batch_size, num_channels, -1)).split(1, 1) + loss = 0. 
+ num_joints = num_channels // 3 + for idx in range(num_joints): + heatmap_pred = heatmaps_pred[idx * 3].squeeze() + heatmap_gt = heatmaps_gt[idx * 3].squeeze() + offset_x_pred = heatmaps_pred[idx * 3 + 1].squeeze() + offset_x_gt = heatmaps_gt[idx * 3 + 1].squeeze() + offset_y_pred = heatmaps_pred[idx * 3 + 2].squeeze() + offset_y_gt = heatmaps_gt[idx * 3 + 2].squeeze() + if self.use_target_weight: + heatmap_pred = heatmap_pred * target_weight[:, idx] + heatmap_gt = heatmap_gt * target_weight[:, idx] + # classification loss + loss += 0.5 * self.criterion(heatmap_pred, heatmap_gt) + # regression loss + loss += 0.5 * self.criterion(heatmap_gt * offset_x_pred, + heatmap_gt * offset_x_gt) + loss += 0.5 * self.criterion(heatmap_gt * offset_y_pred, + heatmap_gt * offset_y_gt) + return loss / num_joints * self.loss_weight + + +@LOSSES.register_module() +class JointsOHKMMSELoss(nn.Module): + """MSE loss with online hard keypoint mining. + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + topk (int): Only top k joint losses are kept. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, use_target_weight=False, topk=8, loss_weight=1.): + super().__init__() + assert topk > 0 + self.criterion = nn.MSELoss(reduction='none') + self.use_target_weight = use_target_weight + self.topk = topk + self.loss_weight = loss_weight + + def _ohkm(self, loss): + """Online hard keypoint mining.""" + ohkm_loss = 0. + N = len(loss) + for i in range(N): + sub_loss = loss[i] + _, topk_idx = torch.topk( + sub_loss, k=self.topk, dim=0, sorted=False) + tmp_loss = torch.gather(sub_loss, 0, topk_idx) + ohkm_loss += torch.sum(tmp_loss) / self.topk + ohkm_loss /= N + return ohkm_loss + + def forward(self, output, target, target_weight): + """Forward function.""" + batch_size = output.size(0) + num_joints = output.size(1) + if num_joints < self.topk: + raise ValueError(f'topk ({self.topk}) should not ' + f'larger than num_joints ({num_joints}).') + heatmaps_pred = output.reshape( + (batch_size, num_joints, -1)).split(1, 1) + heatmaps_gt = target.reshape((batch_size, num_joints, -1)).split(1, 1) + + losses = [] + for idx in range(num_joints): + heatmap_pred = heatmaps_pred[idx].squeeze(1) + heatmap_gt = heatmaps_gt[idx].squeeze(1) + if self.use_target_weight: + losses.append( + self.criterion(heatmap_pred * target_weight[:, idx], + heatmap_gt * target_weight[:, idx])) + else: + losses.append(self.criterion(heatmap_pred, heatmap_gt)) + + losses = [loss.mean(dim=1).unsqueeze(dim=1) for loss in losses] + losses = torch.cat(losses, dim=1) + + return self._ohkm(losses) * self.loss_weight diff --git a/mmpose/models/losses/multi_loss_factory.py b/mmpose/models/losses/multi_loss_factory.py new file mode 100644 index 0000000..65f90a7 --- /dev/null +++ b/mmpose/models/losses/multi_loss_factory.py @@ -0,0 +1,281 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation +# Original licence: Copyright (c) Microsoft, under the MIT License. +# ------------------------------------------------------------------------------ + +import torch +import torch.nn as nn + +from ..builder import LOSSES + + +def _make_input(t, requires_grad=False, device=torch.device('cpu')): + """Make zero inputs for AE loss. + + Args: + t (torch.Tensor): input + requires_grad (bool): Option to use requires_grad. 
+ device: torch device + + Returns: + torch.Tensor: zero input. + """ + inp = torch.autograd.Variable(t, requires_grad=requires_grad) + inp = inp.sum() + inp = inp.to(device) + return inp + + +@LOSSES.register_module() +class HeatmapLoss(nn.Module): + """Accumulate the heatmap loss for each image in the batch. + + Args: + supervise_empty (bool): Whether to supervise empty channels. + """ + + def __init__(self, supervise_empty=True): + super().__init__() + self.supervise_empty = supervise_empty + + def forward(self, pred, gt, mask): + """Forward function. + + Note: + - batch_size: N + - heatmaps weight: W + - heatmaps height: H + - max_num_people: M + - num_keypoints: K + + Args: + pred (torch.Tensor[N,K,H,W]):heatmap of output. + gt (torch.Tensor[N,K,H,W]): target heatmap. + mask (torch.Tensor[N,H,W]): mask of target. + """ + assert pred.size() == gt.size( + ), f'pred.size() is {pred.size()}, gt.size() is {gt.size()}' + + if not self.supervise_empty: + empty_mask = (gt.sum(dim=[2, 3], keepdim=True) > 0).float() + loss = ((pred - gt)**2) * empty_mask.expand_as( + pred) * mask[:, None, :, :].expand_as(pred) + else: + loss = ((pred - gt)**2) * mask[:, None, :, :].expand_as(pred) + loss = loss.mean(dim=3).mean(dim=2).mean(dim=1) + return loss + + +@LOSSES.register_module() +class AELoss(nn.Module): + """Associative Embedding loss. + + `Associative Embedding: End-to-End Learning for Joint Detection and + Grouping `_. + """ + + def __init__(self, loss_type): + super().__init__() + self.loss_type = loss_type + + def singleTagLoss(self, pred_tag, joints): + """Associative embedding loss for one image. + + Note: + - heatmaps weight: W + - heatmaps height: H + - max_num_people: M + - num_keypoints: K + + Args: + pred_tag (torch.Tensor[KxHxW,1]): tag of output for one image. + joints (torch.Tensor[M,K,2]): joints information for one image. + """ + tags = [] + pull = 0 + for joints_per_person in joints: + tmp = [] + for joint in joints_per_person: + if joint[1] > 0: + tmp.append(pred_tag[joint[0]]) + if len(tmp) == 0: + continue + tmp = torch.stack(tmp) + tags.append(torch.mean(tmp, dim=0)) + pull = pull + torch.mean((tmp - tags[-1].expand_as(tmp))**2) + + num_tags = len(tags) + if num_tags == 0: + return ( + _make_input(torch.zeros(1).float(), device=pred_tag.device), + _make_input(torch.zeros(1).float(), device=pred_tag.device)) + elif num_tags == 1: + return (_make_input( + torch.zeros(1).float(), device=pred_tag.device), pull) + + tags = torch.stack(tags) + + size = (num_tags, num_tags) + A = tags.expand(*size) + B = A.permute(1, 0) + + diff = A - B + + if self.loss_type == 'exp': + diff = torch.pow(diff, 2) + push = torch.exp(-diff) + push = torch.sum(push) - num_tags + elif self.loss_type == 'max': + diff = 1 - torch.abs(diff) + push = torch.clamp(diff, min=0).sum() - num_tags + else: + raise ValueError('Unknown ae loss type') + + push_loss = push / ((num_tags - 1) * num_tags) * 0.5 + pull_loss = pull / (num_tags) + + return push_loss, pull_loss + + def forward(self, tags, joints): + """Accumulate the tag loss for each image in the batch. + + Note: + - batch_size: N + - heatmaps weight: W + - heatmaps height: H + - max_num_people: M + - num_keypoints: K + + Args: + tags (torch.Tensor[N,KxHxW,1]): tag channels of output. + joints (torch.Tensor[N,M,K,2]): joints information. 
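+
+        Returns:
+            tuple[torch.Tensor, torch.Tensor]: the stacked per-image push
+            losses and pull losses.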
+ """ + pushes, pulls = [], [] + joints = joints.cpu().data.numpy() + batch_size = tags.size(0) + for i in range(batch_size): + push, pull = self.singleTagLoss(tags[i], joints[i]) + pushes.append(push) + pulls.append(pull) + return torch.stack(pushes), torch.stack(pulls) + + +@LOSSES.register_module() +class MultiLossFactory(nn.Module): + """Loss for bottom-up models. + + Args: + num_joints (int): Number of keypoints. + num_stages (int): Number of stages. + ae_loss_type (str): Type of ae loss. + with_ae_loss (list[bool]): Use ae loss or not in multi-heatmap. + push_loss_factor (list[float]): + Parameter of push loss in multi-heatmap. + pull_loss_factor (list[float]): + Parameter of pull loss in multi-heatmap. + with_heatmap_loss (list[bool]): + Use heatmap loss or not in multi-heatmap. + heatmaps_loss_factor (list[float]): + Parameter of heatmap loss in multi-heatmap. + supervise_empty (bool): Whether to supervise empty channels. + """ + + def __init__(self, + num_joints, + num_stages, + ae_loss_type, + with_ae_loss, + push_loss_factor, + pull_loss_factor, + with_heatmaps_loss, + heatmaps_loss_factor, + supervise_empty=True): + super().__init__() + + assert isinstance(with_heatmaps_loss, (list, tuple)), \ + 'with_heatmaps_loss should be a list or tuple' + assert isinstance(heatmaps_loss_factor, (list, tuple)), \ + 'heatmaps_loss_factor should be a list or tuple' + assert isinstance(with_ae_loss, (list, tuple)), \ + 'with_ae_loss should be a list or tuple' + assert isinstance(push_loss_factor, (list, tuple)), \ + 'push_loss_factor should be a list or tuple' + assert isinstance(pull_loss_factor, (list, tuple)), \ + 'pull_loss_factor should be a list or tuple' + + self.num_joints = num_joints + self.num_stages = num_stages + self.ae_loss_type = ae_loss_type + self.with_ae_loss = with_ae_loss + self.push_loss_factor = push_loss_factor + self.pull_loss_factor = pull_loss_factor + self.with_heatmaps_loss = with_heatmaps_loss + self.heatmaps_loss_factor = heatmaps_loss_factor + + self.heatmaps_loss = \ + nn.ModuleList( + [ + HeatmapLoss(supervise_empty) + if with_heatmaps_loss else None + for with_heatmaps_loss in self.with_heatmaps_loss + ] + ) + + self.ae_loss = \ + nn.ModuleList( + [ + AELoss(self.ae_loss_type) if with_ae_loss else None + for with_ae_loss in self.with_ae_loss + ] + ) + + def forward(self, outputs, heatmaps, masks, joints): + """Forward function to calculate losses. + + Note: + - batch_size: N + - heatmaps weight: W + - heatmaps height: H + - max_num_people: M + - num_keypoints: K + - output_channel: C C=2K if use ae loss else K + + Args: + outputs (list(torch.Tensor[N,C,H,W])): outputs of stages. + heatmaps (list(torch.Tensor[N,K,H,W])): target of heatmaps. + masks (list(torch.Tensor[N,H,W])): masks of heatmaps. + joints (list(torch.Tensor[N,M,K,2])): joints of ae loss. 
+ """ + heatmaps_losses = [] + push_losses = [] + pull_losses = [] + for idx in range(len(outputs)): + offset_feat = 0 + if self.heatmaps_loss[idx]: + heatmaps_pred = outputs[idx][:, :self.num_joints] + offset_feat = self.num_joints + heatmaps_loss = self.heatmaps_loss[idx](heatmaps_pred, + heatmaps[idx], + masks[idx]) + heatmaps_loss = heatmaps_loss * self.heatmaps_loss_factor[idx] + heatmaps_losses.append(heatmaps_loss) + else: + heatmaps_losses.append(None) + + if self.ae_loss[idx]: + tags_pred = outputs[idx][:, offset_feat:] + batch_size = tags_pred.size()[0] + tags_pred = tags_pred.contiguous().view(batch_size, -1, 1) + + push_loss, pull_loss = self.ae_loss[idx](tags_pred, + joints[idx]) + push_loss = push_loss * self.push_loss_factor[idx] + pull_loss = pull_loss * self.pull_loss_factor[idx] + + push_losses.append(push_loss) + pull_losses.append(pull_loss) + else: + push_losses.append(None) + pull_losses.append(None) + + return heatmaps_losses, push_losses, pull_losses diff --git a/mmpose/models/losses/regression_loss.py b/mmpose/models/losses/regression_loss.py new file mode 100644 index 0000000..db41783 --- /dev/null +++ b/mmpose/models/losses/regression_loss.py @@ -0,0 +1,448 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import LOSSES + + +@LOSSES.register_module() +class SmoothL1Loss(nn.Module): + """SmoothL1Loss loss. + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.criterion = F.smooth_l1_loss + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N, K, D]): + Weights across different joint types. + """ + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output * target_weight, + target * target_weight) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class WingLoss(nn.Module): + """Wing Loss. paper ref: 'Wing Loss for Robust Facial Landmark Localisation + with Convolutional Neural Networks' Feng et al. CVPR'2018. + + Args: + omega (float): Also referred to as width. + epsilon (float): Also referred to as curvature. + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, + omega=10.0, + epsilon=2.0, + use_target_weight=False, + loss_weight=1.): + super().__init__() + self.omega = omega + self.epsilon = epsilon + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + # constant that smoothly links the piecewise-defined linear + # and nonlinear parts + self.C = self.omega * (1.0 - math.log(1.0 + self.omega / self.epsilon)) + + def criterion(self, pred, target): + """Criterion of wingloss. 
+ + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + pred (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + """ + delta = (target - pred).abs() + losses = torch.where( + delta < self.omega, + self.omega * torch.log(1.0 + delta / self.epsilon), delta - self.C) + return torch.mean(torch.sum(losses, dim=[1, 2]), dim=0) + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N,K,D]): + Weights across different joint types. + """ + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output * target_weight, + target * target_weight) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class SoftWingLoss(nn.Module): + """Soft Wing Loss 'Structure-Coherent Deep Feature Learning for Robust Face + Alignment' Lin et al. TIP'2021. + + loss = + 1. |x| , if |x| < omega1 + 2. omega2*ln(1+|x|/epsilon) + B, if |x| >= omega1 + + Args: + omega1 (float): The first threshold. + omega2 (float): The second threshold. + epsilon (float): Also referred to as curvature. + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, + omega1=2.0, + omega2=20.0, + epsilon=0.5, + use_target_weight=False, + loss_weight=1.): + super().__init__() + self.omega1 = omega1 + self.omega2 = omega2 + self.epsilon = epsilon + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + # constant that smoothly links the piecewise-defined linear + # and nonlinear parts + self.B = self.omega1 - self.omega2 * math.log(1.0 + self.omega1 / + self.epsilon) + + def criterion(self, pred, target): + """Criterion of wingloss. + + Note: + batch_size: N + num_keypoints: K + dimension of keypoints: D (D=2 or D=3) + + Args: + pred (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + """ + delta = (target - pred).abs() + losses = torch.where( + delta < self.omega1, delta, + self.omega2 * torch.log(1.0 + delta / self.epsilon) + self.B) + return torch.mean(torch.sum(losses, dim=[1, 2]), dim=0) + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + batch_size: N + num_keypoints: K + dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N, K, D]): + Weights across different joint types. + """ + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output * target_weight, + target * target_weight) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class MPJPELoss(nn.Module): + """MPJPE (Mean Per Joint Position Error) loss. + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. 
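+
+    Example (a minimal usage sketch; shapes are chosen for illustration
+    only)::
+
+        >>> import torch
+        >>> loss = MPJPELoss()
+        >>> output = torch.rand(1, 17, 3)
+        >>> target = torch.rand(1, 17, 3)
+        >>> value = loss(output, target)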
+ """ + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N,K,D]): + Weights across different joint types. + """ + + if self.use_target_weight: + assert target_weight is not None + loss = torch.mean( + torch.norm((output - target) * target_weight, dim=-1)) + else: + loss = torch.mean(torch.norm(output - target, dim=-1)) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class L1Loss(nn.Module): + """L1Loss loss .""" + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.criterion = F.l1_loss + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + output (torch.Tensor[N, K, 2]): Output regression. + target (torch.Tensor[N, K, 2]): Target regression. + target_weight (torch.Tensor[N, K, 2]): + Weights across different joint types. + """ + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output * target_weight, + target * target_weight) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class MSELoss(nn.Module): + """MSE loss for coordinate regression.""" + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.criterion = F.mse_loss + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + output (torch.Tensor[N, K, 2]): Output regression. + target (torch.Tensor[N, K, 2]): Target regression. + target_weight (torch.Tensor[N, K, 2]): + Weights across different joint types. + """ + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output * target_weight, + target * target_weight) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class BoneLoss(nn.Module): + """Bone length loss. + + Args: + joint_parents (list): Indices of each joint's parent joint. + use_target_weight (bool): Option to use weighted bone loss. + Different bone types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, joint_parents, use_target_weight=False, loss_weight=1.): + super().__init__() + self.joint_parents = joint_parents + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + self.non_root_indices = [] + for i in range(len(self.joint_parents)): + if i != self.joint_parents[i]: + self.non_root_indices.append(i) + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N, K-1]): + Weights across different bone types. 
+ """ + output_bone = torch.norm( + output - output[:, self.joint_parents, :], + dim=-1)[:, self.non_root_indices] + target_bone = torch.norm( + target - target[:, self.joint_parents, :], + dim=-1)[:, self.non_root_indices] + if self.use_target_weight: + assert target_weight is not None + loss = torch.mean( + torch.abs((output_bone * target_weight).mean(dim=0) - + (target_bone * target_weight).mean(dim=0))) + else: + loss = torch.mean( + torch.abs(output_bone.mean(dim=0) - target_bone.mean(dim=0))) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class SemiSupervisionLoss(nn.Module): + """Semi-supervision loss for unlabeled data. It is composed of projection + loss and bone loss. + + Paper ref: `3D human pose estimation in video with temporal convolutions + and semi-supervised training` Dario Pavllo et al. CVPR'2019. + + Args: + joint_parents (list): Indices of each joint's parent joint. + projection_loss_weight (float): Weight for projection loss. + bone_loss_weight (float): Weight for bone loss. + warmup_iterations (int): Number of warmup iterations. In the first + `warmup_iterations` iterations, the model is trained only on + labeled data, and semi-supervision loss will be 0. + This is a workaround since currently we cannot access + epoch number in loss functions. Note that the iteration number in + an epoch can be changed due to different GPU numbers in multi-GPU + settings. So please set this parameter carefully. + warmup_iterations = dataset_size // samples_per_gpu // gpu_num + * warmup_epochs + """ + + def __init__(self, + joint_parents, + projection_loss_weight=1., + bone_loss_weight=1., + warmup_iterations=0): + super().__init__() + self.criterion_projection = MPJPELoss( + loss_weight=projection_loss_weight) + self.criterion_bone = BoneLoss( + joint_parents, loss_weight=bone_loss_weight) + self.warmup_iterations = warmup_iterations + self.num_iterations = 0 + + @staticmethod + def project_joints(x, intrinsics): + """Project 3D joint coordinates to 2D image plane using camera + intrinsic parameters. + + Args: + x (torch.Tensor[N, K, 3]): 3D joint coordinates. + intrinsics (torch.Tensor[N, 4] | torch.Tensor[N, 9]): Camera + intrinsics: f (2), c (2), k (3), p (2). 
+ """ + while intrinsics.dim() < x.dim(): + intrinsics.unsqueeze_(1) + f = intrinsics[..., :2] + c = intrinsics[..., 2:4] + _x = torch.clamp(x[:, :, :2] / x[:, :, 2:], -1, 1) + if intrinsics.shape[-1] == 9: + k = intrinsics[..., 4:7] + p = intrinsics[..., 7:9] + + r2 = torch.sum(_x[:, :, :2]**2, dim=-1, keepdim=True) + radial = 1 + torch.sum( + k * torch.cat((r2, r2**2, r2**3), dim=-1), + dim=-1, + keepdim=True) + tan = torch.sum(p * _x, dim=-1, keepdim=True) + _x = _x * (radial + tan) + p * r2 + _x = f * _x + c + return _x + + def forward(self, output, target): + losses = dict() + + self.num_iterations += 1 + if self.num_iterations <= self.warmup_iterations: + return losses + + labeled_pose = output['labeled_pose'] + unlabeled_pose = output['unlabeled_pose'] + unlabeled_traj = output['unlabeled_traj'] + unlabeled_target_2d = target['unlabeled_target_2d'] + intrinsics = target['intrinsics'] + + # projection loss + unlabeled_output = unlabeled_pose + unlabeled_traj + unlabeled_output_2d = self.project_joints(unlabeled_output, intrinsics) + loss_proj = self.criterion_projection(unlabeled_output_2d, + unlabeled_target_2d, None) + losses['proj_loss'] = loss_proj + + # bone loss + loss_bone = self.criterion_bone(unlabeled_pose, labeled_pose, None) + losses['bone_loss'] = loss_bone + + return losses diff --git a/mmpose/models/misc/__init__.py b/mmpose/models/misc/__init__.py new file mode 100644 index 0000000..ef101fe --- /dev/null +++ b/mmpose/models/misc/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/mmpose/models/misc/discriminator.py b/mmpose/models/misc/discriminator.py new file mode 100644 index 0000000..712f0a8 --- /dev/null +++ b/mmpose/models/misc/discriminator.py @@ -0,0 +1,307 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/akanazawa/hmr +# Original licence: Copyright (c) 2018 akanazawa, under the MIT License. +# ------------------------------------------------------------------------------ + +from abc import abstractmethod + +import torch +import torch.nn as nn +from mmcv.cnn import normal_init, xavier_init + +from mmpose.models.utils.geometry import batch_rodrigues + + +class BaseDiscriminator(nn.Module): + """Base linear module for SMPL parameter discriminator. 
+ + Args: + fc_layers (Tuple): Tuple of neuron count, + such as (9, 32, 32, 1) + use_dropout (Tuple): Tuple of bool define use dropout or not + for each layer, such as (True, True, False) + drop_prob (Tuple): Tuple of float defined the drop prob, + such as (0.5, 0.5, 0) + use_activation(Tuple): Tuple of bool define use active function + or not, such as (True, True, False) + """ + + def __init__(self, fc_layers, use_dropout, drop_prob, use_activation): + super().__init__() + self.fc_layers = fc_layers + self.use_dropout = use_dropout + self.drop_prob = drop_prob + self.use_activation = use_activation + self._check() + self.create_layers() + + def _check(self): + """Check input to avoid ValueError.""" + if not isinstance(self.fc_layers, tuple): + raise TypeError(f'fc_layers require tuple, ' + f'get {type(self.fc_layers)}') + + if not isinstance(self.use_dropout, tuple): + raise TypeError(f'use_dropout require tuple, ' + f'get {type(self.use_dropout)}') + + if not isinstance(self.drop_prob, tuple): + raise TypeError(f'drop_prob require tuple, ' + f'get {type(self.drop_prob)}') + + if not isinstance(self.use_activation, tuple): + raise TypeError(f'use_activation require tuple, ' + f'get {type(self.use_activation)}') + + l_fc_layer = len(self.fc_layers) + l_use_drop = len(self.use_dropout) + l_drop_prob = len(self.drop_prob) + l_use_activation = len(self.use_activation) + + pass_check = ( + l_fc_layer >= 2 and l_use_drop < l_fc_layer + and l_drop_prob < l_fc_layer and l_use_activation < l_fc_layer + and l_drop_prob == l_use_drop) + + if not pass_check: + msg = 'Wrong BaseDiscriminator parameters!' + raise ValueError(msg) + + def create_layers(self): + """Create layers.""" + l_fc_layer = len(self.fc_layers) + l_use_drop = len(self.use_dropout) + l_use_activation = len(self.use_activation) + + self.fc_blocks = nn.Sequential() + + for i in range(l_fc_layer - 1): + self.fc_blocks.add_module( + name=f'regressor_fc_{i}', + module=nn.Linear( + in_features=self.fc_layers[i], + out_features=self.fc_layers[i + 1])) + + if i < l_use_activation and self.use_activation[i]: + self.fc_blocks.add_module( + name=f'regressor_af_{i}', module=nn.ReLU()) + + if i < l_use_drop and self.use_dropout[i]: + self.fc_blocks.add_module( + name=f'regressor_fc_dropout_{i}', + module=nn.Dropout(p=self.drop_prob[i])) + + @abstractmethod + def forward(self, inputs): + """Forward function.""" + msg = 'the base class [BaseDiscriminator] is not callable!' 
+ raise NotImplementedError(msg) + + def init_weights(self): + """Initialize model weights.""" + for m in self.fc_blocks.named_modules(): + if isinstance(m, nn.Linear): + xavier_init(m, gain=0.01) + + +class ShapeDiscriminator(BaseDiscriminator): + """Discriminator for SMPL shape parameters, the inputs is (batch_size x 10) + + Args: + fc_layers (Tuple): Tuple of neuron count, such as (10, 5, 1) + use_dropout (Tuple): Tuple of bool define use dropout or + not for each layer, such as (True, True, False) + drop_prob (Tuple): Tuple of float defined the drop prob, + such as (0.5, 0) + use_activation(Tuple): Tuple of bool define use active + function or not, such as (True, False) + """ + + def __init__(self, fc_layers, use_dropout, drop_prob, use_activation): + if fc_layers[-1] != 1: + msg = f'the neuron count of the last layer ' \ + f'must be 1, but got {fc_layers[-1]}' + raise ValueError(msg) + + super().__init__(fc_layers, use_dropout, drop_prob, use_activation) + + def forward(self, inputs): + """Forward function.""" + return self.fc_blocks(inputs) + + +class PoseDiscriminator(nn.Module): + """Discriminator for SMPL pose parameters of each joint. It is composed of + discriminators for each joints. The inputs is (batch_size x joint_count x + 9) + + Args: + channels (Tuple): Tuple of channel number, + such as (9, 32, 32, 1) + joint_count (int): Joint number, such as 23 + """ + + def __init__(self, channels, joint_count): + super().__init__() + if channels[-1] != 1: + msg = f'the neuron count of the last layer ' \ + f'must be 1, but got {channels[-1]}' + raise ValueError(msg) + self.joint_count = joint_count + + self.conv_blocks = nn.Sequential() + len_channels = len(channels) + for idx in range(len_channels - 2): + self.conv_blocks.add_module( + name=f'conv_{idx}', + module=nn.Conv2d( + in_channels=channels[idx], + out_channels=channels[idx + 1], + kernel_size=1, + stride=1)) + + self.fc_layer = nn.ModuleList() + for idx in range(joint_count): + self.fc_layer.append( + nn.Linear( + in_features=channels[len_channels - 2], out_features=1)) + + def forward(self, inputs): + """Forward function. + + The input is (batch_size x joint_count x 9). + """ + # shape: batch_size x 9 x 1 x joint_count + inputs = inputs.transpose(1, 2).unsqueeze(2).contiguous() + # shape: batch_size x c x 1 x joint_count + internal_outputs = self.conv_blocks(inputs) + outputs = [] + for idx in range(self.joint_count): + outputs.append(self.fc_layer[idx](internal_outputs[:, :, 0, idx])) + + return torch.cat(outputs, 1), internal_outputs + + def init_weights(self): + """Initialize model weights.""" + for m in self.conv_blocks: + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) + for m in self.fc_layer.named_modules(): + if isinstance(m, nn.Linear): + xavier_init(m, gain=0.01) + + +class FullPoseDiscriminator(BaseDiscriminator): + """Discriminator for SMPL pose parameters of all joints. 
+ + Args: + fc_layers (Tuple): Tuple of neuron count, + such as (736, 1024, 1024, 1) + use_dropout (Tuple): Tuple of bool define use dropout or not + for each layer, such as (True, True, False) + drop_prob (Tuple): Tuple of float defined the drop prob, + such as (0.5, 0.5, 0) + use_activation(Tuple): Tuple of bool define use active + function or not, such as (True, True, False) + """ + + def __init__(self, fc_layers, use_dropout, drop_prob, use_activation): + if fc_layers[-1] != 1: + msg = f'the neuron count of the last layer must be 1,' \ + f' but got {fc_layers[-1]}' + raise ValueError(msg) + + super().__init__(fc_layers, use_dropout, drop_prob, use_activation) + + def forward(self, inputs): + """Forward function.""" + return self.fc_blocks(inputs) + + +class SMPLDiscriminator(nn.Module): + """Discriminator for SMPL pose and shape parameters. It is composed of a + discriminator for SMPL shape parameters, a discriminator for SMPL pose + parameters of all joints and a discriminator for SMPL pose parameters of + each joint. + + Args: + beta_channel (tuple of int): Tuple of neuron count of the + discriminator of shape parameters. Defaults to (10, 5, 1) + per_joint_channel (tuple of int): Tuple of neuron count of the + discriminator of each joint. Defaults to (9, 32, 32, 1) + full_pose_channel (tuple of int): Tuple of neuron count of the + discriminator of full pose. Defaults to (23*32, 1024, 1024, 1) + """ + + def __init__(self, + beta_channel=(10, 5, 1), + per_joint_channel=(9, 32, 32, 1), + full_pose_channel=(23 * 32, 1024, 1024, 1)): + super().__init__() + self.joint_count = 23 + # The count of SMPL shape parameter is 10. + assert beta_channel[0] == 10 + # Use 3 x 3 rotation matrix as the pose parameters + # of each joint, so the input channel is 9. + assert per_joint_channel[0] == 9 + assert self.joint_count * per_joint_channel[-2] \ + == full_pose_channel[0] + + self.beta_channel = beta_channel + self.per_joint_channel = per_joint_channel + self.full_pose_channel = full_pose_channel + self._create_sub_modules() + + def _create_sub_modules(self): + """Create sub discriminators.""" + + # create theta discriminator for each joint + self.pose_discriminator = PoseDiscriminator(self.per_joint_channel, + self.joint_count) + + # create full pose discriminator for total joints + fc_layers = self.full_pose_channel + use_dropout = tuple([False] * (len(fc_layers) - 1)) + drop_prob = tuple([0.5] * (len(fc_layers) - 1)) + use_activation = tuple([True] * (len(fc_layers) - 2) + [False]) + + self.full_pose_discriminator = FullPoseDiscriminator( + fc_layers, use_dropout, drop_prob, use_activation) + + # create shape discriminator for betas + fc_layers = self.beta_channel + use_dropout = tuple([False] * (len(fc_layers) - 1)) + drop_prob = tuple([0.5] * (len(fc_layers) - 1)) + use_activation = tuple([True] * (len(fc_layers) - 2) + [False]) + self.shape_discriminator = ShapeDiscriminator(fc_layers, use_dropout, + drop_prob, + use_activation) + + def forward(self, thetas): + """Forward function.""" + _, poses, shapes = thetas + + batch_size = poses.shape[0] + shape_disc_value = self.shape_discriminator(shapes) + + # The first rotation matrix is global rotation + # and is NOT used in discriminator. 
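+        # Shape sketch: the 24 per-joint rotations (axis-angle [B, 24 * 3]
+        # or rotation matrices) become [B, 24, 9]; dropping the global
+        # rotation leaves [B, 23, 9], matching per_joint_channel[0] == 9 of
+        # the per-joint pose discriminator.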
+ if poses.dim() == 2: + rotate_matrixs = \ + batch_rodrigues(poses.contiguous().view(-1, 3) + ).view(batch_size, 24, 9)[:, 1:, :] + else: + rotate_matrixs = poses.contiguous().view(batch_size, 24, + 9)[:, 1:, :].contiguous() + pose_disc_value, pose_inter_disc_value \ + = self.pose_discriminator(rotate_matrixs) + full_pose_disc_value = self.full_pose_discriminator( + pose_inter_disc_value.contiguous().view(batch_size, -1)) + return torch.cat( + (pose_disc_value, full_pose_disc_value, shape_disc_value), 1) + + def init_weights(self): + """Initialize model weights.""" + self.full_pose_discriminator.init_weights() + self.pose_discriminator.init_weights() + self.shape_discriminator.init_weights() diff --git a/mmpose/models/necks/__init__.py b/mmpose/models/necks/__init__.py new file mode 100644 index 0000000..0d3a5cc --- /dev/null +++ b/mmpose/models/necks/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .gap_neck import GlobalAveragePooling +from .posewarper_neck import PoseWarperNeck + +__all__ = ['GlobalAveragePooling', 'PoseWarperNeck'] diff --git a/mmpose/models/necks/gap_neck.py b/mmpose/models/necks/gap_neck.py new file mode 100644 index 0000000..5e6ad68 --- /dev/null +++ b/mmpose/models/necks/gap_neck.py @@ -0,0 +1,37 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from ..builder import NECKS + + +@NECKS.register_module() +class GlobalAveragePooling(nn.Module): + """Global Average Pooling neck. + + Note that we use `view` to remove extra channel after pooling. We do not + use `squeeze` as it will also remove the batch dimension when the tensor + has a batch dimension of size 1, which can lead to unexpected errors. + """ + + def __init__(self): + super().__init__() + self.gap = nn.AdaptiveAvgPool2d((1, 1)) + + def init_weights(self): + pass + + def forward(self, inputs): + if isinstance(inputs, tuple): + outs = tuple([self.gap(x) for x in inputs]) + outs = tuple( + [out.view(x.size(0), -1) for out, x in zip(outs, inputs)]) + elif isinstance(inputs, list): + outs = [self.gap(x) for x in inputs] + outs = [out.view(x.size(0), -1) for out, x in zip(outs, inputs)] + elif isinstance(inputs, torch.Tensor): + outs = self.gap(inputs) + outs = outs.view(inputs.size(0), -1) + else: + raise TypeError('neck inputs should be tuple or torch.tensor') + return outs diff --git a/mmpose/models/necks/posewarper_neck.py b/mmpose/models/necks/posewarper_neck.py new file mode 100644 index 0000000..dd4ddfb --- /dev/null +++ b/mmpose/models/necks/posewarper_neck.py @@ -0,0 +1,329 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import torch +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init, + normal_init) +from mmcv.utils import digit_version +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.models.utils.ops import resize +from ..backbones.resnet import BasicBlock, Bottleneck +from ..builder import NECKS + +try: + from mmcv.ops import DeformConv2d + has_mmcv_full = True +except (ImportError, ModuleNotFoundError): + has_mmcv_full = False + + +@NECKS.register_module() +class PoseWarperNeck(nn.Module): + """PoseWarper neck. + + `"Learning temporal pose estimation from sparsely-labeled videos" + `_. 
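+
+    In brief, the neck computes the feature difference between the current
+    frame and each supporting frame, predicts deformable-convolution offsets
+    from that difference at several dilation rates, warps the supporting
+    features accordingly, and fuses the warped heatmaps using the given
+    frame weights (see `forward`).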
+ + Args: + in_channels (int): Number of input channels from backbone + out_channels (int): Number of output channels + inner_channels (int): Number of intermediate channels of the res block + deform_groups (int): Number of groups in the deformable conv + dilations (list|tuple): different dilations of the offset conv layers + trans_conv_kernel (int): the kernel of the trans conv layer, which is + used to get heatmap from the output of backbone. Default: 1 + res_blocks_cfg (dict|None): config of residual blocks. If None, + use the default values. If not None, it should contain the + following keys: + + - block (str): the type of residual block, Default: 'BASIC'. + - num_blocks (int): the number of blocks, Default: 20. + + offsets_kernel (int): the kernel of offset conv layer. + deform_conv_kernel (int): the kernel of defomrable conv layer. + in_index (int|Sequence[int]): Input feature index. Default: 0 + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + Default: None. + + - 'resize_concat': Multiple feature maps will be resize to \ + the same size as first one and than concat together. \ + Usually used in FCN head of HRNet. + - 'multiple_select': Multiple feature maps will be bundle into \ + a list and passed into decode head. + - None: Only one select feature map is allowed. + + freeze_trans_layer (bool): Whether to freeze the transition layer + (stop grad and set eval mode). Default: True. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + im2col_step (int): the argument `im2col_step` in deformable conv, + Default: 80. + """ + blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} + minimum_mmcv_version = '1.3.17' + + def __init__(self, + in_channels, + out_channels, + inner_channels, + deform_groups=17, + dilations=(3, 6, 12, 18, 24), + trans_conv_kernel=1, + res_blocks_cfg=None, + offsets_kernel=3, + deform_conv_kernel=3, + in_index=0, + input_transform=None, + freeze_trans_layer=True, + norm_eval=False, + im2col_step=80): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.inner_channels = inner_channels + self.deform_groups = deform_groups + self.dilations = dilations + self.trans_conv_kernel = trans_conv_kernel + self.res_blocks_cfg = res_blocks_cfg + self.offsets_kernel = offsets_kernel + self.deform_conv_kernel = deform_conv_kernel + self.in_index = in_index + self.input_transform = input_transform + self.freeze_trans_layer = freeze_trans_layer + self.norm_eval = norm_eval + self.im2col_step = im2col_step + + identity_trans_layer = False + + assert trans_conv_kernel in [0, 1, 3] + kernel_size = trans_conv_kernel + if kernel_size == 3: + padding = 1 + elif kernel_size == 1: + padding = 0 + else: + # 0 for Identity mapping. 
+ identity_trans_layer = True + + if identity_trans_layer: + self.trans_layer = nn.Identity() + else: + self.trans_layer = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding) + + # build chain of residual blocks + if res_blocks_cfg is not None and not isinstance(res_blocks_cfg, dict): + raise TypeError('res_blocks_cfg should be dict or None.') + + if res_blocks_cfg is None: + block_type = 'BASIC' + num_blocks = 20 + else: + block_type = res_blocks_cfg.get('block', 'BASIC') + num_blocks = res_blocks_cfg.get('num_blocks', 20) + + block = self.blocks_dict[block_type] + + res_layers = [] + downsample = nn.Sequential( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=out_channels, + out_channels=inner_channels, + kernel_size=1, + stride=1, + bias=False), + build_norm_layer(dict(type='BN'), inner_channels)[1]) + res_layers.append( + block( + in_channels=out_channels, + out_channels=inner_channels, + downsample=downsample)) + + for _ in range(1, num_blocks): + res_layers.append(block(inner_channels, inner_channels)) + self.offset_feats = nn.Sequential(*res_layers) + + # build offset layers + self.num_offset_layers = len(dilations) + assert self.num_offset_layers > 0, 'Number of offset layers ' \ + 'should be larger than 0.' + + target_offset_channels = 2 * offsets_kernel**2 * deform_groups + + offset_layers = [ + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=inner_channels, + out_channels=target_offset_channels, + kernel_size=offsets_kernel, + stride=1, + dilation=dilations[i], + padding=dilations[i], + bias=False, + ) for i in range(self.num_offset_layers) + ] + self.offset_layers = nn.ModuleList(offset_layers) + + # build deformable conv layers + assert digit_version(mmcv.__version__) >= \ + digit_version(self.minimum_mmcv_version), \ + f'Current MMCV version: {mmcv.__version__}, ' \ + f'but MMCV >= {self.minimum_mmcv_version} is required, see ' \ + f'https://github.com/open-mmlab/mmcv/issues/1440, ' \ + f'Please install the latest MMCV.' 
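+
+        # Each offset layer above outputs 2 * offsets_kernel**2 * deform_groups
+        # channels (2 * 3 * 3 * 17 = 306 with the defaults), i.e. one (dy, dx)
+        # pair per kernel location and per deformable group, which is exactly
+        # the offset field consumed by the matching DeformConv2d below.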
+ + if has_mmcv_full: + deform_conv_layers = [ + DeformConv2d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=deform_conv_kernel, + stride=1, + padding=int(deform_conv_kernel / 2) * dilations[i], + dilation=dilations[i], + deform_groups=deform_groups, + im2col_step=self.im2col_step, + ) for i in range(self.num_offset_layers) + ] + else: + raise ImportError('Please install the full version of mmcv ' + 'to use `DeformConv2d`.') + + self.deform_conv_layers = nn.ModuleList(deform_conv_layers) + + self.freeze_layers() + + def freeze_layers(self): + if self.freeze_trans_layer: + self.trans_layer.eval() + + for param in self.trans_layer.parameters(): + param.requires_grad = False + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + elif isinstance(m, DeformConv2d): + filler = torch.zeros([ + m.weight.size(0), + m.weight.size(1), + m.weight.size(2), + m.weight.size(3) + ], + dtype=torch.float32, + device=m.weight.device) + for k in range(m.weight.size(0)): + filler[k, k, + int(m.weight.size(2) / 2), + int(m.weight.size(3) / 2)] = 1.0 + m.weight = torch.nn.Parameter(filler) + m.weight.requires_grad = True + + # posewarper offset layer weight initialization + for m in self.offset_layers.modules(): + constant_init(m, 0) + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + + Args: + inputs (list[Tensor] | Tensor): multi-level img features. + + Returns: + Tensor: The transformed inputs + """ + if not isinstance(inputs, list): + return inputs + + if self.input_transform == 'resize_concat': + inputs = [inputs[i] for i in self.in_index] + upsampled_inputs = [ + resize( + input=x, + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) for x in inputs + ] + inputs = torch.cat(upsampled_inputs, dim=1) + elif self.input_transform == 'multiple_select': + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index] + + return inputs + + def forward(self, inputs, frame_weight): + assert isinstance(inputs, (list, tuple)), 'PoseWarperNeck inputs ' \ + 'should be list or tuple, even though the length is 1, ' \ + 'for unified processing.' 
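+
+        # Two input layouts are handled below:
+        # 1) len(inputs) > 1: one feature tensor per frame; every frame is
+        #    warped towards the first frame in the list (the reference) and
+        #    the warped heatmaps are accumulated with `frame_weight`.
+        # 2) len(inputs) == 1: all frames are concatenated along the batch
+        #    dimension; the first `batch_size` samples form the reference
+        #    frame used for warping.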
+ + output_heatmap = 0 + if len(inputs) > 1: + inputs = [self._transform_inputs(input) for input in inputs] + inputs = [self.trans_layer(input) for input in inputs] + + # calculate difference features + diff_features = [ + self.offset_feats(inputs[0] - input) for input in inputs + ] + + for i in range(len(inputs)): + if frame_weight[i] == 0: + continue + warped_heatmap = 0 + for j in range(self.num_offset_layers): + offset = (self.offset_layers[j](diff_features[i])) + warped_heatmap_tmp = self.deform_conv_layers[j](inputs[i], + offset) + warped_heatmap += warped_heatmap_tmp / \ + self.num_offset_layers + + output_heatmap += warped_heatmap * frame_weight[i] + + else: + inputs = inputs[0] + inputs = self._transform_inputs(inputs) + inputs = self.trans_layer(inputs) + + num_frames = len(frame_weight) + batch_size = inputs.size(0) // num_frames + ref_x = inputs[:batch_size] + ref_x_tiled = ref_x.repeat(num_frames, 1, 1, 1) + + offset_features = self.offset_feats(ref_x_tiled - inputs) + + warped_heatmap = 0 + for j in range(self.num_offset_layers): + offset = self.offset_layers[j](offset_features) + + warped_heatmap_tmp = self.deform_conv_layers[j](inputs, offset) + warped_heatmap += warped_heatmap_tmp / self.num_offset_layers + + for i in range(num_frames): + if frame_weight[i] == 0: + continue + output_heatmap += warped_heatmap[i * batch_size:(i + 1) * + batch_size] * frame_weight[i] + + return output_heatmap + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self.freeze_layers() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/registry.py b/mmpose/models/registry.py new file mode 100644 index 0000000..f354ae9 --- /dev/null +++ b/mmpose/models/registry.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from .builder import BACKBONES, HEADS, LOSSES, NECKS, POSENETS + +__all__ = ['BACKBONES', 'HEADS', 'LOSSES', 'NECKS', 'POSENETS'] + +warnings.simplefilter('once', DeprecationWarning) +warnings.warn( + 'Registries (BACKBONES, NECKS, HEADS, LOSSES, POSENETS) have ' + 'been moved to mmpose.models.builder. Importing from ' + 'mmpose.models.registry will be deprecated in the future.', + DeprecationWarning) diff --git a/mmpose/models/utils/__init__.py b/mmpose/models/utils/__init__.py new file mode 100644 index 0000000..6871c66 --- /dev/null +++ b/mmpose/models/utils/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .smpl import SMPL + +__all__ = ['SMPL'] diff --git a/mmpose/models/utils/geometry.py b/mmpose/models/utils/geometry.py new file mode 100644 index 0000000..0ceadae --- /dev/null +++ b/mmpose/models/utils/geometry.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch.nn import functional as F + + +def rot6d_to_rotmat(x): + """Convert 6D rotation representation to 3x3 rotation matrix. + + Based on Zhou et al., "On the Continuity of Rotation + Representations in Neural Networks", CVPR 2019 + Input: + (B,6) Batch of 6-D rotation representations + Output: + (B,3,3) Batch of corresponding rotation matrices + """ + x = x.view(-1, 3, 2) + a1 = x[:, :, 0] + a2 = x[:, :, 1] + b1 = F.normalize(a1) + b2 = F.normalize(a2 - torch.einsum('bi,bi->b', b1, a2).unsqueeze(-1) * b1) + b3 = torch.cross(b1, b2) + return torch.stack((b1, b2, b3), dim=-1) + + +def batch_rodrigues(theta): + """Convert axis-angle representation to rotation matrix. 
+ Args: + theta: size = [B, 3] + Returns: + Rotation matrix corresponding to the quaternion + -- size = [B, 3, 3] + """ + l2norm = torch.norm(theta + 1e-8, p=2, dim=1) + angle = torch.unsqueeze(l2norm, -1) + normalized = torch.div(theta, angle) + angle = angle * 0.5 + v_cos = torch.cos(angle) + v_sin = torch.sin(angle) + quat = torch.cat([v_cos, v_sin * normalized], dim=1) + return quat_to_rotmat(quat) + + +def quat_to_rotmat(quat): + """Convert quaternion coefficients to rotation matrix. + Args: + quat: size = [B, 4] 4 <===>(w, x, y, z) + Returns: + Rotation matrix corresponding to the quaternion + -- size = [B, 3, 3] + """ + norm_quat = quat + norm_quat = norm_quat / norm_quat.norm(p=2, dim=1, keepdim=True) + w, x, y, z = norm_quat[:, 0], norm_quat[:, 1],\ + norm_quat[:, 2], norm_quat[:, 3] + + B = quat.size(0) + + w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2) + wx, wy, wz = w * x, w * y, w * z + xy, xz, yz = x * y, x * z, y * z + + rotMat = torch.stack([ + w2 + x2 - y2 - z2, 2 * xy - 2 * wz, 2 * wy + 2 * xz, 2 * wz + 2 * xy, + w2 - x2 + y2 - z2, 2 * yz - 2 * wx, 2 * xz - 2 * wy, 2 * wx + 2 * yz, + w2 - x2 - y2 + z2 + ], + dim=1).view(B, 3, 3) + return rotMat diff --git a/mmpose/models/utils/ops.py b/mmpose/models/utils/ops.py new file mode 100644 index 0000000..858d0a9 --- /dev/null +++ b/mmpose/models/utils/ops.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch +import torch.nn.functional as F + + +def resize(input, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None, + warning=True): + if warning: + if size is not None and align_corners: + input_h, input_w = tuple(int(x) for x in input.shape[2:]) + output_h, output_w = tuple(int(x) for x in size) + if output_h > input_h or output_w > output_h: + if ((output_h > 1 and output_w > 1 and input_h > 1 + and input_w > 1) and (output_h - 1) % (input_h - 1) + and (output_w - 1) % (input_w - 1)): + warnings.warn( + f'When align_corners={align_corners}, ' + 'the output would more aligned if ' + f'input size {(input_h, input_w)} is `x+1` and ' + f'out size {(output_h, output_w)} is `nx+1`') + if isinstance(size, torch.Size): + size = tuple(int(x) for x in size) + return F.interpolate(input, size, scale_factor, mode, align_corners) diff --git a/mmpose/models/utils/smpl.py b/mmpose/models/utils/smpl.py new file mode 100644 index 0000000..fe723d4 --- /dev/null +++ b/mmpose/models/utils/smpl.py @@ -0,0 +1,184 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +import torch.nn as nn + +from ..builder import MESH_MODELS + +try: + from smplx import SMPL as SMPL_ + has_smpl = True +except (ImportError, ModuleNotFoundError): + has_smpl = False + + +@MESH_MODELS.register_module() +class SMPL(nn.Module): + """SMPL 3d human mesh model of paper ref: Matthew Loper. ``SMPL: A skinned + multi-person linear model''. This module is based on the smplx project + (https://github.com/vchoutas/smplx). + + Args: + smpl_path (str): The path to the folder where the model weights are + stored. + joints_regressor (str): The path to the file where the joints + regressor weight are stored. + """ + + def __init__(self, smpl_path, joints_regressor): + super().__init__() + + assert has_smpl, 'Please install smplx to use SMPL.' 
+ + self.smpl_neutral = SMPL_( + model_path=smpl_path, + create_global_orient=False, + create_body_pose=False, + create_transl=False, + gender='neutral') + + self.smpl_male = SMPL_( + model_path=smpl_path, + create_betas=False, + create_global_orient=False, + create_body_pose=False, + create_transl=False, + gender='male') + + self.smpl_female = SMPL_( + model_path=smpl_path, + create_betas=False, + create_global_orient=False, + create_body_pose=False, + create_transl=False, + gender='female') + + joints_regressor = torch.tensor( + np.load(joints_regressor), dtype=torch.float)[None, ...] + self.register_buffer('joints_regressor', joints_regressor) + + self.num_verts = self.smpl_neutral.get_num_verts() + self.num_joints = self.joints_regressor.shape[1] + + def smpl_forward(self, model, **kwargs): + """Apply a specific SMPL model with given model parameters. + + Note: + B: batch size + V: number of vertices + K: number of joints + + Returns: + outputs (dict): Dict with mesh vertices and joints. + - vertices: Tensor([B, V, 3]), mesh vertices + - joints: Tensor([B, K, 3]), 3d joints regressed + from mesh vertices. + """ + + betas = kwargs['betas'] + batch_size = betas.shape[0] + device = betas.device + output = {} + if batch_size == 0: + output['vertices'] = betas.new_zeros([0, self.num_verts, 3]) + output['joints'] = betas.new_zeros([0, self.num_joints, 3]) + else: + smpl_out = model(**kwargs) + output['vertices'] = smpl_out.vertices + output['joints'] = torch.matmul( + self.joints_regressor.to(device), output['vertices']) + return output + + def get_faces(self): + """Return mesh faces. + + Note: + F: number of faces + + Returns: + faces: np.ndarray([F, 3]), mesh faces + """ + return self.smpl_neutral.faces + + def forward(self, + betas, + body_pose, + global_orient, + transl=None, + gender=None): + """Forward function. + + Note: + B: batch size + J: number of controllable joints of model, for smpl model J=23 + K: number of joints + + Args: + betas: Tensor([B, 10]), human body shape parameters of SMPL model. + body_pose: Tensor([B, J*3] or [B, J, 3, 3]), human body pose + parameters of SMPL model. It should be axis-angle vector + ([B, J*3]) or rotation matrix ([B, J, 3, 3)]. + global_orient: Tensor([B, 3] or [B, 1, 3, 3]), global orientation + of human body. It should be axis-angle vector ([B, 3]) or + rotation matrix ([B, 1, 3, 3)]. + transl: Tensor([B, 3]), global translation of human body. + gender: Tensor([B]), gender parameters of human body. -1 for + neutral, 0 for male , 1 for female. + + Returns: + outputs (dict): Dict with mesh vertices and joints. + - vertices: Tensor([B, V, 3]), mesh vertices + - joints: Tensor([B, K, 3]), 3d joints regressed from + mesh vertices. 
+ """ + + batch_size = betas.shape[0] + pose2rot = True if body_pose.dim() == 2 else False + if batch_size > 0 and gender is not None: + output = { + 'vertices': betas.new_zeros([batch_size, self.num_verts, 3]), + 'joints': betas.new_zeros([batch_size, self.num_joints, 3]) + } + + mask = gender < 0 + _out = self.smpl_forward( + self.smpl_neutral, + betas=betas[mask], + body_pose=body_pose[mask], + global_orient=global_orient[mask], + transl=transl[mask] if transl is not None else None, + pose2rot=pose2rot) + output['vertices'][mask] = _out['vertices'] + output['joints'][mask] = _out['joints'] + + mask = gender == 0 + _out = self.smpl_forward( + self.smpl_male, + betas=betas[mask], + body_pose=body_pose[mask], + global_orient=global_orient[mask], + transl=transl[mask] if transl is not None else None, + pose2rot=pose2rot) + output['vertices'][mask] = _out['vertices'] + output['joints'][mask] = _out['joints'] + + mask = gender == 1 + _out = self.smpl_forward( + self.smpl_male, + betas=betas[mask], + body_pose=body_pose[mask], + global_orient=global_orient[mask], + transl=transl[mask] if transl is not None else None, + pose2rot=pose2rot) + output['vertices'][mask] = _out['vertices'] + output['joints'][mask] = _out['joints'] + else: + return self.smpl_forward( + self.smpl_neutral, + betas=betas, + body_pose=body_pose, + global_orient=global_orient, + transl=transl, + pose2rot=pose2rot) + + return output diff --git a/mmpose/utils/__init__.py b/mmpose/utils/__init__.py new file mode 100644 index 0000000..1293ca0 --- /dev/null +++ b/mmpose/utils/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .collect_env import collect_env +from .logger import get_root_logger +from .setup_env import setup_multi_processes +from .timer import StopWatch + +__all__ = [ + 'get_root_logger', 'collect_env', 'StopWatch', 'setup_multi_processes' +] diff --git a/mmpose/utils/collect_env.py b/mmpose/utils/collect_env.py new file mode 100644 index 0000000..f75c5ea --- /dev/null +++ b/mmpose/utils/collect_env.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.utils import collect_env as collect_basic_env +from mmcv.utils import get_git_hash + +import mmpose + + +def collect_env(): + env_info = collect_basic_env() + env_info['MMPose'] = (mmpose.__version__ + '+' + get_git_hash(digits=7)) + return env_info + + +if __name__ == '__main__': + for name, val in collect_env().items(): + print(f'{name}: {val}') diff --git a/mmpose/utils/hooks.py b/mmpose/utils/hooks.py new file mode 100644 index 0000000..b68940f --- /dev/null +++ b/mmpose/utils/hooks.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import functools + + +class OutputHook: + + def __init__(self, module, outputs=None, as_tensor=False): + self.outputs = outputs + self.as_tensor = as_tensor + self.layer_outputs = {} + self.register(module) + + def register(self, module): + + def hook_wrapper(name): + + def hook(model, input, output): + if self.as_tensor: + self.layer_outputs[name] = output + else: + if isinstance(output, list): + self.layer_outputs[name] = [ + out.detach().cpu().numpy() for out in output + ] + else: + self.layer_outputs[name] = output.detach().cpu().numpy( + ) + + return hook + + self.handles = [] + if isinstance(self.outputs, (list, tuple)): + for name in self.outputs: + try: + layer = rgetattr(module, name) + h = layer.register_forward_hook(hook_wrapper(name)) + except ModuleNotFoundError as module_not_found: + raise ModuleNotFoundError( + f'Module {name} not found') from module_not_found + self.handles.append(h) + + def remove(self): + for h in self.handles: + h.remove() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.remove() + + +# using wonder's beautiful simplification: +# https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects +def rgetattr(obj, attr, *args): + + def _getattr(obj, attr): + return getattr(obj, attr, *args) + + return functools.reduce(_getattr, [obj] + attr.split('.')) diff --git a/mmpose/utils/logger.py b/mmpose/utils/logger.py new file mode 100644 index 0000000..294837f --- /dev/null +++ b/mmpose/utils/logger.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging + +from mmcv.utils import get_logger + + +def get_root_logger(log_file=None, log_level=logging.INFO): + """Use `get_logger` method in mmcv to get the root logger. + + The logger will be initialized if it has not been initialized. By default a + StreamHandler will be added. If `log_file` is specified, a FileHandler will + also be added. The name of the root logger is the top-level package name, + e.g., "mmpose". + + Args: + log_file (str | None): The log filename. If specified, a FileHandler + will be added to the root logger. + log_level (int): The root logger level. Note that only the process of + rank 0 is affected, while other processes will set the level to + "Error" and be silent most of the time. + + Returns: + logging.Logger: The root logger. + """ + return get_logger(__name__.split('.')[0], log_file, log_level) diff --git a/mmpose/utils/setup_env.py b/mmpose/utils/setup_env.py new file mode 100644 index 0000000..21def2f --- /dev/null +++ b/mmpose/utils/setup_env.py @@ -0,0 +1,47 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import platform +import warnings + +import cv2 +import torch.multiprocessing as mp + + +def setup_multi_processes(cfg): + """Setup multi-processing environment variables.""" + # set multi-process start method as `fork` to speed up the training + if platform.system() != 'Windows': + mp_start_method = cfg.get('mp_start_method', 'fork') + current_method = mp.get_start_method(allow_none=True) + if current_method is not None and current_method != mp_start_method: + warnings.warn( + f'Multi-processing start method `{mp_start_method}` is ' + f'different from the previous setting `{current_method}`.' + f'It will be force set to `{mp_start_method}`. 
You can change ' + f'this behavior by changing `mp_start_method` in your config.') + mp.set_start_method(mp_start_method, force=True) + + # disable opencv multithreading to avoid system being overloaded + opencv_num_threads = cfg.get('opencv_num_threads', 0) + cv2.setNumThreads(opencv_num_threads) + + # setup OMP threads + # This code is referred from https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py # noqa + if 'OMP_NUM_THREADS' not in os.environ and cfg.data.workers_per_gpu > 1: + omp_num_threads = 1 + warnings.warn( + f'Setting OMP_NUM_THREADS environment variable for each process ' + f'to be {omp_num_threads} in default, to avoid your system being ' + f'overloaded, please further tune the variable for optimal ' + f'performance in your application as needed.') + os.environ['OMP_NUM_THREADS'] = str(omp_num_threads) + + # setup MKL threads + if 'MKL_NUM_THREADS' not in os.environ and cfg.data.workers_per_gpu > 1: + mkl_num_threads = 1 + warnings.warn( + f'Setting MKL_NUM_THREADS environment variable for each process ' + f'to be {mkl_num_threads} in default, to avoid your system being ' + f'overloaded, please further tune the variable for optimal ' + f'performance in your application as needed.') + os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads) diff --git a/mmpose/utils/timer.py b/mmpose/utils/timer.py new file mode 100644 index 0000000..5a3185c --- /dev/null +++ b/mmpose/utils/timer.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import defaultdict +from contextlib import contextmanager +from functools import partial + +import numpy as np +from mmcv import Timer + + +class RunningAverage(): + r"""A helper class to calculate running average in a sliding window. + + Args: + window (int): The size of the sliding window. + """ + + def __init__(self, window: int = 1): + self.window = window + self._data = [] + + def update(self, value): + """Update a new data sample.""" + self._data.append(value) + self._data = self._data[-self.window:] + + def average(self): + """Get the average value of current window.""" + return np.mean(self._data) + + +class StopWatch: + r"""A helper class to measure FPS and detailed time consuming of each phase + in a video processing loop or similar scenarios. + + Args: + window (int): The sliding window size to calculate the running average + of the time consuming. + + Example: + >>> from mmpose.utils import StopWatch + >>> import time + >>> stop_watch = StopWatch(window=10) + >>> with stop_watch.timeit('total'): + >>> time.sleep(0.1) + >>> # 'timeit' support nested use + >>> with stop_watch.timeit('phase1'): + >>> time.sleep(0.1) + >>> with stop_watch.timeit('phase2'): + >>> time.sleep(0.2) + >>> time.sleep(0.2) + >>> report = stop_watch.report() + """ + + def __init__(self, window=1): + self.window = window + self._record = defaultdict(partial(RunningAverage, window=self.window)) + self._timer_stack = [] + + @contextmanager + def timeit(self, timer_name='_FPS_'): + """Timing a code snippet with an assigned name. + + Args: + timer_name (str): The unique name of the interested code snippet to + handle multiple timers and generate reports. Note that '_FPS_' + is a special key that the measurement will be in `fps` instead + of `millisecond`. Also see `report` and `report_strings`. + Default: '_FPS_'. + Note: + This function should always be used in a `with` statement, as shown + in the example. 
+ """ + self._timer_stack.append((timer_name, Timer())) + try: + yield + finally: + timer_name, timer = self._timer_stack.pop() + self._record[timer_name].update(timer.since_start()) + + def report(self, key=None): + """Report timing information. + + Returns: + dict: The key is the timer name and the value is the \ + corresponding average time consuming. + """ + result = { + name: r.average() * 1000. + for name, r in self._record.items() + } + + if '_FPS_' in result: + result['_FPS_'] = 1000. / result.pop('_FPS_') + + if key is None: + return result + return result[key] + + def report_strings(self): + """Report timing information in texture strings. + + Returns: + list(str): Each element is the information string of a timed \ + event, in format of '{timer_name}: {time_in_ms}'. \ + Specially, if timer_name is '_FPS_', the result will \ + be converted to fps. + """ + result = self.report() + strings = [] + if '_FPS_' in result: + strings.append(f'FPS: {result["_FPS_"]:>5.1f}') + strings += [f'{name}: {val:>3.0f}' for name, val in result.items()] + return strings + + def reset(self): + self._record = defaultdict(list) + self._active_timer_stack = [] diff --git a/mmpose/version.py b/mmpose/version.py new file mode 100644 index 0000000..1a10826 --- /dev/null +++ b/mmpose/version.py @@ -0,0 +1,19 @@ +# Copyright (c) Open-MMLab. All rights reserved. + +__version__ = '0.24.0' +short_version = __version__ + + +def parse_version_info(version_str): + version_info = [] + for x in version_str.split('.'): + if x.isdigit(): + version_info.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + version_info.append(int(patch_version[0])) + version_info.append(f'rc{patch_version[1]}') + return tuple(version_info) + + +version_info = parse_version_info(__version__) diff --git a/nets/nn.py b/nets/nn.py new file mode 100644 index 0000000..69c9c70 --- /dev/null +++ b/nets/nn.py @@ -0,0 +1,278 @@ +import copy +import math + +import numpy +import torch + + +def normalize(v): + mag = torch.sqrt(torch.sum(v.pow(2), dim=1, keepdim=True)) + eps = torch.FloatTensor([1E-8]).to(mag.device) + mag = torch.max(mag, eps) + return v / mag + + +def cross_product(u, v): + shape = u.shape + + i = u[:, 1] * v[:, 2] - u[:, 2] * v[:, 1] + j = u[:, 2] * v[:, 0] - u[:, 0] * v[:, 2] + k = u[:, 0] * v[:, 1] - u[:, 1] * v[:, 0] + + i = i.view(shape[0], 1) + j = j.view(shape[0], 1) + k = k.view(shape[0], 1) + + return torch.cat(tensors=(i, j, k), dim=1) + + +class Conv(torch.nn.Module): + def __init__(self, in_ch, out_ch, k=1, s=1, p=0): + super().__init__() + self.conv = torch.nn.Conv2d(in_ch, out_ch, k, s, p, bias=False) + self.norm = torch.nn.BatchNorm2d(out_ch) + + def forward(self, x): + return self.norm(self.conv(x)) + + +class Residual(torch.nn.Module): + def __init__(self, in_ch, out_ch, k, s, p): + super().__init__() + + assert k == 3 + assert p == 1 + self.in_channels = in_ch + + self.relu = torch.nn.ReLU() + self.conv = torch.nn.Identity() + + self.conv1 = Conv(in_ch, out_ch, k=k, s=s, p=p) + self.conv2 = Conv(in_ch, out_ch, k=1, s=s, p=p - k // 2) + self.identity = torch.nn.BatchNorm2d(in_ch) if in_ch == out_ch and s == 1 else None + + @staticmethod + def __pad(k): + if k is None: + return 0 + else: + return torch.nn.functional.pad(k, pad=[1, 1, 1, 1]) + + def __fuse_norm(self, m): + if m is None: + return 0, 0 + if isinstance(m, Conv): + kernel = m.conv.weight + running_mean = m.norm.running_mean + running_var = m.norm.running_var + gamma = m.norm.weight + beta = m.norm.bias + eps = m.norm.eps + else: + 
assert isinstance(m, torch.nn.BatchNorm2d) + if not hasattr(self, 'norm'): + in_channels = self.conv1.conv.in_channels + kernel_value = numpy.zeros((in_channels, in_channels, 3, 3), dtype=numpy.float32) + for i in range(in_channels): + kernel_value[i, i % in_channels, 1, 1] = 1 + self.norm = torch.from_numpy(kernel_value).to(m.weight.device) + kernel = self.norm + running_mean = m.running_mean + running_var = m.running_var + gamma = m.weight + beta = m.bias + eps = m.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def forward(self, x): + if self.identity is None: + return self.relu(self.conv1(x) + self.conv2(x)) + else: + return self.relu(self.conv1(x) + self.conv2(x) + self.identity(x)) + + def fuse_forward(self, x): + return self.relu(self.conv(x)) + + def fuse(self): + k1, b1 = self.__fuse_norm(self.conv1) + k2, b2 = self.__fuse_norm(self.conv2) + k3, b3 = self.__fuse_norm(self.identity) + + self.conv = torch.nn.Conv2d(in_channels=self.conv1.conv.in_channels, + out_channels=self.conv1.conv.out_channels, + kernel_size=self.conv1.conv.kernel_size, + stride=self.conv1.conv.stride, + padding=self.conv1.conv.padding, + dilation=self.conv1.conv.dilation, + groups=self.conv1.conv.groups, bias=True) + + self.conv.weight.data = k1 + self.__pad(k2) + k3 + self.conv.bias.data = b1 + b2 + b3 + + if hasattr(self, 'conv1'): + self.__delattr__('conv1') + if hasattr(self, 'conv2'): + self.__delattr__('conv2') + if hasattr(self, 'identity'): + self.__delattr__('identity') + if hasattr(self, 'norm'): + self.__delattr__('norm') + self.forward = self.fuse_forward + + +class SixDRepVGG(torch.nn.Module): + def __init__(self, width, depth, num_classes=6): + super().__init__() + + self.p1 = [] + self.p2 = [] + self.p3 = [] + self.p4 = [] + self.p5 = [] + + # p1 + self.p1.append(Residual(width[0], width[1], k=3, s=2, p=1)) + # p2 + for i in range(depth[0]): + if i == 0: + self.p2.append(Residual(width[1], width[2], k=3, s=2, p=1)) + else: + self.p2.append(Residual(width[2], width[2], k=3, s=1, p=1)) + # p3 + for i in range(depth[1]): + if i == 0: + self.p3.append(Residual(width[2], width[3], k=3, s=2, p=1)) + else: + self.p3.append(Residual(width[3], width[3], k=3, s=1, p=1)) + # p4 + for i in range(depth[2]): + if i == 0: + self.p4.append(Residual(width[3], width[4], k=3, s=2, p=1)) + else: + self.p4.append(Residual(width[4], width[4], k=3, s=1, p=1)) + # p5 + for i in range(depth[3]): + if i == 0: + self.p5.append(Residual(width[4], width[5], k=3, s=2, p=1)) + else: + self.p5.append(Residual(width[5], width[5], k=3, s=1, p=1)) + + self.p1 = torch.nn.Sequential(*self.p1) + self.p2 = torch.nn.Sequential(*self.p2) + self.p3 = torch.nn.Sequential(*self.p3) + self.p4 = torch.nn.Sequential(*self.p4) + self.p5 = torch.nn.Sequential(*self.p5) + self.fc = torch.nn.Sequential(torch.nn.AdaptiveAvgPool2d(1), + torch.nn.Flatten(), + torch.nn.Linear(width[5], num_classes)) + + def forward(self, x): + p1 = self.p1(x) + p2 = self.p2(p1) + p3 = self.p3(p2) + p4 = self.p4(p3) + p5 = self.p5(p4) + fc = self.fc(p5) + + x_raw = fc[:, 0:3] + y_raw = fc[:, 3:6] + + x = normalize(x_raw) + z = cross_product(x, y_raw) + z = normalize(z) + y = cross_product(z, x) + + x = x.view(-1, 3, 1) + y = y.view(-1, 3, 1) + z = z.view(-1, 3, 1) + return torch.cat(tensors=(x, y, z), dim=2) + + def fuse(self): + for m in self.modules(): + if type(m) is Residual: + m.fuse() + return self + + +def rep_net_a0(): + return SixDRepVGG(width=(3, 48, 48, 96, 192, 
1280), depth=(2, 4, 14, 1)) + + +def rep_net_a1(): + return SixDRepVGG(width=(3, 64, 64, 128, 256, 1280), depth=[2, 4, 14, 1]) + + +def rep_net_a2(): + return SixDRepVGG(width=[3, 64, 96, 192, 384, 1408], depth=[2, 4, 14, 1]) + + +def rep_net_b0(): + return SixDRepVGG(width=[3, 64, 64, 128, 256, 1280], depth=[4, 6, 16, 1]) + + +def rep_net_b1(): + return SixDRepVGG(width=[3, 64, 128, 256, 512, 2048], depth=[4, 6, 16, 1]) + + +def rep_net_b2(): + return SixDRepVGG(width=[3, 64, 160, 320, 640, 2560], depth=[4, 6, 16, 1]) + + +class EMA: + """ + Updated Exponential Moving Average (EMA) from https://github.com/rwightman/pytorch-image-models + Keeps a moving average of everything in the model state_dict (parameters and buffers) + For EMA details see https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage + """ + + def __init__(self, model, decay=0.9999, tau=2000, updates=0): + # Create EMA + self.ema = copy.deepcopy(model).eval() # FP32 EMA + self.updates = updates # number of EMA updates + # decay exponential ramp (to help early epochs) + self.decay = lambda x: decay * (1 - math.exp(-x / tau)) + for p in self.ema.parameters(): + p.requires_grad_(False) + + def update(self, model): + if hasattr(model, 'module'): + model = model.module + # Update EMA parameters + with torch.no_grad(): + self.updates += 1 + d = self.decay(self.updates) + + msd = model.state_dict() # model state_dict + for k, v in self.ema.state_dict().items(): + if v.dtype.is_floating_point: + v *= d + v += (1 - d) * msd[k].detach() + + +class CosineLR: + def __init__(self, args, optimizer): + self.min_lr = 1E-6 + self.epochs = args.epochs + self.learning_rates = [x['lr'] for x in optimizer.param_groups] + + def step(self, epoch, optimizer): + param_groups = optimizer.param_groups + for param_group, lr in zip(param_groups, self.learning_rates): + alpha = math.cos(math.pi * epoch / self.epochs) + lr = 0.5 * (lr - self.min_lr) * (1 + alpha) + param_group['lr'] = self.min_lr + lr + + +class ComputeLoss(torch.nn.Module): + def __init__(self): + super().__init__() + self.eps = 1E-7 + + def forward(self, outputs, targets): + m = torch.bmm(targets, outputs.transpose(1, 2)) + cos = (m[:, 0, 0] + m[:, 1, 1] + m[:, 2, 2] - 1) / 2 + theta = torch.acos(torch.clamp(cos, -1 + self.eps, 1 - self.eps)) + + return torch.mean(theta) diff --git a/rgbd_3d.py b/rgbd_3d.py new file mode 100755 index 0000000..0448846 --- /dev/null +++ b/rgbd_3d.py @@ -0,0 +1,765 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# mmdet and mmpose import +from mmpose.apis import (get_track_id, inference_top_down_pose_model, + init_pose_model, process_mmdet_results, + vis_pose_tracking_result) +from mmpose.datasets import DatasetInfo + +try: + from mmdet.apis import inference_detector, init_detector + has_mmdet = True +except (ImportError, ModuleNotFoundError): + has_mmdet = False + +# ros related import +import rospy +from sensor_msgs.msg import Image, PointCloud2 +from geometry_msgs.msg import Pose, Point +from cv_bridge import CvBridge + +# other import +import cv2 +import os +import matplotlib.pyplot as plt +from argparse import ArgumentParser +from datetime import datetime +import time +import json +import warnings +import numpy as np + +# utils import +from utils import * + +# motion bert import +import imageio +import torch +import torch.nn as nn +from torch.utils.data import DataLoader +from lib.utils.tools import * +from lib.utils.learning import * +from lib.utils.utils_data import flip_data +from lib.data.dataset_wild import 
WildDetDataset +from lib.utils.vismo import render_and_save + +import copy + +# remove numpy scientific notation +np.set_printoptions(suppress=True) + + +prSuccess("Everything imported !") + + +def crop_scale(motion, scale_range=[1, 1]): + ''' + For input of MotionBERT + Motion: [(M), T, 17, 3]. + Normalize to [-1, 1] + ''' + result = copy.deepcopy(motion) + valid_coords = motion[motion[..., 2]!=0][:,:2] + if len(valid_coords) < 4: + return np.zeros(motion.shape) + xmin = min(valid_coords[:,0]) + xmax = max(valid_coords[:,0]) + ymin = min(valid_coords[:,1]) + ymax = max(valid_coords[:,1]) + ratio = np.random.uniform(low=scale_range[0], high=scale_range[1], size=1)[0] + scale = max(xmax-xmin, ymax-ymin) * ratio + if scale==0: + return np.zeros(motion.shape) + xs = (xmin+xmax-scale) / 2 + ys = (ymin+ymax-scale) / 2 + result[...,:2] = (motion[..., :2]- [xs,ys]) / scale + result[...,:2] = (result[..., :2] - 0.5) * 2 + result = np.clip(result, -1, 1) + return result + + +def coco2h36m(x): + ''' + Input: x ((M )x T x V x C) + + COCO: {0-nose 1-Leye 2-Reye 3-Lear 4Rear 5-Lsho 6-Rsho 7-Lelb 8-Relb 9-Lwri 10-Rwri 11-Lhip 12-Rhip 13-Lkne 14-Rkne 15-Lank 16-Rank} + + H36M: + 0: 'root', + 1: 'rhip', + 2: 'rkne', + 3: 'rank', + 4: 'lhip', + 5: 'lkne', + 6: 'lank', + 7: 'belly', + 8: 'neck', + 9: 'nose', + 10: 'head', + 11: 'lsho', + 12: 'lelb', + 13: 'lwri', + 14: 'rsho', + 15: 'relb', + 16: 'rwri' + ''' + y = np.zeros(x.shape) + y[:,0,:] = (x[:,11,:] + x[:,12,:]) * 0.5 + y[:,1,:] = x[:,12,:] + y[:,2,:] = x[:,14,:] + y[:,3,:] = x[:,16,:] + y[:,4,:] = x[:,11,:] + y[:,5,:] = x[:,13,:] + y[:,6,:] = x[:,15,:] + y[:,8,:] = (x[:,5,:] + x[:,6,:]) * 0.5 + y[:,7,:] = (y[:,0,:] + y[:,8,:]) * 0.5 + y[:,9,:] = x[:,0,:] + y[:,10,:] = (x[:,1,:] + x[:,2,:]) * 0.5 + y[:,11,:] = x[:,5,:] + y[:,12,:] = x[:,7,:] + y[:,13,:] = x[:,9,:] + y[:,14,:] = x[:,6,:] + y[:,15,:] = x[:,8,:] + y[:,16,:] = x[:,10,:] + return y + + +class InferenceNodeRGBD(object): + def __init__(self, args): + + # init args + self.args = args + + # init detector and pose + prInfo('Initialiazing detector {}'.format(args.mb_checkpoint)) + self.det_model = init_detector( + args.det_config, args.det_checkpoint, device=args.device.lower()) + + prInfo('Initialiazing 2D Pose model {}'.format(args.mb_checkpoint)) + self.pose_model = init_pose_model( + args.pose_config, args.pose_checkpoint, device=args.device.lower()) + + # init 3d MotionBERT model + prInfo('Initialiazing 3D Pose Lifter {}'.format(args.mb_checkpoint)) + mb_3d_args = get_config(args.mb_3d_config) + self.motionbert_3d_model = load_backbone(mb_3d_args) + if torch.cuda.is_available(): + self.motionbert_3d_model = nn.DataParallel(self.motionbert_3d_model) + self.motionbert_3d_model = self.motionbert_3d_model.cuda() + else: + prWarning("Expect cuda to be available but is_available returned false") + exit(0) + + prInfo('Loading checkpoint {}'.format(args.mb_checkpoint)) + mb_checkpoint = torch.load(args.mb_checkpoint, map_location=lambda storage, loc: storage) + self.motionbert_3d_model.load_state_dict(mb_checkpoint['model_pos'], strict=True) + self.motionbert_3d_model.eval() + prInfo('Loaded motionbert_3d_model') + # no need for the whole WildDetDataset stuff, just manually make the input trajectories for the tracks + + # dataset params for detector and pose + self.dataset = self.pose_model.cfg.data['test']['type'] + self.dataset_info = self.pose_model.cfg.data['test'].get('self.dataset_info', None) + if self.dataset_info is None: + warnings.warn( + 'Please set `self.dataset_info` in the config.' 
+ 'Check https://github.com/open-mmlab/mmpose/pull/663 for details.', + DeprecationWarning) + else: + self.dataset_info = DatasetInfo(self.dataset_info) + + self.return_heatmap = False + + self.next_id = 0 + self.pose_results = [] + self.count_frames = 0 + self.tracks_in_current_image = {} + + ## Init for node and save path + + self.rgb = None # Image frame + self.depth = None # Image frame + + self.pcl_array_rgb = None + self.pcl_array_xyz = None + + self.depth_array_max_threshold = 20000 #3000 # does not apply when saving depth mono16 image + + # viewing options + self.depth_cmap = get_mpl_colormap(args.depth_cmap) + self.confidence_cmap = get_mpl_colormap("viridis") + self.vis_img = None # output image RGB + detections + self.view_all_classes_dets = True + self.display_all_detection = args.display_all_detection + self.light_display = args.light_display + + self.pcl_current_seq = -1 + self.rgb_current_seq = -1 + self.last_inferred_seq = -1 + self.depth_current_seq = -1 + self.current_image_count = 0 + + self.br = CvBridge() + + prInfo("Setting node rate to {} fps".format(args.fps)) + self.loop_rate = rospy.Rate(args.fps) + + # make the output path + now = datetime.now() + timestamp = now.strftime("%Y_%m_%d_%H_%M_%S") + self.save_dir = os.path.join("output", "record_{:s}".format(timestamp)) + self.metadata = os.path.join(self.save_dir, "metadata.json") + self.save_dir_rgb = os.path.join(self.save_dir, "rgb") + self.save_dir_depth = os.path.join(self.save_dir, "depth") + self.save_dir_depth_color = os.path.join(self.save_dir, "depth_color") + self.save_dir_result = os.path.join(self.save_dir, "output") + self.save_dir_pcl_bin = os.path.join(self.save_dir, "pcl") + + if args.save or args.light_save: + prInfo("Saving to {}/[rgb][depth][depth_color][output][pcl]".format(self.save_dir)) + if not os.path.exists(self.save_dir): + prInfo("Creating directories to {}/[rgb][depth][depth_color][output][pcl]".format(self.save_dir)) + os.makedirs(self.save_dir) + os.makedirs(self.save_dir_rgb) + os.makedirs(self.save_dir_pcl_bin) + + if args.save: + os.makedirs(self.save_dir_depth) + os.makedirs(self.save_dir_depth_color) + os.makedirs(self.save_dir_result) + + args_dic = vars(args) + with open(self.metadata, 'w') as fp: + json.dump(args_dic, fp) + + prSuccess("Created directories to {}/[rgb][depth][depth_color][output][pcl]".format(self.save_dir)) + time.sleep(1) + + # Publishers + self.goal_pub = rospy.Publisher('points/handover_goal', Point, queue_size=10) + + # Subscribers + prInfo("Subscribing to {} for RGB".format(args.rgb_topic)) + rospy.Subscriber(args.rgb_topic, Image,self.callback_rgb) + prInfo("Subscribing to {} for depth".format(args.depth_topic)) + rospy.Subscriber(args.depth_topic,Image,self.callback_depth) + prInfo("Subscribing to {} for PCL".format(args.pcl_topic)) + rospy.Subscriber(args.pcl_topic, PointCloud2, self.callback_pcl) + + + def callback_pcl(self, msg): + pcl_array = np.frombuffer(msg.data, dtype=np.float32).reshape((msg.height, msg.width, -1)) + self.pcl_array_xyz = pcl_array[:,:,:3] + self.pcl_array_rgb = pcl_array[:,:,3:] + self.pcl_current_seq = msg.header.seq + # rospy.loginfo('pcl received ({})...'.format(msg.header.seq)) + + def callback_rgb(self, msg): + self.rgb = self.br.imgmsg_to_cv2(msg, "bgr8") + self.rgb_current_seq = msg.header.seq + # rospy.loginfo('RGB received ({})...'.format(msg.header.seq)) + + def callback_depth(self, msg): + self.depth = self.br.imgmsg_to_cv2(msg, "mono16") + self.depth_current_seq = msg.header.seq + # rospy.loginfo('Depth received 
({})...'.format(msg.header.seq)) + + def is_ready(self): + ready = (self.rgb is not None) and (self.depth is not None) and (self.pcl_array_xyz is not None) + return ready + + def start(self): + + self.tracks = {} # all the tracks along time, we need to keep and history + + while not rospy.is_shutdown(): + + if self.is_ready(): + + image_count = self.current_image_count + self.current_image_count += 1 + + start_t = time.time() + + image_seq_unique = self.rgb_current_seq + now = datetime.now() + timestamp = now.strftime("%Y_%m_%d_%H_%M_%S_%f") + + if self.args.save or self.args.light_save: + rgb_path = os.path.join(self.save_dir_rgb, "{:08d}_seq_{:010d}_ts_{}.png".format(image_count, image_seq_unique, timestamp)) + cv2.imwrite(rgb_path, self.rgb) + prSuccess("Saved RGB to {}".format(rgb_path)) + + rgb_array = np.asarray(self.rgb) + + if self.args.save: + depth_path = os.path.join(self.save_dir_depth, "{:08d}_seq_{:010d}_ts_{}.png".format(image_count, image_seq_unique, timestamp)) + cv2.imwrite(depth_path, self.depth) + prSuccess("Saved depth to {}".format(depth_path)) + + depth_array = np.asarray(self.depth) + depth_array[depth_array > self.depth_array_max_threshold] = self.depth_array_max_threshold + + assert(depth_array.shape[0] == rgb_array.shape[0]) + assert(depth_array.shape[1] == rgb_array.shape[1]) + + # Process RGB array + if self.last_inferred_seq < self.rgb_current_seq: + + prInfo("Do inference on frame {}".format(self.rgb_current_seq)) + + # keep old poses for tracking + pose_results_last = self.pose_results + + tic = time.time() + mmdet_results = inference_detector(self.det_model, rgb_array) # list of detection rectangle i.e [(x1,y1,x2,y2), ...] + tac = time.time() + prInfo("Detection in {:.4f} sec (frame {}, number of human detection {})".format(tac-tic, self.rgb_current_seq, len(mmdet_results[0]))) + + # keep the person class bounding boxes. + person_results = process_mmdet_results(mmdet_results, self.args.det_cat_id) + + tic = time.time() + # test a single image, with a list of bboxes. 
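# Illustrative sketch (not part of the node): the same detect-then-pose flow used
# just below, reduced to a single image. The API calls mirror the ones in this
# file; the config/checkpoint paths and "frame.png" are placeholders.
from mmdet.apis import inference_detector, init_detector
from mmpose.apis import (inference_top_down_pose_model, init_pose_model,
                         process_mmdet_results)
import cv2

det_model = init_detector("det_config.py", "det_checkpoint.pth", device="cuda:0")
pose_model = init_pose_model("pose_config.py", "pose_checkpoint.pth", device="cuda:0")

img = cv2.imread("frame.png")
mmdet_results = inference_detector(det_model, img)        # per-class list of [x1, y1, x2, y2, score] arrays
person_results = process_mmdet_results(mmdet_results, 1)  # keep category 1 (person)
pose_results, _ = inference_top_down_pose_model(
    pose_model,
    img,
    person_results,
    bbox_thr=0.3,
    format="xyxy",
    dataset=pose_model.cfg.data["test"]["type"],
    return_heatmap=False,
    outputs=None)
# each entry of pose_results carries a "bbox" and a (17, 3) "keypoints" array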
+ self.pose_results, returned_outputs = inference_top_down_pose_model( + self.pose_model, + rgb_array, + person_results, + bbox_thr=self.args.bbox_thr, + format='xyxy', + dataset=self.dataset, + dataset_info=self.dataset_info, + return_heatmap=self.return_heatmap, + outputs=None) + tac = time.time() + prInfo("Poses in {:.4f} sec".format(tac-tic)) + + # get track id for each person instance + self.pose_results, self.next_id = get_track_id( + self.pose_results, + pose_results_last, + self.next_id, + use_oks=False, + tracking_thr=self.args.tracking_thr, + use_one_euro=self.args.euro, + fps=10) + + # produce an output image + if not self.args.no_show: + self.vis_img = rgb_array.copy() + + if self.display_all_detection and not self.args.no_show: + for c in range(len(mmdet_results)): + if len(mmdet_results[c]) > 0: + for bi in range(mmdet_results[c].shape[0]): + if mmdet_results[c][bi,4] > self.args.bbox_thr: + bbox = mmdet_results[c][bi,:4].copy().astype(np.int32) + bbox_ints = [int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])] + pt1 = ( min( max(0,bbox_ints[0]), depth_array.shape[1]), + min( max(0,bbox_ints[1]), depth_array.shape[0]) ) + pt2 = ( min( max(0,bbox_ints[2]), depth_array.shape[1]), + min( max(0,bbox_ints[3]), depth_array.shape[0]) ) + cv2.rectangle(self.vis_img, pt1, pt2, (255,255,255), 1) + cv2.putText(self.vis_img, "{:s} ({:.0f}%)".format(YOLO_COCO_80_CLASSES[c], mmdet_results[c][bi,4]*100), pt1, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) + + + #### post processing and 3D lifting #### + + # remove too old tracks + for idx, track in list(self.tracks.items()): + if abs(self.current_image_count - track["last_seen"]) > self.args.max_frames_remove_tracks: + prInfo("Removing track {}, not seen since frame {}, current is {}".format(idx, track["last_seen"], self.current_image_count)) + self.tracks.pop(idx) + + self.tracks_in_current_image = {} + + for res in self.pose_results: + + # for each instance + + bbox = res["bbox"] + keypoints = res["keypoints"] + idx = res["track_id"] + + if idx not in self.tracks.keys(): + prInfo("Adding a new track with idx {}".format(idx)) + self.tracks[idx] = {} + self.tracks[idx]["last_seen"] = self.current_image_count + self.tracks[idx]["keypoints_2d"] = [] + + # add keypoint to the current track + self.tracks[idx]["last_seen"] = self.current_image_count + self.tracks[idx]["keypoints_2d"].append(keypoints) + + self.tracks_in_current_image[idx] = { + "right_wrist_depth" : None, + "right_wrist_pose" : None, + "left_wrist_depth" : None, + "left_wrist_pose" : None, + "depth_center" : None, + "pose_center" : None, + "pose_from" : None + } + + # if history is long enough, process the trajectory for MotionBERT + if len(self.tracks[idx]["keypoints_2d"]) >= self.args.mb_clip_len: + prInfo("Running MotionBERT for track {}".format(idx)) + + # prepare motion + motion = np.asarray(self.tracks[idx]["keypoints_2d"]) # T, 17, 3 + motion = motion[-self.args.mb_clip_len:, :, :] # keep only the required len + assert(motion.shape[1] == 17) + assert(motion.shape[2] == 3) + motion_h36 = coco2h36m(motion) # input is h36 format + motion_h36_scaled = crop_scale(motion_h36) # scale [1,1], normalize, crop + + with torch.no_grad(): + current_input = torch.Tensor(motion_h36_scaled).unsqueeze(0).cuda() + tic = time.time() + predicted_3d_pos = self.motionbert_3d_model(current_input) + tac = time.time() + prInfo("MotionBERT in {:.4f} sec".format(tac-tic)) + + # root relative + predicted_3d_pos[:,:,0,:] = 0 # [1,T,17,3] + + # TODO : change it because a bit weird it is not 
aligned with 2D poses because of the history ! + predicted_3d_pos_np = predicted_3d_pos[0,-1,:,:].cpu().numpy() # keep only the last prediction + if "keypoints_3d" in self.tracks[idx].keys(): + self.tracks[idx]["keypoints_3d"].append(predicted_3d_pos_np) + else: + self.tracks[idx]["keypoints_3d"] = [predicted_3d_pos_np] + + + # Draw bounding bbox + bbox = bbox.astype(np.int32) + bbox_ints = [int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])] + pt1 = ( min( max(0,bbox_ints[0]), depth_array.shape[1]), + min( max(0,bbox_ints[1]), depth_array.shape[0]) ) + pt2 = ( min( max(0,bbox_ints[2]), depth_array.shape[1]), + min( max(0,bbox_ints[3]), depth_array.shape[0]) ) + color = RANDOM_COLORS[idx % 255] + color_tuple = (int(color[0]), int(color[1]), int(color[2])) + + if not self.args.no_show: + cv2.rectangle( self.vis_img, pt1, pt2, color_tuple, 2) + + body_center_joints = [] # to store center of lsho, rsho, lhip, rhip in pixels + + for j in range(keypoints.shape[0]): + + kp = keypoints[j,:] + confidence = int(kp[2] * 255) + confidence_color = (self.confidence_cmap[min(255,confidence)]*255).astype(np.uint8) + + if confidence > self.args.kpt_thr and kp[0] > 0 and kp[1] > 0 and kp[0] < depth_array.shape[1] and kp[1] < depth_array.shape[0]: + + if (j == 5) or (j == 6) or (j == 11) or (j == 12): + # one keypoint of the torso + body_center_joints.append(kp) + + if not self.args.no_show: + # kp_color_tuple = (int(confidence_color[0]), int(confidence_color[1]), int(confidence_color[2])) + cv2.circle(self.vis_img, (int(kp[0]), int(kp[1])), 2, color_tuple, thickness = 3) + + # if wrists, find depth and pose + + if (j == 10): + # right wrist + depth_wrist = depth_array[int(kp[1]), int(kp[0])] + pose_wrist = self.pcl_array_xyz[int(kp[1]), int(kp[0]),:] + self.tracks_in_current_image[idx]["right_wrist_depth"] = depth_wrist + self.tracks_in_current_image[idx]["right_wrist_pose"] = pose_wrist + if not self.light_display and not self.args.no_show: + cv2.drawMarker(self.vis_img, (int(kp[0]), int(kp[1])), color = color_tuple, thickness = 3, + markerType = cv2.MARKER_CROSS, line_type = cv2.LINE_AA, + markerSize = 16) + cv2.putText(self.vis_img, "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format(depth_wrist/10, pose_wrist[0], pose_wrist[1], pose_wrist[2]), (int(kp[0]), int(kp[1])), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 255), 2) + + elif (j == 9): + # left wrist + depth_wrist = depth_array[int(kp[1]), int(kp[0])] + pose_wrist = self.pcl_array_xyz[int(kp[1]), int(kp[0]),:] + self.tracks_in_current_image[idx]["left_wrist_depth"] = depth_wrist + self.tracks_in_current_image[idx]["left_wrist_pose"] = pose_wrist + if not self.light_display and not self.args.no_show: + cv2.drawMarker(self.vis_img, (int(kp[0]), int(kp[1])), color = color_tuple, thickness = 3, + markerType = cv2.MARKER_CROSS, line_type = cv2.LINE_AA, + markerSize = 16) + cv2.putText(self.vis_img, "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format(depth_wrist/10, pose_wrist[0], pose_wrist[1], pose_wrist[2]), (int(kp[0]), int(kp[1])), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 255), 2) + + # find the body center + if len(body_center_joints) == 4: + # if we managed to find the 4 points of the torso, search on the torso + body_center_joints = np.asarray(body_center_joints) # lsho, rsho, lhip, rhip + lsho = body_center_joints[0,:] + rsho = body_center_joints[1,:] + lhip = body_center_joints[2,:] + rhip = body_center_joints[3,:] + + # find 4 points between lsho and rhip and 4 points between rsho and lhip to find something more precise + seg_steps = [0.0, 0.25, 0.50, 0.75, 1.0] + 
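# Illustrative helper (the name and signature are ours, not the patch's): the
# torso-centre search below samples five points along each of the two torso
# diagonals (lsho->rhip and rsho->lhip) and averages the valid depth readings.
# The same idea in isolation, for the depth image alone:
import numpy as np

def torso_depth_mm(depth_array, lsho, rsho, lhip, rhip,
                   steps=(0.0, 0.25, 0.5, 0.75, 1.0)):
    depths = []
    for s in steps:
        for p in (s * lsho + (1 - s) * rhip, s * rsho + (1 - s) * lhip):
            x, y = int(p[0]), int(p[1])
            if 0 <= x < depth_array.shape[1] and 0 <= y < depth_array.shape[0]:
                d = depth_array[y, x]
                if d > 0:                 # 0 means "no depth reading"
                    depths.append(d)
    return float(np.mean(depths)) if len(depths) > 3 else None  # >= 4 samples, as below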
depths_torso = [] + poses_torso = [] + for step in seg_steps: + + p1 = step * lsho + (1 - step) * rhip + if p1[0] < depth_array.shape[1] and p1[1] < depth_array.shape[0]: + depth_p1 = depth_array[int(p1[1]), int(p1[0])] + pose_p1 = self.pcl_array_xyz[int(p1[1]), int(p1[0]), :] + if depth_p1 > 0: + depths_torso.append(depth_p1) + poses_torso.append(pose_p1) + + p2 = step * rsho + (1 - step) * lhip + if p2[0] < depth_array.shape[1] and p2[1] < depth_array.shape[0]: + depth_p2 = depth_array[int(p2[1]), int(p2[0])] + pose_p2 = self.pcl_array_xyz[int(p2[1]), int(p2[0]), :] + if depth_p2 > 0: + depths_torso.append(depth_p2) + poses_torso.append(pose_p2) + + if not self.args.no_show: + # draw to check + cv2.drawMarker(self.vis_img, (int(p1[0]), int(p1[1])), color = color_tuple, thickness = 1, + markerType = cv2.MARKER_DIAMOND, line_type = cv2.LINE_AA, + markerSize = 8) + cv2.drawMarker(self.vis_img, (int(p2[0]), int(p2[1])), color = color_tuple, thickness = 1, + markerType = cv2.MARKER_DIAMOND, line_type = cv2.LINE_AA, + markerSize = 8) + + if len(depths_torso) > 3: + # at least 4 points to average decently + depth_body = np.asarray(depths_torso).mean() + pose_body = np.asarray(poses_torso).mean(axis = 0) + self.tracks_in_current_image[idx]["depth_center"] = depth_body # mm + self.tracks_in_current_image[idx]["pose_center"] = pose_body # m + self.tracks_in_current_image[idx]["pose_from"] = "torso" + + # just for drawing + body_center = np.mean(body_center_joints, axis = 0) + # Draw center of body + body_center = (int(body_center[0]), int(body_center[1])) + if not self.light_display and not self.args.no_show: + cv2.drawMarker(self.vis_img, body_center, color = color_tuple, thickness = 3, + markerType = cv2.MARKER_TILTED_CROSS, line_type = cv2.LINE_AA, + markerSize = 16) + cv2.putText(self.vis_img, "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format(depth_body/10, pose_body[0], pose_body[1], pose_body[2]), (int(body_center[0]), int(body_center[1])), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 3) + + # # fetch depth and pose at torso center + # if body_center[0] < depth_array.shape[1] and body_center[1] < depth_array.shape[0]: + # depth_center = depth_array[body_center[1], body_center[0]] + # pose_center = self.pcl_array_xyz[body_center[1], body_center[0],:] + # if not self.light_display: + # cv2.putText(self.vis_img, "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format(depth_center/10, pose_center[0], pose_center[1], pose_center[2]), (int(body_center[0]), int(body_center[1])), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 3) + # if (depth_center != 0): + # self.tracks_in_current_image[idx]["depth_center"] = depth_center # mm + # self.tracks_in_current_image[idx]["pose_center"] = pose_center # m + # self.tracks_in_current_image[idx]["pose_from"] = "torso" + # # prSuccess("Publishing coordinates {:.2f} {:.2f} {:.2f}".format(pose_center[0], pose_center[1], pose_center[2])) + # # self.goal_pub.publish(Point(x = pose_center[0], y = pose_center[1], z = pose_center[2])) + + else: + # if we did not managed to find the 4 points of the torso, search in the bbox + prWarning("Can't use body center from shoulders and hips, use center of box for track {} || UPDATE : do nothing".format(idx)) + + if False: + # Draw center of bbox + bbox_center = (int(pt1[0]/2 + pt2[0]/2), int(pt1[1]/2 + pt2[1]/2)) + if not self.light_display: + cv2.drawMarker(self.vis_img, bbox_center, color = color_tuple, thickness = 3, + markerType = cv2.MARKER_CROSS, line_type = cv2.LINE_AA, + markerSize = 16) + + # fetch depth and pose at bbox center + if 
bbox_center[0] < depth_array.shape[1] and bbox_center[1] < depth_array.shape[0]: + depth_center = depth_array[bbox_center[1], bbox_center[0]] + pose_center = self.pcl_array_xyz[bbox_center[1], bbox_center[0],:] + if not self.light_display: + cv2.putText(self.vis_img, "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format(depth_center/10, pose_center[0], pose_center[1], pose_center[2]), (int(bbox_center[0]), int(bbox_center[1])), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 3) + if (depth_center != 0): + self.tracks_in_current_image[idx]["depth_center"] = depth_center # mm + self.tracks_in_current_image[idx]["pose_center"] = pose_center # m + self.tracks_in_current_image[idx]["pose_from"] = "bbox" + # prSuccess("Publishing coordinates {:.2f} {:.2f} {:.2255f}".format(pose_center[0], pose_center[1], pose_center[2])) + # self.goal_pub.publish(Point(x = pose_center[0], y = pose_center[1], z = pose_center[2])) + + # draw skeleton + if not self.args.no_show: + for limb in COCO17_JOINTS_LIMBS: + start = keypoints[limb[0],:] + end = keypoints[limb[1],:] + start_point = (int(start[0]), int(start[1])) + end_point = (int(end[0]), int(end[1])) + if (start[2] > self.args.kpt_thr) and (end[2] > self.args.kpt_thr): + cv2.line(self.vis_img, start_point, end_point, color = color_tuple, thickness = 3) + + min_depth = 1e6 # mm + min_depth_idx = -1 + for idx, track_info in self.tracks_in_current_image.items(): + depth = track_info["depth_center"] + if depth is not None: + if depth < min_depth: + min_depth = depth + min_depth_idx = idx + + if (min_depth_idx != -1): + pose_closest = self.tracks_in_current_image[min_depth_idx]["pose_center"] + prInfo("Using track {} as it is the closest".format(min_depth_idx)) + self.goal_pub.publish(Point(x = pose_closest[0], y = pose_closest[1], z = pose_closest[2])) + prSuccess("Publishing coordinates {:.2f} {:.2f} {:.2f}".format(pose_closest[0], pose_closest[1], pose_closest[2])) + if not self.args.no_show: + cv2.putText(self.vis_img, "{:.2f} {:.2f} {:.2f}".format(pose_closest[0], pose_closest[1], pose_closest[2]), (30,30), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (255, 255, 255), 5) + cv2.putText(self.vis_img, "{:.2f} {:.2f} {:.2f}".format(pose_closest[0], pose_closest[1], pose_closest[2]), (30,30), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 0), 3) + else: + if not self.args.no_show: + cv2.putText(self.vis_img, "No tracks with pose found", (30,30), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (255, 255, 255), 5) + cv2.putText(self.vis_img, "No tracks with pose found", (30,30), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 0), 3) + + self.last_inferred_seq = self.rgb_current_seq + + if self.args.save and not self.args.no_show: + results_path = os.path.join(self.save_dir_result, "{:08d}_seq_{:010d}_ts_{}.png".format(image_count, image_seq_unique, timestamp)) + cv2.imwrite(results_path, self.vis_img) + prSuccess("Saved result to {}".format(results_path)) + + else: + prWarning("No inference because the current RGB frame has already been processed") + + if not self.args.no_show: + depth_array_norm = ((depth_array - depth_array.min())) / (depth_array.max() - depth_array.min()) + depth_array_norm = depth_array_norm * 255 + depth_array_norm = depth_array_norm.astype(np.uint8) + depth_array_norm_colored = (self.depth_cmap[depth_array_norm] * 255).astype(np.uint8) + + if self.args.save: + depth_color_path = os.path.join(self.save_dir_depth_color, "{:08d}_seq_{:010d}_ts_{}.png".format(image_count, image_seq_unique, timestamp)) + cv2.imwrite(depth_color_path, depth_array_norm_colored) + prSuccess("Saved depth color (scaled) to 
{}".format(depth_color_path)) + + if self.args.save or self.args.light_save: + pcl_path = os.path.join(self.save_dir_pcl_bin, "{:08d}_seq_{:010d}_ts_{}.bin".format(image_count, image_seq_unique, timestamp)) + self.pcl_array_xyz.tofile(pcl_path) + prSuccess("Saved pcl to {}".format(pcl_path)) + + if self.vis_img is not None: + full_display_array = np.zeros((rgb_array.shape[0] * 2, rgb_array.shape[1], 3), dtype = np.uint8) + full_display_array[:rgb_array.shape[0], : ,:] = self.vis_img + full_display_array[rgb_array.shape[0]:, : ,:] = depth_array_norm_colored + + cv2.imshow("RGBD window", full_display_array) + cv2.waitKey(3) + + end_t = time.time() + prInfoBold("Processed frame {} in {:.4f} sec".format(self.current_image_count, end_t-start_t)) + + + + + else: + print("Images are None !") + + self.loop_rate.sleep() + +if __name__ == '__main__': + + ## Parser with params + parser = ArgumentParser() + parser.add_argument('--det_config', type=str, default = "./configs/detection/yolov3_d53_320_273e_coco.py", help='Config file for detection') + parser.add_argument('--det_checkpoint', type=str, default = "./models/yolov3_d53_320_273e_coco-421362b6.pth", help='Checkpoint file for detection') + parser.add_argument('--pose_config', type=str, default = "./configs/pose/ViTPose_small_coco_256x192.py", help='Config file for pose') + parser.add_argument('--pose_checkpoint', type=str, default = "./models/vitpose_small.pth", help='Checkpoint file for pose') + parser.add_argument( + '--device', + default='cuda:0', + help='Device used for inference') + parser.add_argument( + '--det_cat_id', + type=int, + default=1, + help='Category id for bounding box detection model (person)') + parser.add_argument( + '--bbox_thr', + type=float, + default=0.3, + help='Bounding box score threshold') + parser.add_argument( + '--kpt_thr', + type=float, + default=0.3, + help='Keypoint score threshold') + parser.add_argument( + '--tracking_thr', + type=float, + default=0.3, + help='Tracking threshold') + parser.add_argument( + '--euro', + action='store_true', + help='Using One_Euro_Filter for smoothing') + + parser.add_argument('--rgb_topic', default = "orbbec/rgb", type=str, help='ROS topic for RGB image') + parser.add_argument('--depth_topic', default = "orbbec/depth", type=str, help='ROS topic for depth image') + parser.add_argument('--pcl_topic', default = "orbbec/pcl", type=str, help='ROS topic for pcl') + parser.add_argument( + '--no_show', + action='store_true', + default=False, + help='whether to show visualizations.') + parser.add_argument( + '--save', + action='store_true', + default=False, + help='whether to save images (rgb and d and predictions and pcl)') + parser.add_argument( + '--light_save', + action='store_true', + default=False, + help='whether to save only rgb and pcl (not optimized use the light_save of visualizer for optimized saving)') + parser.add_argument( + '--display_all_detection', "-dad", + action='store_true', + default=False, + help='whether to display all detections or only human') + parser.add_argument( + '--light_display', "-ld", + action='store_true', + default=False, + help='whether to display only skeletons') + parser.add_argument( + '--fps', + type=int, + default=30, + help='Node and recording fps') + parser.add_argument('--depth_cmap', default = "jet", type=str, help='mpl colormap for depth image') + + parser.add_argument('--mb_3d_config', type=str, default = "./configs/pose3d/MB_ft_h36m.yaml", help='Config file for 3D poses') + parser.add_argument('--mb_checkpoint', type=str, default = 
"./checkpoint/pose3d/MB_train_h36m/best_epoch.bin", help='Checkpoint file for 3D poses') + parser.add_argument( + '--mb_clip_len', + type=int, + default=10, + help='Number of past frames to use for MotionBERT (default in model is 243)') + parser.add_argument( + '--max_frames_remove_tracks', + type=int, + default=2, + help='Number frames without the track present to keep going before removing a track') + + + args = parser.parse_args() + + assert has_mmdet, 'Please install mmdet to run the demo.' + assert args.det_config is not None + assert args.det_checkpoint is not None + + if (args.save or args.light_save) and args.no_show: + print("Do not use the no_show mode if save is enabled, no rendering is done if --no_show") + + prInfo("Loaded with args : {}".format(args)) + + rospy.init_node("python_orbbec_inference", anonymous=True) + my_node = InferenceNodeRGBD(args) + my_node.start() + cv2.destroyAllWindows() \ No newline at end of file diff --git a/rgbd_detect.py b/rgbd_detect.py new file mode 100644 index 0000000..662d30f --- /dev/null +++ b/rgbd_detect.py @@ -0,0 +1,1011 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# mmdet and mmpose import +from mmpose.apis import ( + get_track_id, + inference_top_down_pose_model, + init_pose_model, + process_mmdet_results, + vis_pose_tracking_result, +) +from mmpose.datasets import DatasetInfo + +try: + from mmdet.apis import inference_detector, init_detector + + has_mmdet = True +except (ImportError, ModuleNotFoundError): + has_mmdet = False + +# ros related import +import rospy +from sensor_msgs.msg import Image, PointCloud2 +from geometry_msgs.msg import TransformStamped +from cv_bridge import CvBridge +import tf2_ros + +# other import +import cv2 +import os +import matplotlib.pyplot as plt +from argparse import ArgumentParser +from datetime import datetime +import time +import json +import warnings +import numpy as np +from PyKDL import Rotation + +# utils import +from utils import * + +# remove numpy scientific notation +np.set_printoptions(suppress=True) + + +class InferenceNodeRGBD(object): + def __init__(self, args): + + # init args + self.args = args + + # init detector and pose + self.det_model = init_detector( + args.det_config, args.det_checkpoint, device=args.device.lower() + ) + + self.pose_model = init_pose_model( + args.pose_config, args.pose_checkpoint, device=args.device.lower() + ) + + self.dataset = self.pose_model.cfg.data["test"]["type"] + self.dataset_info = self.pose_model.cfg.data["test"].get( + "self.dataset_info", None + ) + if self.dataset_info is None: + warnings.warn( + "Please set `self.dataset_info` in the config." 
+ "Check https://github.com/open-mmlab/mmpose/pull/663 for details.", + DeprecationWarning, + ) + else: + self.dataset_info = DatasetInfo(self.dataset_info) + + self.return_heatmap = False + + self.next_id = 0 + self.pose_results = [] + self.count_frames = 0 + self.tracks_in_current_image = {} + + ## Init for node and save path + + self.rgb = None # Image frame + self.depth = None # Image frame + + self.pcl_array_rgb = None + self.pcl_array_xyz = None + + self.depth_array_max_threshold = ( + 20000 # 3000 # does not apply when saving depth mono16 image + ) + + # viewing options + self.depth_cmap = get_mpl_colormap(args.depth_cmap) + self.confidence_cmap = get_mpl_colormap("viridis") + self.vis_img = None # output image RGB + detections + self.view_all_classes_dets = True + self.display_all_detection = args.display_all_detection + self.light_display = args.light_display + + self.pcl_current_seq = -1 + self.rgb_current_seq = -1 + self.last_inferred_seq = -1 + self.depth_current_seq = -1 + self.current_image_count = 0 + + self.br = CvBridge() + + prInfo("Setting node rate to {} fps".format(args.fps)) + self.loop_rate = rospy.Rate(args.fps) + + # make the output path + now = datetime.now() + timestamp = now.strftime("%Y_%m_%d_%H_%M_%S") + self.save_dir = os.path.join("output", "record_{:s}".format(timestamp)) + self.metadata = os.path.join(self.save_dir, "metadata.json") + self.save_dir_rgb = os.path.join(self.save_dir, "rgb") + self.save_dir_depth = os.path.join(self.save_dir, "depth") + self.save_dir_result = os.path.join(self.save_dir, "output") + self.save_dir_pcl_bin = os.path.join(self.save_dir, "pcl") + + if args.save or args.light_save: + prInfo( + "Saving to {}/[rgb][depth][depth_color][output][pcl]".format( + self.save_dir + ) + ) + if not os.path.exists(self.save_dir): + prInfo( + "Creating directories to {}/[rgb][depth][depth_color][output][pcl]".format( + self.save_dir + ) + ) + os.makedirs(self.save_dir) + os.makedirs(self.save_dir_rgb) + os.makedirs(self.save_dir_pcl_bin) + + if args.save: + os.makedirs(self.save_dir_depth) + os.makedirs(self.save_dir_result) + + args_dic = vars(args) + with open(self.metadata, "w") as fp: + json.dump(args_dic, fp) + + prSuccess( + "Created directories to {}/[rgb][depth][depth_color][output][pcl]".format( + self.save_dir + ) + ) + time.sleep(1) + + # Publishers + self.goal_pub = rospy.Publisher( + args.namespace + "/human", TransformStamped, queue_size=1 + ) + + self.tf_br = tf2_ros.TransformBroadcaster() + + # Subscribers + rgb_topic = args.namespace + "/rgb" + depth_topic = args.namespace + "/depth" + pcl_topic = args.namespace + "/pcl" + prInfo("Subscribing to {} for RGB".format(rgb_topic)) + rospy.Subscriber(rgb_topic, Image, self.callback_rgb) + prInfo("Subscribing to {} for depth".format(depth_topic)) + rospy.Subscriber(depth_topic, Image, self.callback_depth) + prInfo("Subscribing to {} for PCL".format(pcl_topic)) + rospy.Subscriber(pcl_topic, PointCloud2, self.callback_pcl) + + self.rgb_frame_id = None + + def callback_pcl(self, msg): + if self.args.flip: + pcl_array = np.frombuffer(msg.data, dtype=np.float32).reshape( + (msg.height, msg.width, -1) + )[::-1, ::-1, :] + else: + pcl_array = np.frombuffer(msg.data, dtype=np.float32).reshape( + (msg.height, msg.width, -1) + ) + + # pcl_array = pcl_array[::-1, :, :] + self.pcl_array_xyz = pcl_array[:, :, :3] + # self.pcl_array_rgb = pcl_array[:,:,3:] + self.pcl_current_seq = msg.header.seq + # rospy.loginfo('pcl received ({})...'.format(msg.header.seq)) + + def callback_rgb(self, msg): + if 
self.rgb_frame_id != msg.header.frame_id: + self.rgb_frame_id = msg.header.frame_id + if self.args.flip: + self.rgb = cv2.flip(self.br.imgmsg_to_cv2(msg, "bgr8"), -1) + else: + self.rgb = self.br.imgmsg_to_cv2(msg, "bgr8") + + # self.rgb = cv2.rotate(self.rgb, cv2.ROTATE_180) + self.rgb_current_seq = msg.header.seq + # rospy.loginfo('RGB received ({})...'.format(msg.header.seq)) + self.rgb_timestamp = msg.header.stamp + + def callback_depth(self, msg): + if self.args.flip: + self.depth = cv2.flip(self.br.imgmsg_to_cv2(msg, "mono16"), -1) + else: + self.depth = self.br.imgmsg_to_cv2(msg, "mono16") + + self.depth_current_seq = msg.header.seq + # rospy.loginfo('Depth received ({})...'.format(msg.header.seq)) + + def is_ready(self): + ready = ( + (self.rgb is not None) + and (self.depth is not None) + and (self.pcl_array_xyz is not None) + ) + return ready + + def save_rgb(self, image_count, image_seq_unique, timestamp): + prWarning("Saving images here may suffer synchronization issues, use visualizer.py for lighter save") + rgb_path = os.path.join( + self.save_dir_rgb, + "{:08d}_seq_{:010d}_ts_{}.png".format( + image_count, image_seq_unique, timestamp + ), + ) + cv2.imwrite(rgb_path, self.rgb) + prSuccess("Saved RGB to {}".format(rgb_path)) + + def save_depth(self, image_count, image_seq_unique, timestamp): + prWarning("Saving images here may suffer synchronization issues, use visualizer.py for lighter save") + depth_path = os.path.join( + self.save_dir_depth, + "{:08d}_seq_{:010d}_ts_{}.png".format( + image_count, image_seq_unique, timestamp + ), + ) + cv2.imwrite(depth_path, self.depth) + prSuccess("Saved depth to {}".format(depth_path)) + + def save_output_image(self, image_count, image_seq_unique, timestamp): + prWarning("Saving images here may suffer synchronization issues, use visualizer.py for lighter save") + results_path = os.path.join( + self.save_dir_result, + "{:08d}_seq_{:010d}_ts_{}.png".format( + image_count, image_seq_unique, timestamp + ), + ) + cv2.imwrite(results_path, self.vis_img) + prSuccess("Saved result to {}".format(results_path)) + + def save_pcl(self, image_count, image_seq_unique, timestamp): + prWarning("Saving images here may suffer synchronization issues, use visualizer.py for lighter save") + pcl_path = os.path.join( + self.save_dir_pcl_bin, + "{:08d}_seq_{:010d}_ts_{}.bin".format( + image_count, image_seq_unique, timestamp + ), + ) + self.pcl_array_xyz.tofile(pcl_path) + prSuccess("Saved pcl to {}".format(pcl_path)) + + def plot_mmdet_bbox(self, mmdet_results, array_shape): + for c in range(len(mmdet_results)): + if len(mmdet_results[c]) > 0: + for bi in range(mmdet_results[c].shape[0]): + if mmdet_results[c][bi, 4] > self.args.bbox_thr: + bbox = ( + mmdet_results[c][bi, :4] + .copy() + .astype(np.int32) + ) + bbox_ints = [ + int(bbox[0]), + int(bbox[1]), + int(bbox[2]), + int(bbox[3]), + ] + pt1 = ( + min( + max(0, bbox_ints[0]), + array_shape[1], + ), + min( + max(0, bbox_ints[1]), + array_shape[0], + ), + ) + pt2 = ( + min( + max(0, bbox_ints[2]), + array_shape[1], + ), + min( + max(0, bbox_ints[3]), + array_shape[0], + ), + ) + cv2.rectangle( + self.vis_img, pt1, pt2, (255, 255, 255), 1 + ) + cv2.putText( + self.vis_img, + "{:s} ({:.0f}%)".format( + YOLO_COCO_80_CLASSES[c], + mmdet_results[c][bi, 4] * 100, + ), + pt1, + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 1, + ) + + def plot_mmdet_person_bbox(self, idx, bbox, array_shape): + bbox_ints = [ + int(bbox[0]), + int(bbox[1]), + int(bbox[2]), + int(bbox[3]), + ] + pt1 = ( + min(max(0, 
bbox_ints[0]), array_shape[1]), + min(max(0, bbox_ints[1]), array_shape[0]), + ) + pt2 = ( + min(max(0, bbox_ints[2]), array_shape[1]), + min(max(0, bbox_ints[3]), array_shape[0]), + ) + color = RANDOM_COLORS[idx] + color_tuple = (int(color[0]), int(color[1]), int(color[2])) + + cv2.rectangle(self.vis_img, pt1, pt2, color_tuple, 2) + + def process_keypoints(self, keypoints, depth_array, idx): + body_center_joints = ( + [] + ) # to store center of lsho, rsho, lhip, rhip in pixels + color = RANDOM_COLORS[idx] + color_tuple = (int(color[0]), int(color[1]), int(color[2])) + + for j in range(keypoints.shape[0]): + + kp = keypoints[j, :] + confidence = int(kp[2] * 255) + confidence_color = ( + self.confidence_cmap[min(255, confidence)] * 255 + ).astype(np.uint8) + + if ( + kp[2] > self.args.kpt_thr + and kp[0] > 0 + and kp[1] > 0 + and kp[0] < depth_array.shape[1] + and kp[1] < depth_array.shape[0] + ): + + if (j == 5) or (j == 6) or (j == 11) or (j == 12): + # one keypoint of the torso + body_center_joints.append(kp) + + if not self.args.no_show: + # kp_color_tuple = (int(confidence_color[0]), int(confidence_color[1]), int(confidence_color[2])) + cv2.circle( + self.vis_img, + (int(kp[0]), int(kp[1])), + 2, + color_tuple, + thickness=3, + ) + + # if wrists, find depth and pose + + if j == 10: + # right wrist + depth_wrist = depth_array[int(kp[1]), int(kp[0])] + pose_wrist = self.pcl_array_xyz[ + int(kp[1]), int(kp[0]), : + ] + self.tracks_in_current_image[idx][ + "right_wrist_depth" + ] = depth_wrist + self.tracks_in_current_image[idx][ + "right_wrist_pose" + ] = pose_wrist + if not self.light_display and not self.args.no_show: + cv2.drawMarker( + self.vis_img, + (int(kp[0]), int(kp[1])), + color=color_tuple, + thickness=3, + markerType=cv2.MARKER_CROSS, + line_type=cv2.LINE_AA, + markerSize=16, + ) + cv2.putText( + self.vis_img, + "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format( + depth_wrist / 10, + pose_wrist[0], + pose_wrist[1], + pose_wrist[2], + ), + (int(kp[0]), int(kp[1])), + cv2.FONT_HERSHEY_SIMPLEX, + 0.8, + (255, 0, 255), + 2, + ) + + elif j == 9: + # left wrist + depth_wrist = depth_array[int(kp[1]), int(kp[0])] + pose_wrist = self.pcl_array_xyz[ + int(kp[1]), int(kp[0]), : + ] + self.tracks_in_current_image[idx][ + "left_wrist_depth" + ] = depth_wrist + self.tracks_in_current_image[idx][ + "left_wrist_pose" + ] = pose_wrist + if not self.light_display and not self.args.no_show: + cv2.drawMarker( + self.vis_img, + (int(kp[0]), int(kp[1])), + color=color_tuple, + thickness=3, + markerType=cv2.MARKER_CROSS, + line_type=cv2.LINE_AA, + markerSize=16, + ) + cv2.putText( + self.vis_img, + "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format( + depth_wrist / 10, + pose_wrist[0], + pose_wrist[1], + pose_wrist[2], + ), + (int(kp[0]), int(kp[1])), + cv2.FONT_HERSHEY_SIMPLEX, + 0.8, + (255, 0, 255), + 2, + ) + + return body_center_joints + + def get_depth_and_poses_of_torso(self, depth_array, lsho, rsho, lhip, rhip, idx): + + color = RANDOM_COLORS[idx] + color_tuple = (int(color[0]), int(color[1]), int(color[2])) + + # find 4 points between lsho and rhip and 4 points between rsho and lhip to find something more precise + seg_steps = [0.0, 0.25, 0.50, 0.75, 1.0] + depths_torso = [] + poses_torso = [] + for step in seg_steps: + + p1 = step * lsho + (1 - step) * rhip + if ( + p1[0] < depth_array.shape[1] + and p1[1] < depth_array.shape[0] + ): + depth_p1 = depth_array[int(p1[1]), int(p1[0])] + pose_p1 = self.pcl_array_xyz[ + int(p1[1]), int(p1[0]), : + ] + if depth_p1 > 0: + depths_torso.append(depth_p1) + 
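                # depth_array holds the raw mono16 depth in millimetres, while the
                # matching pcl_array_xyz entry is the metric (x, y, z) point in metres;
                # appending them together keeps the two lists index-aligned for the
                # averaging done by the caller.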
poses_torso.append(pose_p1) + + p2 = step * rsho + (1 - step) * lhip + if ( + p2[0] < depth_array.shape[1] + and p2[1] < depth_array.shape[0] + ): + depth_p2 = depth_array[int(p2[1]), int(p2[0])] + pose_p2 = self.pcl_array_xyz[ + int(p2[1]), int(p2[0]), : + ] + if depth_p2 > 0: + depths_torso.append(depth_p2) + poses_torso.append(pose_p2) + + if not self.args.no_show: + # draw to check + cv2.drawMarker( + self.vis_img, + (int(p1[0]), int(p1[1])), + color=color_tuple, + thickness=1, + markerType=cv2.MARKER_DIAMOND, + line_type=cv2.LINE_AA, + markerSize=8, + ) + cv2.drawMarker( + self.vis_img, + (int(p2[0]), int(p2[1])), + color=color_tuple, + thickness=1, + markerType=cv2.MARKER_DIAMOND, + line_type=cv2.LINE_AA, + markerSize=8, + ) + + return depths_torso, poses_torso + + def plot_body_pose_data(self, body_center, depth_body, pose_body, idx): + + color = RANDOM_COLORS[idx] + color_tuple = (int(color[0]), int(color[1]), int(color[2])) + + cv2.drawMarker( + self.vis_img, + body_center, + color = color_tuple, + thickness=3, + markerType=cv2.MARKER_TILTED_CROSS, + line_type=cv2.LINE_AA, + markerSize=16, + ) + cv2.putText( + self.vis_img, + "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format( + depth_body / 10, + pose_body[0], + pose_body[1], + pose_body[2], + ), + (int(body_center[0]), int(body_center[1])), + cv2.FONT_HERSHEY_SIMPLEX, + 0.8, + (0, 255, 0), + 3, + ) + + def plot_skeleton_2d(self, keypoints, idx): + + color = RANDOM_COLORS[idx] + color_tuple = (int(color[0]), int(color[1]), int(color[2])) + + for limb in COCO17_JOINTS_LIMBS: + start = keypoints[limb[0], :] + end = keypoints[limb[1], :] + start_point = (int(start[0]), int(start[1])) + end_point = (int(end[0]), int(end[1])) + if (start[2] > self.args.kpt_thr) and ( + end[2] > self.args.kpt_thr + ): + cv2.line( + self.vis_img, + start_point, + end_point, + color = color_tuple, + thickness=3, + ) + + def plot_det_text_info(self, pose_closest): + if pose_closest is not None: + cv2.putText( + self.vis_img, + "{:.2f} {:.2f} {:.2f}".format( + pose_closest[0], pose_closest[1], pose_closest[2] + ), + (30, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1.2, + (255, 255, 255), + 5, + ) + cv2.putText( + self.vis_img, + "{:.2f} {:.2f} {:.2f}".format( + pose_closest[0], pose_closest[1], pose_closest[2] + ), + (30, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1.2, + (0, 0, 0), + 3, + ) + else: + cv2.putText( + self.vis_img, + "No tracks with pose found", + (30, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1.2, + (255, 255, 255), + 5, + ) + cv2.putText( + self.vis_img, + "No tracks with pose found", + (30, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1.2, + (0, 0, 0), + 3, + ) + + def start(self): + + while not rospy.is_shutdown(): + + if self.is_ready(): + + image_count = self.current_image_count + image_seq_unique = self.rgb_current_seq + now = datetime.now() + timestamp = now.strftime("%Y_%m_%d_%H_%M_%S_%f") + + if self.args.save or self.args.light_save: + self.save_rgb(image_count, image_seq_unique, timestamp) + + rgb_array = np.array(self.rgb) + + if self.args.save: + self.save_depth(image_count, image_seq_unique, timestamp) + + depth_array = np.array(self.depth) + depth_array[depth_array > self.depth_array_max_threshold] = ( + self.depth_array_max_threshold + ) + + assert depth_array.shape[0] == rgb_array.shape[0] + assert depth_array.shape[1] == rgb_array.shape[1] + + # Process RGB array + if self.last_inferred_seq < self.rgb_current_seq: + + current_frame_processing = self.rgb_current_seq + current_timestamp = self.rgb_timestamp + current_frame_id = self.rgb_frame_id + prInfo("Do inference 
on frame {}".format(current_frame_processing)) + + # keep old poses for tracking + pose_results_last = self.pose_results + + tic = time.time() + mmdet_results = inference_detector( + self.det_model, rgb_array + ) # list of detection rectangle i.e [(x1,y1,x2,y2), ...] + tac = time.time() + prInfo( + "Detection in {:.4f} sec (frame {}, number of human detection {})".format( + tac - tic, current_frame_processing, len(mmdet_results[0]) + ) + ) + + # keep the person class bounding boxes. + person_results = process_mmdet_results( + mmdet_results, self.args.det_cat_id + ) + + tic = time.time() + # test a single image, with a list of bboxes. + self.pose_results, returned_outputs = inference_top_down_pose_model( + self.pose_model, + rgb_array, + person_results, + bbox_thr=self.args.bbox_thr, + format="xyxy", + dataset=self.dataset, + dataset_info=self.dataset_info, + return_heatmap=self.return_heatmap, + outputs=None, + ) + tac = time.time() + prInfo("Poses in {:.4f} sec".format(tac - tic)) + + # get track id for each person instance + self.pose_results, self.next_id = get_track_id( + self.pose_results, + pose_results_last, + self.next_id, + use_oks=False, + tracking_thr=self.args.tracking_thr, + use_one_euro=self.args.euro, + fps=10, + ) + + # produce an output image + if not self.args.no_show: + self.vis_img = rgb_array.copy() + + if self.display_all_detection and not self.args.no_show: + self.plot_mmdet_bbox(mmdet_results, depth_array.shape) + + #### post processing #### + + self.tracks_in_current_image = {} + + for res in self.pose_results: + + # for each instance + + bbox = res["bbox"] + keypoints = res["keypoints"] + idx = res["track_id"] % 255 + + self.tracks_in_current_image[idx] = { + "right_wrist_depth": None, + "right_wrist_pose": None, + "left_wrist_depth": None, + "left_wrist_pose": None, + "depth_center": None, + "pose_center": None, + "pose_from": None, + } + + # Draw bounding bbox + bbox = bbox.astype(np.int32) + + if not self.args.no_show: + self.plot_mmdet_person_bbox(idx, bbox, depth_array.shape) + + # return the list of body center joints and also fill self.tracks_in_current_image[idx] + body_center_joints = self.process_keypoints(keypoints, depth_array, idx) + + # find the body center + if len(body_center_joints) == 4: + # if we managed to find the 4 points of the torso, search on the torso + body_center_joints = np.array( + body_center_joints + ) # lsho, rsho, lhip, rhip + lsho = body_center_joints[0, :] + rsho = body_center_joints[1, :] + lhip = body_center_joints[2, :] + rhip = body_center_joints[3, :] + + depths_torso, poses_torso = self.get_depth_and_poses_of_torso(depth_array, lsho, rsho, lhip, rhip, idx) + + if len(depths_torso) > 3: + # at least 4 points to average decently + depth_body = np.array(depths_torso).mean() + pose_body = np.array(poses_torso).mean(axis=0) + self.tracks_in_current_image[idx][ + "depth_center" + ] = depth_body # mm + self.tracks_in_current_image[idx][ + "pose_center" + ] = pose_body # m + self.tracks_in_current_image[idx]["pose_from"] = "torso" + + # just for drawing + body_center = np.mean(body_center_joints, axis=0) + # Draw center of body + body_center = (int(body_center[0]), int(body_center[1])) + + if not self.light_display and not self.args.no_show: + self.plot_body_pose_data(body_center, depth_body, pose_body, idx) + + else: + # if we did not managed to find the 4 points of the torso, search in the bbox + prWarning( + "Can't use body center from shoulders and hips for track {} : do nothing".format( + idx + ) + ) + + # draw skeleton + if 
not self.args.no_show: + self.plot_skeleton_2d(keypoints, idx) + + min_depth = 1e6 # mm + min_depth_idx = -1 + for idx, track_info in self.tracks_in_current_image.items(): + depth = track_info["depth_center"] + if depth is not None: + if depth < min_depth: + min_depth = depth + min_depth_idx = idx + + if min_depth_idx != -1: + pose_closest = self.tracks_in_current_image[min_depth_idx][ + "pose_center" + ] + prInfo( + "Using track {} as it is the closest".format(min_depth_idx) + ) + tf_msg = TransformStamped() + tf_msg.child_frame_id = args.namespace + "/human" + tf_msg.header.seq = current_frame_processing + tf_msg.header.stamp = current_timestamp + tf_msg.header.frame_id = current_frame_id + # adapt to robot camera frame convention on the robot + tf_msg.transform.translation.x = pose_closest[2] + tf_msg.transform.translation.y = -pose_closest[0] + tf_msg.transform.translation.z = -pose_closest[1] + + angle = np.arctan( + tf_msg.transform.translation.y + / tf_msg.transform.translation.x + ) + + # Rotate to have 'human' x axis looking towards the robot + rot = Rotation() + rot.DoRotZ(angle) + rot.DoRotY(np.pi) + qx, qy, qz, qw = rot.GetQuaternion() + + tf_msg.transform.rotation.x = qx + tf_msg.transform.rotation.y = qy + tf_msg.transform.rotation.z = qz + tf_msg.transform.rotation.w = qw + + dist = np.sqrt( + tf_msg.transform.translation.x**2 + tf_msg.transform.translation.y**2 + tf_msg.transform.translation.z**2 + ) + if dist < self.args.max_distance: # meters + self.goal_pub.publish(tf_msg) + prSuccess( + "Publishing coordinates {:.2f} {:.2f} {:.2f}".format( + pose_closest[0], pose_closest[1], pose_closest[2] + ) + ) + + self.tf_br.sendTransform(tf_msg) + + if not self.args.no_show: + self.plot_det_text_info(pose_closest) + + else: + + if not self.args.no_show: + self.plot_det_text_info(None) + + + self.last_inferred_seq = current_frame_processing + + if self.args.save and not self.args.no_show: + self.save_output_image(image_count, image_seq_unique, timestamp) + + else: + prWarning( + "No inference because the current RGB frame has already been processed last_inferred_seq {} vs rgb_current_seq {}".format( + self.last_inferred_seq, self.rgb_current_seq + ) + ) + + if not self.args.no_show: + depth_array_disp = depth_array.copy() + depth_array_disp[depth_array_disp > 3000] = 3000 + depth_array_norm = ((depth_array_disp - depth_array_disp.min())) / ( + depth_array_disp.max() - depth_array_disp.min() + ) + depth_array_norm = depth_array_norm * 255 + depth_array_norm = depth_array_norm.astype(np.uint8) + depth_array_norm_colored = ( + self.depth_cmap[depth_array_norm] * 255 + ).astype(np.uint8) + + if self.args.save or self.args.light_save: + self.save_pcl(image_count, image_seq_unique, timestamp) + + if self.vis_img is not None: + full_display_array = np.zeros( + (rgb_array.shape[0] * 2, rgb_array.shape[1], 3), dtype=np.uint8 + ) + full_display_array[: rgb_array.shape[0], :, :] = self.vis_img + full_display_array[rgb_array.shape[0] :, :, :] = ( + depth_array_norm_colored + ) + + if not self.args.no_show: + cv2.imshow("RGBD window", full_display_array) + cv2.waitKey(3) + + else: + print("Images are None !") + + self.loop_rate.sleep() + + +if __name__ == "__main__": + + ## Parser with params + parser = ArgumentParser() + parser.add_argument( + "--det_config", + type=str, + default="./configs/detection/yolov3_d53_320_273e_coco.py", + help="Config file for detection | default = %(default)s", + ) + parser.add_argument( + "--det_checkpoint", + type=str, + 
default="./models/yolov3_d53_320_273e_coco-421362b6.pth", + help="Checkpoint file for detection | default = %(default)s", + ) + parser.add_argument( + "--pose_config", + type=str, + default="./configs/pose/ViTPose_small_coco_256x192.py", + help="Config file for pose | default = %(default)s", + ) + parser.add_argument( + "--pose_checkpoint", + type=str, + default="./models/vitpose_small.pth", + help="Checkpoint file for pose | default = %(default)s", + ) + parser.add_argument( + "--device", + default="cuda:0", + help="Device used for inference | default = %(default)s", + ) + parser.add_argument( + "--det_cat_id", + type=int, + default=1, + help="Category id for bounding box detection model (person) | default = %(default)s", + ) + parser.add_argument( + "--bbox_thr", + type=float, + default=0.3, + help="Bounding box score threshold | default = %(default)s", + ) + parser.add_argument( + "--kpt_thr", + type=float, + default=0.3, + help="Keypoint score threshold | default = %(default)s", + ) + parser.add_argument( + "--tracking_thr", + type=float, + default=0.3, + help="Tracking threshold | default = %(default)s", + ) + parser.add_argument( + "--euro", action="store_true", help="Using One_Euro_Filter for smoothing" + ) + # parser.add_argument('--rgb_topic', default = "orbbec/rgb", type=str, help='ROS topic for RGB image') + # parser.add_argument('--depth_topic', default = "orbbec/depth", type=str, help='ROS topic for depth image') + # parser.add_argument('--pcl_topic', default = "orbbec/pcl", type=str, help='ROS topic for pcl') + parser.add_argument( + "--namespace", + default="orbbec_head", + type=str, + help="ROS topic namespace for rgb, depth, pcl | default = %(default)s", + ) + parser.add_argument( + "--no_show", + action="store_true", + default=False, + help="whether to show visualizations | default = %(default)s", + ) + parser.add_argument( + "--save", + action="store_true", + default=False, + help="whether to save images (rgb and d and predictions and pcl) | default = %(default)s", + ) + parser.add_argument( + "--flip", + action="store_true", + default=True, + help="whether to flip images | default = %(default)s", + ) + parser.add_argument( + "--light_save", + action="store_true", + default=False, + help="whether to save only rgb and pcl (not optimized use the light_save of visualizer for optimized saving) | default = %(default)s", + ) + parser.add_argument( + "--display_all_detection", + "-dad", + action="store_true", + default=False, + help="whether to display all detections or only human | default = %(default)s", + ) + parser.add_argument( + "--light_display", + "-ld", + action="store_true", + default=False, + help="whether to display only skeletons | default = %(default)s", + ) + parser.add_argument("--fps", type=int, default=10, help="Node and recording fps") + parser.add_argument( + "--depth_cmap", + default="jet", + type=str, + help="mpl colormap for depth image | default = %(default)s", + ) + parser.add_argument( + "--max_distance", + type=float, + default=2.5, + help="Maximum distance allowed for publishing human pose | default = %(default)s", + ) + + args = parser.parse_args() + + assert has_mmdet, "Please install mmdet to run the demo." 
+ assert args.det_config is not None + assert args.det_checkpoint is not None + + prInfo("Loaded with args : {}".format(args)) + + rospy.init_node("python_orbbec_inference", anonymous=True) + my_node = InferenceNodeRGBD(args) + my_node.start() + cv2.destroyAllWindows() diff --git a/rgbd_detect_3d_dir.py b/rgbd_detect_3d_dir.py new file mode 100644 index 0000000..bbbc26e --- /dev/null +++ b/rgbd_detect_3d_dir.py @@ -0,0 +1,2025 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# TODO : sort imports + +# mmdet and mmpose import +from mmpose.apis import ( + get_track_id, + inference_top_down_pose_model, + init_pose_model, + process_mmdet_results, +) +from mmpose.datasets import DatasetInfo + +try: + from mmdet.apis import inference_detector, init_detector + + has_mmdet = True +except (ImportError, ModuleNotFoundError): + has_mmdet = False + +# ros related import +import rospy +from sensor_msgs.msg import Image, PointCloud2 +from geometry_msgs.msg import TransformStamped +from cv_bridge import CvBridge +import tf2_ros + +# other import +import cv2 +import os +import matplotlib.pyplot as plt +from argparse import ArgumentParser +from datetime import datetime +import time +import json +import warnings +import numpy as np +from PyKDL import Rotation +import copy +# import imageio +from PIL import Image as PILImage + +from utils import * + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader + +try: + has_mb = True + # motion bert import + from lib.utils.tools import * + from lib.utils.learning import * + from lib.utils.utils_data import flip_data + from lib.data.dataset_wild import WildDetDataset + from lib.utils.vismo import render_and_save +except: + has_mb = False + prWarning("No MotionBERT import, fail") + +try: + # gafa import (optional) + has_gafa = True + from gafa_utils import body_transform, head_transform, head_transform_rest, normalize_img, body_transform_from_bb, normalize_img_torch, head_transform_face + from gafa.models.gazenet import GazeNet +except: + has_gafa = False + prWarning("No GAFA import, fail") + from gafa_utils import body_transform, head_transform, head_transform_rest, normalize_img, body_transform_from_bb, normalize_img_torch, head_transform_face + +# 6D Rep import +try: + has_sixdrep = True + from sixdrep.util import FaceDetector, compute_euler + from sixdrep.utils import sixdreptransform +except: + has_sixdrep = False + prWarning("No 6D Rep import, fail") + + +# gaze estimation simple models import +try: + has_gaze_est = True + from gaze_estimation.models import resnet18, mobilenet_v2, mobileone_s0 + from gaze_estimation.utils import pre_process + from gaze_estimation.models import SCRFD +except: + has_gaze_est = False + prWarning("No GazeEst import, fail") + + +# remove numpy scientific notation +np.set_printoptions(suppress=True) + + +class InferenceNodeRGBD(object): + def __init__(self, args): + + # init args + self.args = args + + # init detector and pose + self.det_model = init_detector( + args.det_config, args.det_checkpoint, device=args.device.lower() + ) + + self.pose_model = init_pose_model( + args.pose_config, args.pose_checkpoint, device=args.device.lower() + ) + + # if enabled, init MotionBERT + if self.args.use_mb: + # init 3d MotionBERT model + prInfo('Initialiazing 3D Pose Lifter {}'.format(args.mb_checkpoint)) + mb_3d_args = get_config(args.mb_3d_config) + self.motionbert_3d_model = load_backbone(mb_3d_args) + if torch.cuda.is_available(): + self.motionbert_3d_model = 
nn.DataParallel(self.motionbert_3d_model) + self.motionbert_3d_model = self.motionbert_3d_model.cuda() + else: + prError("Expect cuda to be available but is_available returned false") + exit(0) + + prInfo('Loading checkpoint {}'.format(args.mb_checkpoint)) + mb_checkpoint = torch.load(args.mb_checkpoint, map_location=lambda storage, loc: storage) + self.motionbert_3d_model.load_state_dict(mb_checkpoint['model_pos'], strict=True) + self.motionbert_3d_model.eval() + prInfo('Loaded motionbert_3d_model') + # no need for the whole WildDetDataset stuff, just manually make the input trajectories for the tracks + + # if enabled, init GAFA + if self.args.use_gafa: + self.gafa_model = GazeNet(n_frames=self.args.gafa_n_frames) + self.gafa_model.load_state_dict(torch.load( + self.args.gafa_checkpoint)) #, map_location=torch.device("cpu"))['state_dict']) + + self.gafa_model.cuda() + self.gafa_model.eval() + + prInfo( + "Loaded GAFA model from {}".format( + self.args.gafa_checkpoint)) + + # if enabled, init gaze resnet + if self.args.use_gaze_resnet: + self.face_detector = SCRFD(model_path="./gaze_estimation/weights/det_10g.onnx") + self.gaze_estimation_model = resnet18(pretrained = False, num_classes = 90) + state_dict = torch.load("./gaze_estimation/weights/resnet18.pt", map_location=args.device.lower()) + self.gaze_estimation_model.load_state_dict(state_dict) + self.gaze_estimation_model.to(args.device.lower()) + self.gaze_estimation_model.eval() + prInfo('Loaded ResNet18 for gaze estimation') + + # if enabled, init 6DRep + if self.args.use_six_d_rep: + + self.sixdrep_model = torch.load(f='./sixdrep/weights/best.pt', map_location='cuda') + self.sixdrep_model = self.sixdrep_model['model'].float().fuse() + self.sixdrep_detector = FaceDetector('./sixdrep/weights/detection.onnx') + + self.sixdrep_model.half() + self.sixdrep_model.eval() + + + # dataset for detection and pose + self.dataset = self.pose_model.cfg.data["test"]["type"] + self.dataset_info = self.pose_model.cfg.data["test"].get( + "self.dataset_info", None + ) + if self.dataset_info is None: + warnings.warn( + "Please set `self.dataset_info` in the config." 
+ "Check https://github.com/open-mmlab/mmpose/pull/663 for details.", + DeprecationWarning, + ) + else: + self.dataset_info = DatasetInfo(self.dataset_info) + + self.return_heatmap = False + + # variables to keep tracks along time or in the current frame + self.next_id = 0 + self.pose_results = [] + self.tracks_in_current_image = {} + self.tracks = {} # all the tracks along time, we need to keep and history with some data + + # shared variables for the received images and pcl + self.rgb = None # Image frame + self.depth = None # Image frame + + self.pcl_array_rgb = None + self.pcl_array_xyz = None + + # viewing options + self.depth_array_max_threshold = 20000 + self.depth_cmap = get_mpl_colormap(args.depth_cmap) + self.confidence_cmap = get_mpl_colormap("viridis") + self.vis_img = None # output image RGB + detections + self.view_all_classes_dets = True + self.display_all_detection = args.display_all_detection + self.light_display = args.light_display + + # counter for the incoming frames + self.pcl_current_seq = -1 + self.rgb_current_seq = -1 + self.last_inferred_seq = -1 + self.depth_current_seq = -1 + self.current_image_count = 0 + self.rgb_frame_id = None # received from ROS image + + # CV Bridge for receiving frames + self.br = CvBridge() + + # Set ROS node rate + prInfo("Setting node rate to {} fps".format(args.fps)) + self.loop_rate = rospy.Rate(args.fps) + + # create the output path + now = datetime.now() + timestamp = now.strftime("%Y_%m_%d_%H_%M_%S") + self.save_dir = os.path.join("output", "record_{:s}".format(timestamp)) + self.metadata = os.path.join(self.save_dir, "metadata.json") + self.save_dir_rgb = os.path.join(self.save_dir, "rgb") + self.save_dir_depth = os.path.join(self.save_dir, "depth") + self.save_dir_result = os.path.join(self.save_dir, "output") + self.save_dir_pcl_bin = os.path.join(self.save_dir, "pcl") + + if args.save or args.light_save: + prInfo( + "Saving to {}/[rgb][depth][depth_color][output][pcl]".format( + self.save_dir + ) + ) + if not os.path.exists(self.save_dir): + prInfo( + "Creating directories to {}/[rgb][depth][depth_color][output][pcl]".format( + self.save_dir + ) + ) + os.makedirs(self.save_dir) + os.makedirs(self.save_dir_rgb) + os.makedirs(self.save_dir_pcl_bin) + + if args.save: + os.makedirs(self.save_dir_depth) + os.makedirs(self.save_dir_result) + + args_dic = vars(args) + with open(self.metadata, "w") as fp: + json.dump(args_dic, fp) + + prSuccess( + "Created directories to {}/[rgb][depth][depth_color][output][pcl]".format( + self.save_dir + ) + ) + time.sleep(1) + + # ROS publishers + self.goal_pub = rospy.Publisher( + args.namespace + "/human", TransformStamped, queue_size=1 + ) + + self.tf_br = tf2_ros.TransformBroadcaster() + + # ROS subscribers + rgb_topic = args.namespace + "/rgb" + depth_topic = args.namespace + "/depth" + pcl_topic = args.namespace + "/pcl" + prInfo("Subscribing to {} for RGB".format(rgb_topic)) + rospy.Subscriber(rgb_topic, Image, self.callback_rgb) + prInfo("Subscribing to {} for depth".format(depth_topic)) + rospy.Subscriber(depth_topic, Image, self.callback_depth) + prInfo("Subscribing to {} for PCL".format(pcl_topic)) + rospy.Subscriber(pcl_topic, PointCloud2, self.callback_pcl) + + + def callback_pcl(self, msg): + if self.args.flip: + pcl_array = np.frombuffer(msg.data, dtype=np.float32).reshape( + (msg.height, msg.width, -1) + )[::-1, ::-1, :] + else: + pcl_array = np.frombuffer(msg.data, dtype=np.float32).reshape( + (msg.height, msg.width, -1) + ) + + # pcl_array = pcl_array[::-1, :, :] + 
self.pcl_array_xyz = pcl_array[:, :, :3] + # self.pcl_array_rgb = pcl_array[:,:,3:] + self.pcl_current_seq = msg.header.seq + # rospy.loginfo('pcl received ({})...'.format(msg.header.seq)) + + def callback_rgb(self, msg): + if self.rgb_frame_id != msg.header.frame_id: + self.rgb_frame_id = msg.header.frame_id + if self.args.flip: + self.rgb = cv2.flip(self.br.imgmsg_to_cv2(msg, "bgr8"), -1) + else: + self.rgb = self.br.imgmsg_to_cv2(msg, "bgr8") + + # self.rgb = cv2.rotate(self.rgb, cv2.ROTATE_180) + self.rgb_current_seq = msg.header.seq + # rospy.loginfo('RGB received ({})...'.format(msg.header.seq)) + self.rgb_timestamp = msg.header.stamp + + def callback_depth(self, msg): + if self.args.flip: + self.depth = cv2.flip(self.br.imgmsg_to_cv2(msg, "mono16"), -1) + else: + self.depth = self.br.imgmsg_to_cv2(msg, "mono16") + + self.depth_current_seq = msg.header.seq + # rospy.loginfo('Depth received ({})...'.format(msg.header.seq)) + + def is_ready(self): + ready = ( + (self.rgb is not None) + and (self.depth is not None) + and (self.pcl_array_xyz is not None) + ) + return ready + + @timeit + def save_rgb(self, image_count, image_seq_unique, timestamp): + prWarning("Saving images here may suffer synchronization issues, use visualizer.py for lighter save") + rgb_path = os.path.join( + self.save_dir_rgb, + "{:08d}_seq_{:010d}_ts_{}.png".format( + image_count, image_seq_unique, timestamp + ), + ) + cv2.imwrite(rgb_path, self.rgb) + prSuccess("Saved RGB to {}".format(rgb_path)) + + @timeit + def save_depth(self, image_count, image_seq_unique, timestamp): + prWarning("Saving images here may suffer synchronization issues, use visualizer.py for lighter save") + depth_path = os.path.join( + self.save_dir_depth, + "{:08d}_seq_{:010d}_ts_{}.png".format( + image_count, image_seq_unique, timestamp + ), + ) + cv2.imwrite(depth_path, self.depth) + prSuccess("Saved depth to {}".format(depth_path)) + + @timeit + def save_output_image(self, image_count, image_seq_unique, timestamp): + prWarning("Saving images here may suffer synchronization issues, use visualizer.py for lighter save") + results_path = os.path.join( + self.save_dir_result, + "{:08d}_seq_{:010d}_ts_{}.png".format( + image_count, image_seq_unique, timestamp + ), + ) + cv2.imwrite(results_path, self.vis_img) + prSuccess("Saved result to {}".format(results_path)) + + @timeit + def save_pcl(self, image_count, image_seq_unique, timestamp): + prWarning("Saving images here may suffer synchronization issues, use visualizer.py for lighter save") + pcl_path = os.path.join( + self.save_dir_pcl_bin, + "{:08d}_seq_{:010d}_ts_{}.bin".format( + image_count, image_seq_unique, timestamp + ), + ) + self.pcl_array_xyz.tofile(pcl_path) + prSuccess("Saved pcl to {}".format(pcl_path)) + + @timeit + def plot_mmdet_bbox(self, mmdet_results, array_shape): + for c in range(len(mmdet_results)): + if len(mmdet_results[c]) > 0: + for bi in range(mmdet_results[c].shape[0]): + if mmdet_results[c][bi, 4] > self.args.bbox_thr: + bbox = ( + mmdet_results[c][bi, :4] + .copy() + .astype(np.int32) + ) + bbox_ints = [ + int(bbox[0]), + int(bbox[1]), + int(bbox[2]), + int(bbox[3]), + ] + pt1 = ( + min( + max(0, bbox_ints[0]), + array_shape[1], + ), + min( + max(0, bbox_ints[1]), + array_shape[0], + ), + ) + pt2 = ( + min( + max(0, bbox_ints[2]), + array_shape[1], + ), + min( + max(0, bbox_ints[3]), + array_shape[0], + ), + ) + cv2.rectangle( + self.vis_img, pt1, pt2, (255, 255, 255), 1 + ) + cv2.putText( + self.vis_img, + "{:s} ({:.0f}%)".format( + YOLO_COCO_80_CLASSES[c], + 
mmdet_results[c][bi, 4] * 100, + ), + pt1, + cv2.FONT_HERSHEY_SIMPLEX, + 0.5 * TEXT_SCALE, + (255, 255, 255), + 1, + ) + + @timeit + def plot_xyxy_person_bbox(self, idx, bbox, array_shape, track, poses_torso = None): + bbox_ints = [ + int(bbox[0]), + int(bbox[1]), + int(bbox[2]), + int(bbox[3]), + ] + pt1 = ( + min(max(0, bbox_ints[0]), array_shape[1]), + min(max(0, bbox_ints[1]), array_shape[0]), + ) + pt2 = ( + min(max(0, bbox_ints[2]), array_shape[1]), + min(max(0, bbox_ints[3]), array_shape[0]), + ) + color = RANDOM_COLORS[idx] + # color_tuple = (int(color[0]), int(color[1]), int(color[2])) + color_tuple = (255,255,255) + + # yolo score + score = bbox[4] + + # current gaze + if len(track["gaze_yaw_rad"]) > 0: + yaw_g = int(np.rad2deg(track["gaze_yaw_rad"][-1])) + pitch_g = int(np.rad2deg(track["gaze_pitch_rad"][-1])) + if yaw_g == 180 or pitch_g == 180: + yaw_g = "Unk" + pitch_g = "Unk" + else: + yaw_g = "Unk" + pitch_g = "Unk" + + # curent depth + if len(track["depth_face"]) > 0: + depth_f = track["depth_face"][-1] + else: + depth_f = "Unk" + + # position + if poses_torso is not None: + pose_body = np.array(poses_torso).mean(axis=0) + pose_body_n = pose_body.copy() + if type(pose_body) == np.ndarray: + pose_body_n[0] = pose_body[2] + pose_body_n[1] = -pose_body[0] + pose_body_n[2] = -pose_body[1] + else: + pose_body_n = ["Unk", "Unk", "Unk"] + else: + pose_body_n = ["Unk", "Unk", "Unk"] + + # attention + heat_count = 0 + history = 20 + length = min(len(track["depth_face"]), history) + for i in range(length): + index = len(track["depth_face"]) - i - 1 + yaw = np.rad2deg(track["gaze_yaw_rad"][index]) + pitch = np.rad2deg(track["gaze_pitch_rad"][index]) + depth = track["depth_face"][index] + + thresh = (int(depth / 1000) + 1) * 3 # 3 deg per meter + if np.abs(yaw) < thresh and pitch < 0: + heat_count += 1 + # suppose we are looking down + + attention_score = int( min((heat_count * 2 / history), 1) * 100) + + draw_bbox_with_corners(self.vis_img, bbox_ints, color = color_tuple, thickness = 5, proportion = 0.2) + + text = "Person : {}% | Attention : {}%".format(score, attention_score) + if poses_torso is not None and type(pose_body) == np.ndarray: + text2 = "Yaw = {} | Pitch = {} | pos = ({:.2f}, {:.2f}, {:.2f})".format(yaw_g, pitch_g, pose_body_n[0], pose_body_n[1], pose_body_n[2]) + + cv2.putText( + self.vis_img, + text, + (bbox_ints[0], bbox_ints[1] - 30), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5 * TEXT_SCALE, + color_tuple, + 1, + ) + + if poses_torso is not None and type(pose_body) == np.ndarray: + cv2.putText( + self.vis_img, + text2, + (bbox_ints[0], bbox_ints[1] - 15), + cv2.FONT_HERSHEY_SIMPLEX, + 0.35 * TEXT_SCALE, + color_tuple, + 1, + ) + + # cv2.rectangle(self.vis_img, pt1, pt2, color_tuple, 2) + + @timeit + def process_keypoints(self, keypoints, depth_array, idx): + body_center_joints = ( + [] + ) # to store center of lsho, rsho, lhip, rhip in pixels + color = RANDOM_COLORS[idx] + # color_tuple = (int(color[0]), int(color[1]), int(color[2])) + color_tuple = (255,255,255) + + for j in range(keypoints.shape[0]): + + kp = keypoints[j, :] + confidence = int(kp[2] * 255) + confidence_color = ( + self.confidence_cmap[min(255, confidence)] * 255 + ).astype(np.uint8) + + if ( + kp[2] > self.args.kpt_thr + and kp[0] > 0 + and kp[1] > 0 + and kp[0] < depth_array.shape[1] + and kp[1] < depth_array.shape[0] + ): + + if (j == 5) or (j == 6) or (j == 11) or (j == 12): + # one keypoint of the torso + body_center_joints.append(kp) + + if not self.args.no_show and not self.args.light_display: + # 
kp_color_tuple = (int(confidence_color[0]), int(confidence_color[1]), int(confidence_color[2])) + cv2.circle( + self.vis_img, + (int(kp[0]), int(kp[1])), + 2, + color_tuple, + thickness=3, + ) + + # if wrists, find depth and pose + + if j == 10: + # right wrist + depth_wrist = depth_array[int(kp[1]), int(kp[0])] + pose_wrist = self.pcl_array_xyz[ + int(kp[1]), int(kp[0]), : + ] + self.tracks_in_current_image[idx][ + "right_wrist_depth" + ] = depth_wrist + self.tracks_in_current_image[idx][ + "right_wrist_pose" + ] = pose_wrist + if not self.light_display and not self.args.no_show: + cv2.drawMarker( + self.vis_img, + (int(kp[0]), int(kp[1])), + color=color_tuple, + thickness=3, + markerType=cv2.MARKER_CROSS, + line_type=cv2.LINE_AA, + markerSize=8, + ) + # cv2.putText( + # self.vis_img, + # "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format( + # depth_wrist / 10, + # pose_wrist[0], + # pose_wrist[1], + # pose_wrist[2], + # ), + # (int(kp[0]), int(kp[1])), + # cv2.FONT_HERSHEY_SIMPLEX, + # 0.5 * TEXT_SCALE, + # (255,255,255), + # 2, + # ) + # cv2.putText( + # self.vis_img, + # "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format( + # depth_wrist / 10, + # pose_wrist[0], + # pose_wrist[1], + # pose_wrist[2], + # ), + # (int(kp[0]), int(kp[1])), + # cv2.FONT_HERSHEY_SIMPLEX, + # 0.5 * TEXT_SCALE, + # color_tuple, + # 1, + # ) + + elif j == 9: + # left wrist + depth_wrist = depth_array[int(kp[1]), int(kp[0])] + pose_wrist = self.pcl_array_xyz[ + int(kp[1]), int(kp[0]), : + ] + self.tracks_in_current_image[idx][ + "left_wrist_depth" + ] = depth_wrist + self.tracks_in_current_image[idx][ + "left_wrist_pose" + ] = pose_wrist + if not self.light_display and not self.args.no_show: + cv2.drawMarker( + self.vis_img, + (int(kp[0]), int(kp[1])), + color=color_tuple, + thickness=3, + markerType=cv2.MARKER_CROSS, + line_type=cv2.LINE_AA, + markerSize=8, + ) + # cv2.putText( + # self.vis_img, + # "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format( + # depth_wrist / 10, + # pose_wrist[0], + # pose_wrist[1], + # pose_wrist[2], + # ), + # (int(kp[0]), int(kp[1])), + # cv2.FONT_HERSHEY_SIMPLEX, + # 0.5 * TEXT_SCALE, + # (255,255,255), + # 2, + # ) + # cv2.putText( + # self.vis_img, + # "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format( + # depth_wrist / 10, + # pose_wrist[0], + # pose_wrist[1], + # pose_wrist[2], + # ), + # (int(kp[0]), int(kp[1])), + # cv2.FONT_HERSHEY_SIMPLEX, + # 0.5 * TEXT_SCALE, + # color_tuple, + # 1, + # ) + + return body_center_joints + + @timeit + def get_depth_and_poses_of_torso(self, depth_array, lsho, rsho, lhip, rhip, idx): + + color = RANDOM_COLORS[idx] + # color_tuple = (int(color[0]), int(color[1]), int(color[2])) + color_tuple = (255,255,255) + + # find 4 points between lsho and rhip and 4 points between rsho and lhip to find something more precise + seg_steps = [0.0, 0.25, 0.50, 0.75, 1.0] + depths_torso = [] + poses_torso = [] + for step in seg_steps: + + p1 = step * lsho + (1 - step) * rhip + if ( + p1[0] < depth_array.shape[1] + and p1[1] < depth_array.shape[0] + ): + depth_p1 = depth_array[int(p1[1]), int(p1[0])] + pose_p1 = self.pcl_array_xyz[ + int(p1[1]), int(p1[0]), : + ] + if depth_p1 > 0: + depths_torso.append(depth_p1) + poses_torso.append(pose_p1) + + p2 = step * rsho + (1 - step) * lhip + if ( + p2[0] < depth_array.shape[1] + and p2[1] < depth_array.shape[0] + ): + depth_p2 = depth_array[int(p2[1]), int(p2[0])] + pose_p2 = self.pcl_array_xyz[ + int(p2[1]), int(p2[0]), : + ] + if depth_p2 > 0: + depths_torso.append(depth_p2) + poses_torso.append(pose_p2) + + if not self.args.no_show: + # draw to check + 
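# diamond markers show the sampled torso points used below for the averaged depth/pose +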
cv2.drawMarker( + self.vis_img, + (int(p1[0]), int(p1[1])), + color=color_tuple, + thickness=1, + markerType=cv2.MARKER_DIAMOND, + line_type=cv2.LINE_AA, + markerSize=8, + ) + cv2.drawMarker( + self.vis_img, + (int(p2[0]), int(p2[1])), + color=color_tuple, + thickness=1, + markerType=cv2.MARKER_DIAMOND, + line_type=cv2.LINE_AA, + markerSize=8, + ) + + return depths_torso, poses_torso + + @timeit + def plot_body_pose_data(self, body_center, depth_body, pose_body, idx): + + color = RANDOM_COLORS[idx] + # color_tuple = (int(color[0]), int(color[1]), int(color[2])) + color_tuple = (255,255,255) + + cv2.drawMarker( + self.vis_img, + body_center, + color = color_tuple, + thickness=1, + markerType=cv2.MARKER_TILTED_CROSS, + line_type=cv2.LINE_AA, + markerSize=16, + ) + # cv2.putText( + # self.vis_img, + # "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format( + # depth_body / 10, + # pose_body[0], + # pose_body[1], + # pose_body[2], + # ), + # (int(body_center[0]), int(body_center[1])), + # cv2.FONT_HERSHEY_SIMPLEX, + # 0.8 * TEXT_SCALE, + # (0, 255, 0), + # 3, + # ) + cv2.putText( + self.vis_img, + "{:.0f}cm".format( + depth_body / 10 + ), + (int(body_center[0]), int(body_center[1])), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5 * TEXT_SCALE, + (255, 255, 255), + 1, + ) + + @timeit + def plot_skeleton_2d(self, keypoints, idx): + + color = RANDOM_COLORS[idx] + # color_tuple = (int(color[0]), int(color[1]), int(color[2])) + color_tuple = (255,255,255) + + for limb in COCO17_JOINTS_LIMBS: + start = keypoints[limb[0], :] + end = keypoints[limb[1], :] + start_point = (int(start[0]), int(start[1])) + end_point = (int(end[0]), int(end[1])) + if (start[2] > self.args.kpt_thr) and ( + end[2] > self.args.kpt_thr + ): + cv2.line( + self.vis_img, + start_point, + end_point, + color = color_tuple, + thickness=1, + ) + @timeit + def plot_det_text_info(self, pose_closest): + if pose_closest is not None: + cv2.putText( + self.vis_img, + "{:.2f} {:.2f} {:.2f}".format( + pose_closest[0], pose_closest[1], pose_closest[2] + ), + (30, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1.2 * TEXT_SCALE, + (255, 255, 255), + 5, + ) + cv2.putText( + self.vis_img, + "{:.2f} {:.2f} {:.2f}".format( + pose_closest[0], pose_closest[1], pose_closest[2] + ), + (30, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1.2 * TEXT_SCALE, + (0, 0, 0), + 3, + ) + else: + cv2.putText( + self.vis_img, + "No tracks with pose found", + (30, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1.2 * TEXT_SCALE, + (255, 255, 255), + 5, + ) + cv2.putText( + self.vis_img, + "No tracks with pose found", + (30, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1.2 * TEXT_SCALE, + (0, 0, 0), + 3, + ) + + @timeit + def plot_gaze_text_info(self, gaze_res, head_outputs, body_outputs, head_bb_abs, idx): + prediction = gaze_res['direction'] + kappa = gaze_res['kappa'][0, -1].item() + prediction_body = body_outputs['direction'] + prediction_head = head_outputs['direction'] + + prediction_show = prediction.clone().cpu().detach().numpy()[0, -1, :] + prediction_show_body = prediction_body.clone().cpu().detach().numpy()[0, -1, :] + prediction_show_head = prediction_head.clone().cpu().detach().numpy()[0, -1, :] + + prediction_show_norm = prediction_show / np.linalg.norm(prediction_show) + prediction_show_norm_body = prediction_show_body / np.linalg.norm(prediction_show_body) + prediction_show_norm_head = prediction_show_head / np.linalg.norm(prediction_show_head) + + cv2.putText( + self.vis_img, + "Gaze {:.2f} {:.2f} {:.2f} ({:.2f})".format( + prediction_show_norm[0], prediction_show_norm[1], prediction_show_norm[2], kappa + ), + (30, 70), + 
cv2.FONT_HERSHEY_SIMPLEX, + 1 * TEXT_SCALE, + (255, 255, 255), + 5, + ) + + cv2.putText( + self.vis_img, + "Gaze {:.2f} {:.2f} {:.2f} ({:.2f})".format( + prediction_show_norm[0], prediction_show_norm[1], prediction_show_norm[2], kappa + ), + (30, 70), + cv2.FONT_HERSHEY_SIMPLEX, + 1 * TEXT_SCALE, + (0, 255, 0), + 3, + ) + + + @timeit + def plot_gaze_and_body_dir(self, gaze_res, head_outputs, body_outputs, head_bb_abs, body_bbox): + head_bb_abs[2] += head_bb_abs[0] + head_bb_abs[3] += head_bb_abs[1] + + prediction = gaze_res['direction'] + prediction_body = body_outputs['direction'] + prediction_head = head_outputs['direction'] + + prediction_show = prediction.clone().cpu().detach().numpy()[0, -1, :] + prediction_show_body = prediction_body.clone().cpu().detach().numpy()[0, -1, :] + prediction_show_head = prediction_head.clone().cpu().detach().numpy()[0, -1, :] + + prediction_show_norm = prediction_show / np.linalg.norm(prediction_show) + prediction_show_norm_body = prediction_show_body / np.linalg.norm(prediction_show_body) + prediction_show_norm_head = prediction_show_head / np.linalg.norm(prediction_show_head) + + gaze_dir_2d = prediction_show_norm[0:2] + body_dir_2d = prediction_show_norm_body[0:2] + head_dir_2d = prediction_show_norm_head[0:2] + + body_center = (int((body_bbox[0] + body_bbox[2]) / 2), int((body_bbox[1] + body_bbox[3]) / 2)) + head_center = (int(head_bb_abs[0] / 2 + head_bb_abs[2] / 2), int(head_bb_abs[1] / 2 + head_bb_abs[3] / 2)) + + des = (head_center[0] + int(gaze_dir_2d[0]*150), int(head_center[1] + gaze_dir_2d[1]*150)) + des_body = (body_center[0] + int(body_dir_2d[0]*150), int(body_center[1] + body_dir_2d[1]*150)) + des_head = (head_center[0] + int(head_dir_2d[0]*150), int(head_center[1] + head_dir_2d[1]*150)) + + cv2.arrowedLine(self.vis_img, head_center, des, (0, 255, 0), 3, tipLength=0.3) + cv2.arrowedLine(self.vis_img, body_center, des_body, (0, 255, 255), 3, tipLength=0.3) + cv2.arrowedLine(self.vis_img, head_center, des_head, (255, 255, 255), 3, tipLength=0.3) + + + @timeit + def plot_gaze_from_pitch_yaw(self, pitch, yaw, head_bb_abs, idx, keypoints): + + # color = RANDOM_COLORS[idx] + # color_tuple = (int(color[0]), int(color[1]), int(color[2])) + color_tuple = (0,0,255) + + head_bb_abs[2] += head_bb_abs[0] + head_bb_abs[3] += head_bb_abs[1] + + prediction_show = np.zeros(3) + prediction_show[0] = -np.sin(pitch) * np.cos(yaw) + prediction_show[1] = -np.sin(yaw) + prediction_show[2] = 999 + + # prediction_show_norm = prediction_show / np.linalg.norm(prediction_show) + + gaze_dir_2d = prediction_show[0:2] + + # head_center = (int(head_bb_abs[0] / 2 + head_bb_abs[2] / 2), int(head_bb_abs[1] / 2 + head_bb_abs[3] / 2)) + head_center = (int(keypoints[1,0] / 2 + keypoints[2,0] / 2), int(keypoints[1,1] / 2 + keypoints[2,1] / 2)) + + des = (head_center[0] + int(gaze_dir_2d[0]*150), int(head_center[1] + gaze_dir_2d[1]*150)) + + # cv2.arrowedLine(self.vis_img, head_center, des, (255,255,255), 3, tipLength=0.3) + cv2.arrowedLine(self.vis_img, head_center, des, color_tuple, 2, tipLength=0.1) + cv2.circle(self.vis_img, head_center, 5, color = color_tuple, thickness=-1) + + @timeit + def plot_gaze_angle_info(self, pitch, yaw, head_bb, idx): + color = RANDOM_COLORS[idx] + color_tuple = (int(color[0]), int(color[1]), int(color[2])) + + cv2.putText( + self.vis_img, + "{:.2f} {:.2f} deg".format( + pitch, yaw + ), + (head_bb[0] + 30, head_bb[1] + 30), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5 * TEXT_SCALE, + (255,255,255), + 2, + ) + cv2.putText( + self.vis_img, + "{:.2f} {:.2f} 
deg".format( + pitch, yaw + ), + (head_bb[0] + 30, head_bb[1] + 30), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5 * TEXT_SCALE, + color_tuple, + 1, + ) + + @timeit + def get_gafa_input_from_current_image(self, image, keypoints, body_yolo_bbox): + + body_yolo_bbox_int = {} + body_yolo_bbox_int["u"] = int(body_yolo_bbox[0]) + body_yolo_bbox_int["v"] = int(body_yolo_bbox[1]) + body_yolo_bbox_int["w"] = int(body_yolo_bbox[2] - body_yolo_bbox[0]) + body_yolo_bbox_int["h"] = int(body_yolo_bbox[3] - body_yolo_bbox[1]) + + # use torch instead of PIL because faster conversion + # image_pil = PILImage.fromarray(image) + image_torch = torch.from_numpy(image.copy()).moveaxis(2, 0) + + item = { + "image": image_torch, + "keypoints": keypoints[:, :2], + } + + # get head bb in pixels + head_trans = head_transform(item) + head_bb = head_trans['bb'] + head_bb = np.array([head_bb['u'], head_bb['v'], head_bb['w'], head_bb['h']]).astype(np.float32) + + # get body bb in pixels + # body_trans = body_transform(item) + body_trans = body_transform(item) + body_bb = body_trans['bb'] + body_bb = np.array([body_bb['u'], body_bb['v'], body_bb['w'], body_bb['h']]) + body_image = body_trans['image'] # keep as tensor + + # change head bb to relative to body bb + head_bb_abs = head_bb.copy() + + head_bb[0] -= body_bb[0] + head_bb[1] -= body_bb[1] + + head_bb[0] = head_bb[0] / body_bb[2] + head_bb[1] = head_bb[1] / body_bb[3] + head_bb[2] = head_bb[2] / body_bb[2] + head_bb[3] = head_bb[3] / body_bb[3] + + # store body center + norm_body_center = (body_bb[[0, 1]] + body_bb[[2, 3]] / 2) / body_bb[[2,3]] + + # normalize image + # img = normalize_img(image = body_image)['image'] # with albumnentations normalization + # img = img.transpose(2, 0, 1) # with albumnentations normalization + img = normalize_img_torch((body_image.float())/255) # ith torchvision normalization, to float and in range [0-1] before normalization + + assert(img.shape[0] == 3) + assert(img.shape[1] == 256) + assert(img.shape[2] == 192) + + # create mask of head bounding box + head_mask = np.zeros((1, img.shape[1], img.shape[2])) + head_bb_int = head_bb.copy() + head_bb_int[[0, 2]] *= img.shape[2] + head_bb_int[[1, 3]] *= img.shape[1] + head_bb_int[2] += head_bb_int[0] + head_bb_int[3] += head_bb_int[1] + head_bb_int = head_bb_int.astype(np.int64) + head_bb_int[head_bb_int < 0] = 0 + + head_mask[:, head_bb_int[1]:head_bb_int[3], head_bb_int[0]:head_bb_int[2]] = 1 + + return img, head_mask, norm_body_center, head_bb_abs + + @timeit + def plot_overlay_face_attention(self, track, head_bbox): + x_min, y_min, x_max, y_max = map(int, head_bbox[:4]) + + valid_depths = [] + valid_yaws = [] + valid_pitchs = [] + + heat_count = 0 + history = 1 + length = min(len(track["depth_face"]), history) + for i in range(length): + index = len(track["depth_face"]) - i - 1 + yaw = np.rad2deg(track["gaze_yaw_rad"][index]) + pitch = np.rad2deg(track["gaze_pitch_rad"][index]) + depth = track["depth_face"][index] + + thresh = (int(depth / 1000) + 1) * 5 # 5 deg per meter + if np.abs(yaw) < thresh and np.abs(pitch) < thresh: + heat_count += 1 + + cv2.putText( + self.vis_img, + "{:d}".format( + heat_count + ), + (x_min - 30, y_min + 30), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5 * TEXT_SCALE, + (255,0,255), + 2, + ) + + overlay_img = self.vis_img.copy() + cv2.rectangle(overlay_img, (x_min,y_min), (x_max,y_max), color = (0,255,0), thickness = -1) + strength = (heat_count / history) * 0.75 + self.vis_img = cv2.addWeighted(self.vis_img,(1-strength),overlay_img,strength,0) + + @timeit + def 
plot_overlay_face_attention_6d(self, track, head_bbox, keypoints): + x_min, y_min, x_max, y_max = map(int, head_bbox[:4]) + + valid_depths = [] + valid_yaws = [] + valid_pitchs = [] + + heat_count = 0 + history = 20 + length = min(len(track["depth_face"]), history) + for i in range(length): + index = len(track["depth_face"]) - i - 1 + yaw = np.rad2deg(track["gaze_yaw_rad"][index]) + pitch = np.rad2deg(track["gaze_pitch_rad"][index]) + depth = track["depth_face"][index] + + thresh = (int(depth / 1000) + 1) * 5 # 5 deg per meter + if np.abs(yaw) < thresh and pitch < 0: + heat_count += 1 + # suppose we are looking down + + + # cv2.putText( + # self.vis_img, + # "{:d}".format( + # heat_count + # ), + # (x_min - 30, y_min + 30), + # cv2.FONT_HERSHEY_SIMPLEX, + # 0.5 * TEXT_SCALE, + # (255,0,255), + # 2, + # ) + + overlay_img = self.vis_img.copy() + + nose = keypoints[0,:2] + leye = keypoints[1,:2] + reye = keypoints[2,:2] + + colorval = min(((heat_count * 2) / history), 1.0) + strength = 0.5 + (heat_count / history) * 0.5 #(heat_count / history) + cmap = get_mpl_colormap("Reds") + color = (cmap[int(colorval * 255)] * 255) + color_tuple = (int(color[0]), int(color[1]), int(color[2])) + + # radius = np.linalg.norm(reye - leye) / 4 + # cv2.circle(overlay_img, (int(leye[0]), int(leye[1])), int(radius), color = color_tuple, thickness = -1) + # cv2.circle(self.vis_img, (int(leye[0]), int(leye[1])), int(radius), color = (0, 0, 0), thickness = 1) + + # cv2.circle(overlay_img, (int(reye[0]), int(reye[1])), int(radius), color = color_tuple, thickness = -1) + # cv2.circle(self.vis_img, (int(reye[0]), int(reye[1])), int(radius), color = (0, 0, 0), thickness = 1) + + ellipse_center = (int(leye[0] / 2 + reye[0] / 2), int(leye[1] / 2 + reye[1] / 2)) + ellipse_height = int(nose[1] - (leye[1] / 2 + reye[1] / 2)) + ellipse_width = int((leye[0] - reye[0]) * 1.1) + if ellipse_width > 0 and ellipse_height > 0: + cv2.ellipse(overlay_img, ellipse_center, (ellipse_width, ellipse_height), 0, 0, 360, color_tuple, 3) + + # cv2.rectangle(overlay_img, (x_min,y_min), (x_max,y_max), color = (0,255,0), thickness = -1) + # cv2.rectangle(self.vis_img, (x_min,y_min), (x_max,y_max), color = (255,255,255), thickness = 1) + + self.vis_img = cv2.addWeighted(self.vis_img,(1-strength),overlay_img,strength,0) + + + def start(self): + + while not rospy.is_shutdown(): + + if self.is_ready(): + + image_count = self.current_image_count + image_seq_unique = self.rgb_current_seq + now = datetime.now() + timestamp = now.strftime("%Y_%m_%d_%H_%M_%S_%f") + + if self.args.save or self.args.light_save: + self.save_rgb(image_count, image_seq_unique, timestamp) + + rgb_array = self.rgb.copy() + + if self.args.save: + self.save_depth(image_count, image_seq_unique, timestamp) + + depth_array = np.array(self.depth) + depth_array[depth_array > self.depth_array_max_threshold] = ( + self.depth_array_max_threshold + ) + + assert depth_array.shape[0] == rgb_array.shape[0] + assert depth_array.shape[1] == rgb_array.shape[1] + + # Process RGB array + if self.last_inferred_seq < self.rgb_current_seq: + + current_frame_processing = self.rgb_current_seq + current_timestamp = self.rgb_timestamp + current_frame_id = self.rgb_frame_id + prInfo("Do inference on frame {}".format(current_frame_processing)) + + # keep old poses for tracking + pose_results_last = self.pose_results + + tic = time.time() + mmdet_results = inference_detector( + self.det_model, rgb_array + ) # list of detection rectangle i.e [(x1,y1,x2,y2), ...] 
+ tac = time.time() + prTimer("YOLO detection", tic, tac) + + # keep the person class bounding boxes. + person_results = process_mmdet_results( + mmdet_results, self.args.det_cat_id + ) + + new_persons = [] + for person in person_results: + bbox = person["bbox"] + pt1 = (max(0, min(bbox[0], depth_array.shape[1]-1)), max(0,min(bbox[1], depth_array.shape[0]-1)) ) + pt2 = (max(0, min(bbox[2], depth_array.shape[1]-1)), max(0,min(bbox[3], depth_array.shape[0]-1)) ) + + # depth1 = depth_array[int(pt1[1]), int(pt1[0])] + # depth2 = depth_array[int(pt2[1]), int(pt2[0])] + # if depth1 > self.args.depth_limit_threshold or depth1 == 0 or depth2 > self.args.depth_limit_threshold or depth2 == 0: + # pass + # else: + if abs(pt1[0] - pt2[0]) > self.args.bb_min_threshold/2 or abs(pt1[1]-pt2[1]) > self.args.bb_min_threshold: + new_persons.append(person) + + person_results = new_persons + + tic = time.time() + # test a single image, with a list of bboxes. + self.pose_results, returned_outputs = inference_top_down_pose_model( + self.pose_model, + rgb_array, + person_results, + bbox_thr=self.args.bbox_thr, + format="xyxy", + dataset=self.dataset, + dataset_info=self.dataset_info, + return_heatmap=self.return_heatmap, + outputs=None, + ) + tac = time.time() + prTimer("ViTPose", tic, tac) + # get track id for each person instance + self.pose_results, self.next_id = get_track_id( + self.pose_results, + pose_results_last, + self.next_id, + use_oks=False, + tracking_thr=self.args.tracking_thr, + use_one_euro=self.args.euro, + fps=10, + ) + + # produce an output image + if not self.args.no_show: + self.vis_img = rgb_array.copy() + + if self.display_all_detection and not self.args.no_show: + self.plot_mmdet_bbox(mmdet_results, depth_array.shape) + + #### post processing, 3D lifting (if enabled) and gaze estimation (if enabled) #### + + # remove too old tracks + for idx, track in list(self.tracks.items()): + if abs(image_count - track["last_seen"]) > self.args.max_frames_remove_tracks: + prInfo("Removing track {}, not seen since frame {}, current is {}".format(idx, track["last_seen"], image_count)) + self.tracks.pop(idx) + + self.tracks_in_current_image = {} + + for res in self.pose_results: + + # for each instance + bbox = res["bbox"] + keypoints = res["keypoints"] + idx = res["track_id"] % 255 + + if idx in self.tracks_in_current_image.keys(): + prWarning("Track with idx {} (track_id {} from results) already in the current image, maybe because there are more than 255 detections in the image".format( + idx, res["track_id"] + )) + continue + + if idx not in self.tracks.keys(): + prInfo("Adding a new track with idx {}".format(idx)) + self.tracks[idx] = {} + self.tracks[idx]["last_seen"] = image_count + self.tracks[idx]["keypoints_2d"] = [] + self.tracks[idx]["images_crop"] = [] + self.tracks[idx]["head_masks"] = [] + self.tracks[idx]["norm_body_centers"] = [] + self.tracks[idx]["bboxes"] = [] + self.tracks[idx]["depth_face"] = [] + self.tracks[idx]["gaze_yaw_rad"] = [] + self.tracks[idx]["gaze_pitch_rad"] = [] + + # add keypoint to the current track + self.tracks[idx]["last_seen"] = image_count + self.tracks[idx]["keypoints_2d"].append(keypoints) + self.tracks[idx]["bboxes"].append(bbox) + + self.tracks_in_current_image[idx] = { + "right_wrist_depth": None, + "right_wrist_pose": None, + "left_wrist_depth": None, + "left_wrist_pose": None, + "depth_center": None, + "pose_center": None, + "pose_from": None, + "depth_face": None, + "gaze_yaw_rad": None, + "gaze_pitch_rad": None, + } + + # if history is long enough, 
process the trajectory with MotionBERT + if self.args.use_mb and len(self.tracks[idx]["keypoints_2d"]) >= self.args.mb_clip_len: + prInfo("Running MotionBERT for track {}".format(idx)) + + # prepare motion + motion = np.asarray(self.tracks[idx]["keypoints_2d"]) # T, 17, 3 + motion = motion[-self.args.mb_clip_len:, :, :] # keep only the required len + assert(motion.shape[1] == 17) + assert(motion.shape[2] == 3) + motion_h36 = coco2h36m(motion) # input is h36 format + motion_h36_scaled = crop_scale(motion_h36) # scale [1,1], normalize, crop + + with torch.no_grad(): + current_input = torch.Tensor(motion_h36_scaled).unsqueeze(0).cuda() + tic = time.time() + predicted_3d_pos = self.motionbert_3d_model(current_input) + tac = time.time() + prTimer("MotionBERT", tic, tac) + # root relative + predicted_3d_pos[:,:,0,:] = 0 # [1,T,17,3] + + predicted_3d_pos_np = predicted_3d_pos[0,-1,:,:].cpu().numpy() # keep only the last prediction + if "keypoints_3d" in self.tracks[idx].keys(): + self.tracks[idx]["keypoints_3d"].append(predicted_3d_pos_np) + else: + self.tracks[idx]["keypoints_3d"] = [predicted_3d_pos_np] * self.args.mb_clip_len # add fake padding at the begining so the lists align + + # print("len compare", idx, len(self.tracks[idx]["keypoints_3d"]), len(self.tracks[idx]["keypoints_2d"]), color = "yan") + + + # (run for every track or only closest ?) add input for gafa processing + # if for everyone should run in batch + if self.args.use_gafa and (len(self.tracks[idx]["images_crop"]) >= self.args.gafa_n_frames or self.args.gafa_no_history): + gafa_tic = time.time() + + # Make sure that the image is rgb and not bgr, may need conversion ! + # img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + # im_pil = Image.fromarray(img) + crop_img, head_mask, norm_body_center, head_bb_abs = self.get_gafa_input_from_current_image(rgb_array[:,:,::-1], keypoints, bbox) + + if self.args.gafa_no_history: + # no history : duplicate the last image + images = np.repeat(crop_img[np.newaxis, :, :, :], self.args.gafa_n_frames, axis = 0) # torch.Tensor of size [n_frames, 3, 256, 192] + head_masks = np.repeat(head_mask[np.newaxis, :, :, :], self.args.gafa_n_frames, axis = 0) # numpy.ndarray of size [n_frames, 3, 256, 192] + body_dvs = np.zeros((self.args.gafa_n_frames, 2)) # numpy.ndarray of size n_frames, 2 + + else: + # history : use the last saved n images + self.tracks[idx]["images_crop"].append(crop_img) + self.tracks[idx]["head_masks"].append(head_mask) + self.tracks[idx]["norm_body_centers"].append(norm_body_center) + + images = torch.stack(self.tracks[idx]["images_crop"][-self.args.gafa_n_frames:], dim = 0) # torch.Tensor of size n_frames, 3, 256, 192 + head_masks = np.asarray(self.tracks[idx]["head_masks"][-self.args.gafa_n_frames:]) # numpy.ndarray of size n_frames, 1, 256, 192 + norm_body_centers = np.asarray(self.tracks[idx]["norm_body_centers"][-self.args.gafa_n_frames:]) # numpy.ndarray of size n_frames, 2 + body_dvs = norm_body_centers - np.roll(norm_body_centers, shift=1, axis=0) # numpy.ndarray of size n_frames, 2 + + with torch.no_grad(): + # debug_dic = {} + + images = images.unsqueeze(0) #.cuda().float() + head_masks = torch.from_numpy(head_masks).unsqueeze(0) #.cuda().float() + body_dvs = torch.from_numpy(body_dvs).unsqueeze(0) #.cuda().float() + + # last_img = images[0, -1, : ,: ,:].clone() + # for i in range(7): + # images[0, i, : ,: ,:] = last_img + images = images.cuda().float() + + # last_mask = head_masks[0, -1, : ,: ,:].clone() + # for i in range(7): + # head_masks[0, i, : ,: ,:] = last_mask + 
head_masks = head_masks.cuda().float() + + # body_dvs = torch.zeros(body_dvs.shape) + body_dvs = body_dvs.cuda().float() + + # debug_dic["images"] = images.clone().cpu().numpy() + # debug_dic["head_masks"] = head_masks.clone().cpu().numpy() + # debug_dic["body_dvs"] = body_dvs.clone().cpu().numpy() + + tic = time.time() + gaze_res, head_outputs, body_outputs = self.gafa_model(images, head_masks, body_dvs) + tac = time.time() + prTimer("GAFA", tic, tac) + + # debug_dic["gaze_res"] = gaze_res["direction"].clone().cpu().numpy() + # debug_dic["head_outputs"] = head_outputs["direction"].clone().cpu().numpy() + # debug_dic["body_outputs"] = body_outputs["direction"].clone().cpu().numpy() + + # with open('./debug_dic.pickle', 'wb') as handle: + # pickle.dump(debug_dic, handle) + + # print("GAFA done", color = "green", background = "white") + + if not self.args.no_show: + self.plot_gaze_and_body_dir(gaze_res, head_outputs, body_outputs, head_bb_abs, bbox) + self.plot_xyxy_person_bbox(idx, head_bb_abs, depth_array.shape) + self.plot_gaze_text_info(gaze_res, head_outputs, body_outputs, head_bb_abs, idx) + + # For debug only + # plt.clf() + # plt.imshow(np.moveaxis(images.clone().cpu().numpy()[0,-1,:,:,:], 0, 2)) + # plt.title("Custom data") + # plt.pause(0.01) + + + gafa_tac = time.time() + prTimer("GAFA full", gafa_tic, gafa_tac) + + else: + if self.args.use_gafa and self.args.gafa_no_history: + prInfo("Did not add inputs because no GAFA history required") + elif self.args.use_gafa: + # do not accumulate if unused + crop_img, head_mask, norm_body_center, head_bb_abs = self.get_gafa_input_from_current_image(rgb_array[:,:,::-1], keypoints, bbox) + self.tracks[idx]["images_crop"].append(crop_img) + self.tracks[idx]["head_masks"].append(head_mask) + self.tracks[idx]["norm_body_centers"].append(norm_body_center) + prInfo("Didn't run GAFA yet, waiting for history") + + + if self.args.use_gaze_resnet: + with torch.no_grad(): + + # debug_dic = {"image_full": rgb_array} + + item = {"keypoints" : keypoints[:,:2]} + head_trans = head_transform_face(item) + head_bb = head_trans["bb"] + head_bb = np.array([head_bb['u'], head_bb['v'], head_bb['w'], head_bb['h']]).astype(np.int32) + + tic = time.time() + if self.args.use_face_detector: + face_bboxes, fd_kp = self.face_detector.detect(rgb_array) # or convert to bgr ?? ## only use the body detection so that we match easily... 
+ prWarning("Using face_detector does not provide any matching to the current idx of the frame, only using first detection !") + else: + face_bboxes = np.array([[head_bb[0],head_bb[1],head_bb[0]+head_bb[2],head_bb[1]+head_bb[3]]]) + tac = time.time() + prTimer("Face detetction", tic, tac) + + if (face_bboxes.shape[0] > 0): + + x_min, y_min, x_max, y_max = map(int, face_bboxes[0,:4]) + head_image = rgb_array[y_min:y_max, x_min:x_max] + + if (head_image.shape[0] > 10) and (head_image.shape[1] > 10): + head_image = pre_process(head_image) + + # For debug + # plt.clf() + # plt.imshow(np.moveaxis(head_image.clone().cpu().numpy()[0,:,:,:], 0, 2)) + # plt.title("Custom data") + # plt.pause(0.01) + + # debug_dic["image"] = head_image + + pitch, yaw = self.gaze_estimation_model(head_image) + + # debug_dic["pitch"] = pitch + # debug_dic["yaw"] = yaw + + # with open('debuig_dic.pkl', 'wb') as fp: + # pickle.dump(debug_dic, fp) + + # Softmax beofre sum + pitch_predicted, yaw_predicted = F.softmax(pitch, dim=1), F.softmax(yaw, dim=1) + + # Mapping from binned (0 to 90) to angles (-180 to 180) or (0 to 28) to angles (-42, 42) + idx_tensor = torch.arange(90, device=self.args.device.lower(), dtype=torch.float32) + + pitch_predicted = torch.sum(pitch_predicted * idx_tensor, dim=1) * 4 - 180 + yaw_predicted = torch.sum(yaw_predicted * idx_tensor, dim=1) * 4 - 180 + + pitch_predicted = pitch_predicted.cpu().numpy() + yaw_predicted = yaw_predicted.cpu().numpy() + + # Degrees to Radians + pitch_predicted_rad = np.radians(pitch_predicted) + yaw_predicted_rad = np.radians(yaw_predicted) + + self.tracks_in_current_image[idx]["gaze_pitch_rad"] = pitch_predicted_rad + self.tracks_in_current_image[idx]["gaze_yaw_rad"] = yaw_predicted_rad + self.tracks[idx]["gaze_pitch_rad"].append(pitch_predicted_rad) + self.tracks[idx]["gaze_yaw_rad"].append(yaw_predicted_rad) + + self.plot_gaze_from_pitch_yaw(pitch_predicted_rad[0], yaw_predicted_rad[0], head_bb, idx, keypoints) + self.plot_gaze_angle_info(pitch_predicted[0], yaw_predicted[0], head_bb, idx) + + # get face depth + nose = keypoints[0,:2].astype(np.uint32) + leye = keypoints[1,:2].astype(np.uint32) + reye = keypoints[2,:2].astype(np.uint32) + + depth_nose = depth_array[np.clip(nose[1], 0, depth_array.shape[0] - 1), np.clip(nose[0], 0, depth_array.shape[1] - 1)] + depth_leye = depth_array[np.clip(leye[1], 0, depth_array.shape[0] - 1), np.clip(leye[0], 0, depth_array.shape[1] - 1)] + depth_reye = depth_array[np.clip(reye[1], 0, depth_array.shape[0] - 1), np.clip(reye[0], 0, depth_array.shape[1] - 1)] + + depth_face = np.median([depth_nose, depth_leye, depth_reye]) + + self.tracks_in_current_image[idx]["depth_face"] = depth_face + self.tracks[idx]["depth_face"].append(depth_face) + + self.plot_overlay_face_attention(self.tracks[idx], face_bboxes[0,:4]) + + + + if self.args.use_six_d_rep: + with torch.no_grad(): + + # debug_dic = {"image_full": rgb_array} + + item = {"keypoints" : keypoints[:,:2]} + head_trans = head_transform(item) + head_bb = head_trans["bb"] + head_bb = np.array([head_bb['u'], head_bb['v'], head_bb['w'], head_bb['h']]).astype(np.int32) + + tic = time.time() + if self.args.use_face_detector: + face_bboxes = self.sixdrep_detector.detect(rgb_array, (640,640)) # or convert to bgr ?? ## only use the body detection so that we match easily... 
+ face_bboxes = face_bboxes.astype('int32') + prWarning("Using face_detector does not provide any matching to the current idx of the frame, only using first detection !") + else: + face_bboxes = np.array([[head_bb[0],head_bb[1],head_bb[0]+head_bb[2],head_bb[1]+head_bb[3]]]) + tac = time.time() + prTimer("Face detetction", tic, tac) + + facing_camera = ((keypoints[3,0] - keypoints[4,0]) > 20) + + if (face_bboxes.shape[0] > 0) and facing_camera: + x_min = face_bboxes[0,0] + y_min = face_bboxes[0,1] + x_max = face_bboxes[0,2] + y_max = face_bboxes[0,3] + box_w = abs(x_max - x_min) + box_h = abs(y_max - y_min) + + x_min = max(0, x_min - int(0.2 * box_h)) + y_min = max(0, y_min - int(0.2 * box_w)) + x_max = x_max + int(0.2 * box_h) + y_max = y_max + int(0.2 * box_w) + + head_image = rgb_array[y_min:y_max, x_min:x_max, :] + + if (head_image.shape[0] > 10) and (head_image.shape[1] > 10): + + + head_image = PILImage.fromarray(head_image) + head_image = head_image.convert('RGB') + head_image = sixdreptransform(head_image) + head_image = head_image.unsqueeze(0) + + head_image = head_image.cuda() + head_image = head_image.half() + + tic = time.time() + output = self.sixdrep_model(head_image) + tac = time.time() + prTimer("SixDRep", tic, tac) + + output = compute_euler(output) * 180 / np.pi + + p_output = output[:, 0].cpu() + y_output = output[:, 1].cpu() + r_output = output[:, 2].cpu() + + self.tracks_in_current_image[idx]["gaze_pitch_rad"] = np.deg2rad(p_output.item()) + self.tracks_in_current_image[idx]["gaze_yaw_rad"] = np.deg2rad(y_output.item()) + self.tracks[idx]["gaze_pitch_rad"].append(np.deg2rad(p_output.item())) + self.tracks[idx]["gaze_yaw_rad"].append(np.deg2rad(y_output.item())) + + self.plot_gaze_from_pitch_yaw(np.deg2rad(y_output.item()), np.deg2rad(p_output.item()), head_bb, idx, keypoints) # invert pitch compared to resnet + # self.plot_gaze_angle_info(y_output.item(), p_output.item(), head_bb, idx) # invert pitch compared to resnet + + # get face depth + nose = keypoints[0,:2].astype(np.uint32) + leye = keypoints[1,:2].astype(np.uint32) + reye = keypoints[2,:2].astype(np.uint32) + + depth_nose = depth_array[np.clip(nose[1], 0, depth_array.shape[0] - 1), np.clip(nose[0], 0, depth_array.shape[1] - 1)] + depth_leye = depth_array[np.clip(leye[1], 0, depth_array.shape[0] - 1), np.clip(leye[0], 0, depth_array.shape[1] - 1)] + depth_reye = depth_array[np.clip(reye[1], 0, depth_array.shape[0] - 1), np.clip(reye[0], 0, depth_array.shape[1] - 1)] + + depth_face = np.median([depth_nose, depth_leye, depth_reye]) + + self.tracks_in_current_image[idx]["depth_face"] = depth_face + self.tracks[idx]["depth_face"].append(depth_face) + + self.plot_overlay_face_attention_6d(self.tracks[idx], face_bboxes[0,:4], keypoints) + else: + self.tracks[idx]["gaze_pitch_rad"].append(np.deg2rad(180)) + self.tracks[idx]["gaze_yaw_rad"].append(np.deg2rad(180)) + + nose = keypoints[0,:2].astype(np.uint32) + leye = keypoints[1,:2].astype(np.uint32) + reye = keypoints[2,:2].astype(np.uint32) + + depth_nose = depth_array[np.clip(nose[1], 0, depth_array.shape[0] - 1), np.clip(nose[0], 0, depth_array.shape[1] - 1)] + depth_leye = depth_array[np.clip(leye[1], 0, depth_array.shape[0] - 1), np.clip(leye[0], 0, depth_array.shape[1] - 1)] + depth_reye = depth_array[np.clip(reye[1], 0, depth_array.shape[0] - 1), np.clip(reye[0], 0, depth_array.shape[1] - 1)] + depth_face = np.median([depth_nose, depth_leye, depth_reye]) + + self.tracks[idx]["depth_face"].append(depth_face) + + else: + 
self.tracks[idx]["gaze_pitch_rad"].append(np.deg2rad(180)) + self.tracks[idx]["gaze_yaw_rad"].append(np.deg2rad(180)) + + nose = keypoints[0,:2].astype(np.uint32) + leye = keypoints[1,:2].astype(np.uint32) + reye = keypoints[2,:2].astype(np.uint32) + + depth_nose = depth_array[np.clip(nose[1], 0, depth_array.shape[0] - 1), np.clip(nose[0], 0, depth_array.shape[1] - 1)] + depth_leye = depth_array[np.clip(leye[1], 0, depth_array.shape[0] - 1), np.clip(leye[0], 0, depth_array.shape[1] - 1)] + depth_reye = depth_array[np.clip(reye[1], 0, depth_array.shape[0] - 1), np.clip(reye[0], 0, depth_array.shape[1] - 1)] + depth_face = np.median([depth_nose, depth_leye, depth_reye]) + + self.tracks[idx]["depth_face"].append(depth_face) + + # Draw bb + bbox[4] *= 100 + bbox = bbox.astype(np.int32) + + if not self.args.no_show: + self.plot_xyxy_person_bbox(idx, bbox, depth_array.shape, self.tracks[idx]) + + # return the list of body center joints and also fill self.tracks_in_current_image[idx] + body_center_joints = self.process_keypoints(keypoints, depth_array, idx) + + # find the body center + if len(body_center_joints) == 4: + # if we managed to find the 4 points of the torso, search on the torso + body_center_joints = np.array( + body_center_joints + ) # lsho, rsho, lhip, rhip + lsho = body_center_joints[0, :] + rsho = body_center_joints[1, :] + lhip = body_center_joints[2, :] + rhip = body_center_joints[3, :] + + depths_torso, poses_torso = self.get_depth_and_poses_of_torso(depth_array, lsho, rsho, lhip, rhip, idx) + + # redraw bb with more info + if not self.args.no_show: + self.plot_xyxy_person_bbox(idx, bbox, depth_array.shape, self.tracks[idx], poses_torso) + + if len(depths_torso) > 3: + # at least 4 points to average decently + depth_body = np.array(depths_torso).mean() + pose_body = np.array(poses_torso).mean(axis=0) + self.tracks_in_current_image[idx][ + "depth_center" + ] = depth_body # mm + self.tracks_in_current_image[idx][ + "pose_center" + ] = pose_body # m + self.tracks_in_current_image[idx]["pose_from"] = "torso" + + # just for drawing + body_center = np.mean(body_center_joints, axis=0) + # Draw center of body + body_center = (int(body_center[0]), int(body_center[1])) + + if not self.light_display and not self.args.no_show: + self.plot_body_pose_data(body_center, depth_body, pose_body, idx) + + else: + # if we did not managed to find the 4 points of the torso, search in the bbox + prWarning( + "Can't use body center from shoulders and hips for track {} : do nothing".format( + idx + ) + ) + + # draw skeleton + if not self.args.no_show and not self.args.light_display: + self.plot_skeleton_2d(keypoints, idx) + + min_depth = 1e6 # mm + min_depth_idx = -1 + for idx, track_info in self.tracks_in_current_image.items(): + depth = track_info["depth_center"] + if depth is not None: + if depth < min_depth: + min_depth = depth + min_depth_idx = idx + + if min_depth_idx != -1: + pose_closest = self.tracks_in_current_image[min_depth_idx][ + "pose_center" + ] + yaw_closest_gaze = self.tracks_in_current_image[min_depth_idx]["gaze_yaw_rad"] + if yaw_closest_gaze is None: + yaw_closest = np.deg2rad(-180.0) + else: + yaw_closest = yaw_closest_gaze + prInfo( + "Using track {} as it is the closest".format(min_depth_idx) + ) + tf_msg = TransformStamped() + tf_msg.child_frame_id = args.namespace + "/human" + tf_msg.header.seq = current_frame_processing + tf_msg.header.stamp = current_timestamp + tf_msg.header.frame_id = current_frame_id + # adapt to robot camera frame convention on the robot + 
tf_msg.transform.translation.x = pose_closest[2] + tf_msg.transform.translation.y = -pose_closest[0] + tf_msg.transform.translation.z = -pose_closest[1] + + angle = np.arctan( + tf_msg.transform.translation.y + / tf_msg.transform.translation.x + ) + + # Rotate to have 'human' x axis looking towards the robot + rot = Rotation() + rot.DoRotZ(angle) + rot.DoRotY(np.pi) + qx, qy, qz, qw = rot.GetQuaternion() + + tf_msg.transform.rotation.x = qx + tf_msg.transform.rotation.y = qy + tf_msg.transform.rotation.z = qz + tf_msg.transform.rotation.w = qw + + + dist = np.sqrt( + tf_msg.transform.translation.x**2 + tf_msg.transform.translation.y**2 + tf_msg.transform.translation.z**2 + ) + + if dist < self.args.max_distance: # meters + self.goal_pub.publish(tf_msg) + prSuccess( + "Publishing coordinates {:.2f} {:.2f} {:.2f}".format( + pose_closest[0], pose_closest[1], pose_closest[2] + ) + ) + + self.tf_br.sendTransform(tf_msg) + + prSuccess( + "Publishing coordinates {:.2f} {:.2f} {:.2f} and yaw {:.2f}".format( + pose_closest[0], pose_closest[1], pose_closest[2], np.rad2deg(yaw_closest) + ) + ) + + self.tf_br.sendTransform(tf_msg) + + if not self.args.no_show: + # self.plot_det_text_info(pose_closest) + pass + + else: + + if not self.args.no_show: + # self.plot_det_text_info(None) + pass + + self.last_inferred_seq = current_frame_processing + + if self.args.save and not self.args.no_show: + self.save_output_image(image_count, image_seq_unique, timestamp) + + else: + prWarning( + "No inference because the current RGB frame has already been processed last_inferred_seq {} vs rgb_current_seq {}".format( + self.last_inferred_seq, self.rgb_current_seq + ) + ) + + if not self.args.no_show: + depth_array_disp = depth_array.copy() + depth_array_disp[depth_array_disp > 3000] = 3000 + depth_array_norm = ((depth_array_disp - depth_array_disp.min())) / ( + depth_array_disp.max() - depth_array_disp.min() + ) + depth_array_norm = depth_array_norm * 255 + depth_array_norm = depth_array_norm.astype(np.uint8) + depth_array_norm_colored = ( + self.depth_cmap[depth_array_norm] * 255 + ).astype(np.uint8) + + if self.args.save or self.args.light_save: + self.save_pcl(image_count, image_seq_unique, timestamp) + + if self.vis_img is not None: + full_display_array = np.zeros( + (rgb_array.shape[0] * 2, rgb_array.shape[1], 3), dtype=np.uint8 + ) + full_display_array[: rgb_array.shape[0], :, :] = self.vis_img + full_display_array[rgb_array.shape[0] :, :, :] = ( + depth_array_norm_colored + ) + + if not self.args.no_show: + cv2.imshow("RGBD window", full_display_array) + cv2.waitKey(3) + + else: + print("Images are None !") + + self.loop_rate.sleep() + + +if __name__ == "__main__": + + ## Parser with params + parser = ArgumentParser() + parser.add_argument( + "--det_config", + type=str, + default="./configs/detection/yolov3_d53_320_273e_coco.py", + help="Config file for detection | default = %(default)s", + ) + parser.add_argument( + "--det_checkpoint", + type=str, + default="./models/yolov3_d53_320_273e_coco-421362b6.pth", + help="Checkpoint file for detection | default = %(default)s", + ) + parser.add_argument( + "--pose_config", + type=str, + default="./configs/pose/ViTPose_small_coco_256x192.py", + help="Config file for pose | default = %(default)s", + ) + parser.add_argument( + "--pose_checkpoint", + type=str, + default="./models/vitpose_small.pth", + help="Checkpoint file for pose | default = %(default)s", + ) + parser.add_argument( + "--device", + default="cuda:0", + help="Device used for inference | default = 
%(default)s", + ) + parser.add_argument( + "--det_cat_id", + type=int, + default=1, + help="Category id for bounding box detection model (person) | default = %(default)s", + ) + parser.add_argument( + "--bbox_thr", + type=float, + default=0.3, + help="Bounding box score threshold | default = %(default)s", + ) + parser.add_argument( + "--kpt_thr", + type=float, + default=0.3, + help="Keypoint score threshold | default = %(default)s", + ) + parser.add_argument( + "--tracking_thr", + type=float, + default=0.3, + help="Tracking threshold | default = %(default)s", + ) + parser.add_argument( + "--euro", action="store_true", help="Using One_Euro_Filter for smoothing" + ) + # parser.add_argument('--rgb_topic', default = "orbbec/rgb", type=str, help='ROS topic for RGB image') + # parser.add_argument('--depth_topic', default = "orbbec/depth", type=str, help='ROS topic for depth image') + # parser.add_argument('--pcl_topic', default = "orbbec/pcl", type=str, help='ROS topic for pcl') + parser.add_argument( + "--namespace", + default="orbbec", + type=str, + help="ROS topic namespace for rgb, depth, pcl | default = %(default)s", + ) + parser.add_argument( + "--no_show", + action="store_true", + default=False, + help="whether to show visualizations | default = %(default)s", + ) + parser.add_argument( + "--save", + action="store_true", + default=False, + help="whether to save images (rgb and d and predictions and pcl) | default = %(default)s", + ) + parser.add_argument( + "--flip", + action="store_true", + default=False, + help="whether to flip images | default = %(default)s", + ) + parser.add_argument( + "--light_save", + action="store_true", + default=False, + help="whether to save only rgb and pcl (not optimized use the light_save of visualizer for optimized saving) | default = %(default)s", + ) + parser.add_argument( + "--display_all_detection", + "-dad", + action="store_true", + default=False, + help="whether to display all detections or only human | default = %(default)s", + ) + parser.add_argument( + "--light_display", + "-ld", + action="store_true", + default=False, + help="whether to display only skeletons | default = %(default)s", + ) + parser.add_argument("--fps", type=int, default=10, help="Node and recording fps") + parser.add_argument( + "--depth_cmap", + default="jet", + type=str, + help="mpl colormap for depth image | default = %(default)s", + ) + + parser.add_argument('--mb_3d_config', type=str, default = "./configs/pose3d/MB_ft_h36m.yaml", help='Config file for 3D poses | default = %(default)s') + parser.add_argument('--mb_checkpoint', type=str, default = "./checkpoint/pose3d/MB_train_h36m/best_epoch.bin", help='Checkpoint file for 3D poses | default = %(default)s') + parser.add_argument( + '--mb_clip_len', + type=int, + default=10, + help='Number of past frames to use for MotionBERT (default in model is 243) | default = %(default)s') + parser.add_argument( + '--max_frames_remove_tracks', + type=int, + default=2, + help='Number frames without the track present to keep going before removing a track | default = %(default)s') + parser.add_argument( + "--use_mb", + "-mb", + action="store_true", + default=False, + help="whether to use MotionBERT 3D Lifter | default = %(default)s", + ) + + parser.add_argument('--gafa_checkpoint', type=str, default = "./checkpoint/gafa/GazeNet_PyTorch.pt", help='Checkpoint file for 3D gaze estimation GAFA | default = %(default)s') + parser.add_argument( + '--gafa_n_frames', + type=int, + default=7, + help='Number of past frames to use for GAFA (default in 
model is 7) | default = %(default)s') + parser.add_argument( + "--use_gafa", + "-gafa", + action="store_true", + default=False, + help="whether to use GAFA 3D Gaze Estimation | default = %(default)s", + ) + parser.add_argument( + "--gafa_no_history", + "-gnh", + action="store_true", + default=False, + help="whether to use history in the GAFA sequence or fake it by copying last image | default = %(default)s", + ) + + parser.add_argument( + "--use_gaze_resnet", + "-resnet", + action="store_true", + default=False, + help="whether to use Gaze ResNet18 3D Gaze Estimation | default = %(default)s", + ) + parser.add_argument( + "--use_face_detector", + "-ufd", + action="store_true", + default=False, + help="whether to use Face Detector before gaze ResNet18 3D Gaze Estimation, or juste use bbox from keypoints | default = %(default)s", + ) + parser.add_argument( + "--use_six_d_rep", + "-sixdrep", + action="store_true", + default=False, + help="whether to use 6D rep head pose estimation instead of gaze estimation | default = %(default)s", + ) + parser.add_argument( + "--bb_min_threshold", + "-bbmt", + type=int, + default=0, + help="Minimum height of bb in pixels | default = %(default)s", + ) + parser.add_argument( + "--max_distance", + type=float, + default=2.5, + help="Maximum distance allowed for publishing human pose | default = %(default)s", + ) + + args = parser.parse_args() + + assert has_mmdet, "Please install mmdet to run the demo." + assert args.det_config is not None + assert args.det_checkpoint is not None + + if args.use_mb: + assert(has_mb), "Option --use_mb requires MotionBERT install" + + if args.use_gafa: + assert(args.use_gaze_resnet == False), "Option --use_gafa and --use_gaze_resnet are not compatible" + assert(args.use_six_d_rep == False), "Option --use_gafa and --use_six_d_rep are not compatible" + + if args.use_gaze_resnet: + assert(args.use_gafa == False), "Option --use_gaze_resnet and --use_gafa are not compatible" + assert(args.use_six_d_rep == False), "Option --use_gaze_resnet and --use_six_d_rep are not compatible" + + if args.use_six_d_rep: + assert(args.use_gaze_resnet == False), "Option --use_six_d_rep and --use_gaze_resnet are not compatible" + assert(args.use_gafa == False), "Option --use_six_d_rep and --use_gafa are not compatible" + + if args.use_gafa: + assert(has_gafa), "Option --use_gafa requires GAFA install" + + if args.use_six_d_rep: + assert(has_sixdrep), "Option --use_six_d_rep requires 6D Rep install" + + if args.use_gaze_resnet: + assert(has_gaze_est), "Option --use_gaze_resnet requires Gaze Estimation" + + prInfo("Loaded with args : {}".format(args)) + + rospy.init_node("python_orbbec_inference", anonymous=True) + my_node = InferenceNodeRGBD(args) + my_node.start() + cv2.destroyAllWindows() diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..13f12b8 --- /dev/null +++ b/run.sh @@ -0,0 +1 @@ +python rgbd_detect_3d_dir.py --flip -bbmt 200 --namespace orbbec --max_distance 2.5 -sixdrep \ No newline at end of file diff --git a/sixdrep/util.py b/sixdrep/util.py new file mode 100644 index 0000000..5f078be --- /dev/null +++ b/sixdrep/util.py @@ -0,0 +1,442 @@ +import math +import os +import random + +import cv2 +import numpy +import torch +from PIL import Image +from PIL import ImageEnhance + + +def setup_seed(): + """ + Setup random seed. 
+ """ + random.seed(0) + numpy.random.seed(0) + torch.manual_seed(0) + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + + +def setup_multi_processes(): + """ + Setup multi-processing environment variables. + """ + import cv2 + from os import environ + from platform import system + + # set multiprocess start method as `fork` to speed up the training + if system() != 'Windows': + torch.multiprocessing.set_start_method('fork', force=True) + + # disable opencv multithreading to avoid system being overloaded + cv2.setNumThreads(0) + + # setup OMP threads + if 'OMP_NUM_THREADS' not in environ: + environ['OMP_NUM_THREADS'] = '1' + + # setup MKL threads + if 'MKL_NUM_THREADS' not in environ: + environ['MKL_NUM_THREADS'] = '1' + + +def plot_lr(args, optimizer, scheduler): + import copy + from matplotlib import pyplot + + optimizer = copy.copy(optimizer) + scheduler = copy.copy(scheduler) + + y = [] + for epoch in range(args.epochs): + y.append(optimizer.param_groups[-1]['lr']) + scheduler.step(epoch + 1, optimizer) + + pyplot.plot(y, '.-', label='LR') + pyplot.xlabel('epoch') + pyplot.ylabel('LR') + pyplot.grid() + pyplot.xlim(0, args.epochs) + pyplot.ylim(0) + pyplot.savefig('./weights/lr.png', dpi=200) + pyplot.close() + + +def strip_optimizer(filename): + x = torch.load(filename, map_location=torch.device('cpu')) + x['model'].half() # to FP16 + for p in x['model'].parameters(): + p.requires_grad = False + torch.save(x, filename) + + +def resample(): + return random.choice((Image.BILINEAR, Image.BICUBIC)) + + +def load_weights(model, ckpt): + dst = model.state_dict() + src = torch.load(ckpt)['model'] + src = src.cpu().float().state_dict() + + ckpt = {} + for k, v in src.items(): + if k in dst and v.shape == dst[k].shape: + ckpt[k] = v + model.load_state_dict(state_dict=ckpt, strict=False) + return model + + +def compute_euler(matrices): + shape = matrices.shape + sy = matrices[:, 0, 0] * matrices[:, 0, 0] + matrices[:, 1, 0] * matrices[:, 1, 0] + sy = torch.sqrt(sy) + singular = (sy < 1E-6).float() + + x = torch.atan2(matrices[:, 2, 1], matrices[:, 2, 2]) + y = torch.atan2(-matrices[:, 2, 0], sy) + z = torch.atan2(matrices[:, 1, 0], matrices[:, 0, 0]) + + xs = torch.atan2(-matrices[:, 1, 2], matrices[:, 1, 1]) + ys = torch.atan2(-matrices[:, 2, 0], sy) + zs = torch.zeros_like(z) + + device = matrices.device + out_euler = torch.zeros(shape[0], 3, device=device) + out_euler[:, 0] = x * (1 - singular) + xs * singular + out_euler[:, 1] = y * (1 - singular) + ys * singular + out_euler[:, 2] = z * (1 - singular) + zs * singular + return out_euler + + +def params(model, lr): + return [{'params': model.p1.parameters(), 'lr': lr}, + {'params': model.p2.parameters(), 'lr': lr}, + {'params': model.p3.parameters(), 'lr': lr}, + {'params': model.p4.parameters(), 'lr': lr}, + {'params': model.p5.parameters(), 'lr': lr}, + {'params': model.fc.parameters(), 'lr': lr * 10}] + + +class Resize: + def __init__(self, size: int): + self.size = size + + def __call__(self, image): + size = self.size + i, j, h, w = self.params(image.size) + image = image.crop((j, i, j + w, i + h)) + return image.resize([size, size], resample()) + + @staticmethod + def params(size): + scale = (0.8, 1.0) + ratio = (3. / 4., 4. / 3.) 
+ for _ in range(10): + target_area = random.uniform(*scale) * size[0] * size[1] + aspect_ratio = math.exp(random.uniform(*(math.log(ratio[0]), math.log(ratio[1])))) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if w <= size[0] and h <= size[1]: + i = random.randint(0, size[1] - h) + j = random.randint(0, size[0] - w) + return i, j, h, w + + if (size[0] / size[1]) < min(ratio): + w = size[0] + h = int(round(w / min(ratio))) + elif (size[0] / size[1]) > max(ratio): + h = size[1] + w = int(round(h * max(ratio))) + else: + w = size[0] + h = size[1] + i = (size[1] - h) // 2 + j = (size[0] - w) // 2 + return i, j, h, w + + +class ColorJitter: + def __init__(self, + p: float = 0.5, + brightness: float = 0.1, + saturation: float = 0.1, + contrast: float = 0.1): + self.brightness = (1 - brightness, 1 + brightness) + self.saturation = (1 - saturation, 1 + saturation) + self.contrast = (1 - contrast, 1 + contrast) + self.indices = [0, 1, 2] + self.p = p + + def __call__(self, image): + if random.random() > self.p: + return image + + b = random.uniform(self.brightness[0], self.brightness[1]) + s = random.uniform(self.saturation[0], self.saturation[1]) + c = random.uniform(self.contrast[0], self.contrast[1]) + + random.shuffle(self.indices) + + for i in self.indices: + if i == 0: + image = ImageEnhance.Brightness(image).enhance(b) # brightness + elif i == 1: + image = ImageEnhance.Contrast(image).enhance(c) # contrast + elif i == 2: + image = ImageEnhance.Color(image).enhance(s) # saturation + + return image + + +class AverageMeter: + def __init__(self): + self.num = 0 + self.sum = 0 + self.avg = 0 + + def update(self, v, n): + self.num = self.num + n + self.sum = self.sum + v * n + self.avg = self.sum / self.num + + +def plot_pose_cube(image, yaw, pitch, roll, tdx=None, tdy=None, size=150.): + p = pitch * numpy.pi / 180 + y = -(yaw * numpy.pi / 180) + r = roll * numpy.pi / 180 + if (tdx is not None) and (tdy is not None): + face_x = tdx - 0.50 * size + face_y = tdy - 0.50 * size + else: + height, width = image.shape[:2] + face_x = width / 2 - 0.5 * size + face_y = height / 2 - 0.5 * size + + x1 = size * (math.cos(y) * math.cos(r)) + face_x + y1 = size * (math.cos(p) * math.sin(r) + math.cos(r) * math.sin(p) * math.sin(y)) + face_y + x2 = size * (-math.cos(y) * math.sin(r)) + face_x + y2 = size * (math.cos(p) * math.cos(r) - math.sin(p) * math.sin(y) * math.sin(r)) + face_y + x3 = size * (math.sin(y)) + face_x + y3 = size * (-math.cos(y) * math.sin(p)) + face_y + + # Draw base in red + cv2.line(image, (int(face_x), int(face_y)), (int(x1), int(y1)), (0, 0, 255), 3) + cv2.line(image, (int(face_x), int(face_y)), (int(x2), int(y2)), (0, 0, 255), 3) + cv2.line(image, (int(x2), int(y2)), (int(x2 + x1 - face_x), int(y2 + y1 - face_y)), (0, 0, 255), 3) + cv2.line(image, (int(x1), int(y1)), (int(x1 + x2 - face_x), int(y1 + y2 - face_y)), (0, 0, 255), 3) + # Draw pillars in blue + cv2.line(image, (int(face_x), int(face_y)), (int(x3), int(y3)), (255, 0, 0), 2) + cv2.line(image, (int(x1), int(y1)), (int(x1 + x3 - face_x), int(y1 + y3 - face_y)), (255, 0, 0), 2) + cv2.line(image, (int(x2), int(y2)), (int(x2 + x3 - face_x), int(y2 + y3 - face_y)), (255, 0, 0), 2) + cv2.line(image, (int(x2 + x1 - face_x), int(y2 + y1 - face_y)), + (int(x3 + x1 + x2 - 2 * face_x), int(y3 + y2 + y1 - 2 * face_y)), (255, 0, 0), 2) + # Draw top in green + cv2.line(image, (int(x3 + x1 - face_x), int(y3 + y1 - face_y)), + (int(x3 + x1 + x2 - 2 * face_x), int(y3 + y2 + 
y1 - 2 * face_y)), (0, 255, 0), 2) + cv2.line(image, (int(x2 + x3 - face_x), int(y2 + y3 - face_y)), + (int(x3 + x1 + x2 - 2 * face_x), int(y3 + y2 + y1 - 2 * face_y)), (0, 255, 0), 2) + cv2.line(image, (int(x3), int(y3)), (int(x3 + x1 - face_x), int(y3 + y1 - face_y)), (0, 255, 0), 2) + cv2.line(image, (int(x3), int(y3)), (int(x3 + x2 - face_x), int(y3 + y2 - face_y)), (0, 255, 0), 2) + + return image + + +def distance2box(points, distance, max_shape=None): + x1 = points[:, 0] - distance[:, 0] + y1 = points[:, 1] - distance[:, 1] + x2 = points[:, 0] + distance[:, 2] + y2 = points[:, 1] + distance[:, 3] + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1]) + y1 = y1.clamp(min=0, max=max_shape[0]) + x2 = x2.clamp(min=0, max=max_shape[1]) + y2 = y2.clamp(min=0, max=max_shape[0]) + return numpy.stack([x1, y1, x2, y2], axis=-1) + + +def distance2kps(points, distance, max_shape=None): + outputs = [] + for i in range(0, distance.shape[1], 2): + p_x = points[:, i % 2] + distance[:, i] + p_y = points[:, i % 2 + 1] + distance[:, i + 1] + if max_shape is not None: + p_x = p_x.clamp(min=0, max=max_shape[1]) + p_y = p_y.clamp(min=0, max=max_shape[0]) + outputs.append(p_x) + outputs.append(p_y) + return numpy.stack(outputs, axis=-1) + + +class FaceDetector: + def __init__(self, onnx_path=None, session=None): + from onnxruntime import InferenceSession + self.session = session + + self.batched = False + if self.session is None: + assert onnx_path is not None + assert os.path.exists(onnx_path) + self.session = InferenceSession(onnx_path, + providers=['CUDAExecutionProvider']) + self.nms_thresh = 0.4 + self.center_cache = {} + input_cfg = self.session.get_inputs()[0] + input_shape = input_cfg.shape + if isinstance(input_shape[2], str): + self.input_size = None + else: + self.input_size = tuple(input_shape[2:4][::-1]) + input_name = input_cfg.name + outputs = self.session.get_outputs() + if len(outputs[0].shape) == 3: + self.batched = True + output_names = [] + for output in outputs: + output_names.append(output.name) + self.input_name = input_name + self.output_names = output_names + self.use_kps = False + self._num_anchors = 1 + if len(outputs) == 6: + self.fmc = 3 + self._feat_stride_fpn = [8, 16, 32] + self._num_anchors = 2 + elif len(outputs) == 9: + self.fmc = 3 + self._feat_stride_fpn = [8, 16, 32] + self._num_anchors = 2 + self.use_kps = True + elif len(outputs) == 10: + self.fmc = 5 + self._feat_stride_fpn = [8, 16, 32, 64, 128] + self._num_anchors = 1 + elif len(outputs) == 15: + self.fmc = 5 + self._feat_stride_fpn = [8, 16, 32, 64, 128] + self._num_anchors = 1 + self.use_kps = True + + def forward(self, x, score_thresh): + scores_list = [] + bboxes_list = [] + points_list = [] + input_size = tuple(x.shape[0:2][::-1]) + blob = cv2.dnn.blobFromImage(x, + 1.0 / 128, + input_size, + (127.5, 127.5, 127.5), swapRB=True) + outputs = self.session.run(self.output_names, {self.input_name: blob}) + input_height = blob.shape[2] + input_width = blob.shape[3] + fmc = self.fmc + for idx, stride in enumerate(self._feat_stride_fpn): + if self.batched: + scores = outputs[idx][0] + boxes = outputs[idx + fmc][0] + boxes = boxes * stride + else: + scores = outputs[idx] + boxes = outputs[idx + fmc] + boxes = boxes * stride + + height = input_height // stride + width = input_width // stride + key = (height, width, stride) + if key in self.center_cache: + anchor_centers = self.center_cache[key] + else: + anchor_centers = numpy.stack(numpy.mgrid[:height, :width][::-1], axis=-1) + anchor_centers = 
anchor_centers.astype(numpy.float32) + + anchor_centers = (anchor_centers * stride).reshape((-1, 2)) + if self._num_anchors > 1: + anchor_centers = numpy.stack([anchor_centers] * self._num_anchors, axis=1) + anchor_centers = anchor_centers.reshape((-1, 2)) + if len(self.center_cache) < 100: + self.center_cache[key] = anchor_centers + + pos_indices = numpy.where(scores >= score_thresh)[0] + bboxes = distance2box(anchor_centers, boxes) + pos_scores = scores[pos_indices] + pos_bboxes = bboxes[pos_indices] + scores_list.append(pos_scores) + bboxes_list.append(pos_bboxes) + return scores_list, bboxes_list + + def detect(self, image, input_size=None, score_threshold=0.5, max_num=0, metric='default'): + assert input_size is not None or self.input_size is not None + input_size = self.input_size if input_size is None else input_size + image_ratio = float(image.shape[0]) / image.shape[1] + model_ratio = float(input_size[1]) / input_size[0] + if image_ratio > model_ratio: + new_height = input_size[1] + new_width = int(new_height / image_ratio) + else: + new_width = input_size[0] + new_height = int(new_width * image_ratio) + det_scale = float(new_height) / image.shape[0] + resized_img = cv2.resize(image, (new_width, new_height)) + det_img = numpy.zeros((input_size[1], input_size[0], 3), dtype=numpy.uint8) + det_img[:new_height, :new_width, :] = resized_img + + scores_list, bboxes_list = self.forward(det_img, score_threshold) + + scores = numpy.vstack(scores_list) + scores_ravel = scores.ravel() + order = scores_ravel.argsort()[::-1] + bboxes = numpy.vstack(bboxes_list) / det_scale + pre_det = numpy.hstack((bboxes, scores)).astype(numpy.float32, copy=False) + pre_det = pre_det[order, :] + keep = self.nms(pre_det) + det = pre_det[keep, :] + if 0 < max_num < det.shape[0]: + area = (det[:, 2] - det[:, 0]) * (det[:, 3] - det[:, 1]) + img_center = image.shape[0] // 2, image.shape[1] // 2 + offsets = numpy.vstack([(det[:, 0] + det[:, 2]) / 2 - img_center[1], + (det[:, 1] + det[:, 3]) / 2 - img_center[0]]) + offset_dist_squared = numpy.sum(numpy.power(offsets, 2.0), 0) + if metric == 'max': + values = area + else: + values = area - offset_dist_squared * 2.0 # some extra weight on the centering + index = numpy.argsort(values)[::-1] # some extra weight on the centering + index = index[0:max_num] + det = det[index, :] + return det + + def nms(self, outputs): + thresh = self.nms_thresh + x1 = outputs[:, 0] + y1 = outputs[:, 1] + x2 = outputs[:, 2] + y2 = outputs[:, 3] + scores = outputs[:, 4] + + order = scores.argsort()[::-1] + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = numpy.maximum(x1[i], x1[order[1:]]) + yy1 = numpy.maximum(y1[i], y1[order[1:]]) + xx2 = numpy.minimum(x2[i], x2[order[1:]]) + yy2 = numpy.minimum(y2[i], y2[order[1:]]) + + w = numpy.maximum(0.0, xx2 - xx1 + 1) + h = numpy.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + indices = numpy.where(ovr <= thresh)[0] + order = order[indices + 1] + + return keep diff --git a/sixdrep/utils.py b/sixdrep/utils.py new file mode 100644 index 0000000..b6366b6 --- /dev/null +++ b/sixdrep/utils.py @@ -0,0 +1,8 @@ +from torch.utils import data +from torchvision import transforms + +normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) +sixdreptransform = transforms.Compose([transforms.Resize(224 + 32), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize]) \ No newline at end of file diff 
--git a/utils.py b/utils.py new file mode 100644 index 0000000..67885e6 --- /dev/null +++ b/utils.py @@ -0,0 +1,501 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +import os +import numpy as np +import matplotlib.pyplot as plt +from print_color import print +import copy + +from functools import wraps +import time +import cv2 + +TEXT_SCALE = 1.0 + +def get_mpl_colormap(cmap_name): + cmap = plt.get_cmap(cmap_name) + + # Initialize the matplotlib color map + sm = plt.cm.ScalarMappable(cmap=cmap) + + # Obtain linear color range + color_range = sm.to_rgba(np.linspace(0, 1, 256), bytes=True)[:, 2::-1] + + return color_range.reshape(256, 3).astype(np.float32) / 255.0 + + +def prSuccess(text): + print(text, tag = "ok", tag_color = "green", color = "white") + +def prInfo(text): + print(text, tag = "info", tag_color = "cyan", color = "white") + +def prTimer(text, tic, tac): + print("{} {:.0f} ms".format(text, (tac-tic)*1000), tag = "timer", tag_color = "purple", color = "white") + +def prInfoBold(text): + print(text, tag = "info", tag_color = "cyan", color = "white", format = "bold") + +def prDebug(text): + print(text, tag = "debug", tag_color = "red", background = "white", color = "white") + +def prWarning(text): + print(text, tag = "warning", tag_color = "yellow", color = "white") + +def prError(text): + print(text, tag = "error", tag_color = "red", color = "white") + + +def draw_bbox_with_corners(image, bbox, color=(0, 255, 0), thickness=2, proportion=0.2): + x_min, y_min, x_max, y_max = map(int, bbox[:4]) + + width = x_max - x_min + height = y_max - y_min + + corner_length = int(proportion * min(width, height)) + + # Draw the rectangle + cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color, 1) + + # Top-left corner + cv2.line(image, (x_min, y_min), (x_min + corner_length, y_min), color, thickness) + cv2.line(image, (x_min, y_min), (x_min, y_min + corner_length), color, thickness) + + # Top-right corner + cv2.line(image, (x_max, y_min), (x_max - corner_length, y_min), color, thickness) + cv2.line(image, (x_max, y_min), (x_max, y_min + corner_length), color, thickness) + + # Bottom-left corner + cv2.line(image, (x_min, y_max), (x_min, y_max - corner_length), color, thickness) + cv2.line(image, (x_min, y_max), (x_min + corner_length, y_max), color, thickness) + + # Bottom-right corner + cv2.line(image, (x_max, y_max), (x_max, y_max - corner_length), color, thickness) + cv2.line(image, (x_max, y_max), (x_max - corner_length, y_max), color, thickness) + +YOLO_COCO_80_CLASSES = [ +"person", +"bicycle", +"car", +"motorbike", +"aeroplane", +"bus", +"train", +"truck", +"boat", +"traffic light", +"fire hydrant", +"stop sign", +"parking meter", +"bench", +"bird", +"cat", +"dog", +"horse", +"sheep", +"cow", +"elephant", +"bear", +"zebra", +"giraffe", +"backpack", +"umbrella", +"handbag", +"tie", +"suitcase", +"frisbee", +"skis", +"snowboard", +"sports ball", +"kite", +"baseball bat", +"baseball glove", +"skateboard", +"surfboard", +"tennis racket", +"bottle", +"wine glass", +"cup", +"fork", +"knife", +"spoon", +"bowl", +"banana", +"apple", +"sandwich", +"orange", +"broccoli", +"carrot", +"hot dog", +"pizza", +"donut", +"cake", +"chair", +"sofa", +"pottedplant", +"bed", +"diningtable", +"toilet", +"tvmonitor", +"laptop", +"mouse", +"remote", +"keyboard", +"cell phone", +"microwave", +"oven", +"toaster", +"sink", +"refrigerator", +"book", +"clock", +"vase", +"scissors", +"teddy bear", +"hair drier", +"toothbrush"] + +COCO17_JOINTS_LIMBS = [[0,1], [0,2], [1,2], [1,3], [4,2], [3,5], [4,6], 
[5,7],[6,8],[7,9],[8,10], [5,6],[11,12], [5,11],[6,12],[11,13],[12,14],[13,15],[14,16]] + +RANDOM_COLORS = np.array([ + [205, 150, 194], + [ 17, 155, 211], + [162, 121, 186], + [194, 242, 27], + [248, 79, 81], + [134, 159, 164], + [163, 7, 30], + [ 93, 9, 121], + [ 95, 54, 131], + [ 77, 23, 22], + [ 43, 17, 191], + [ 34, 198, 162], + [ 53, 5, 221], + [ 37, 74, 55], + [ 88, 204, 179], + [200, 84, 192], + [ 71, 75, 96], + [ 5, 250, 149], + [ 9, 216, 221], + [ 54, 115, 69], + [109, 92, 97], + [186, 191, 222], + [ 14, 41, 194], + [ 75, 246, 175], + [135, 112, 74], + [ 18, 185, 33], + [236, 129, 68], + [ 58, 226, 186], + [ 56, 63, 90], + [231, 40, 251], + [222, 112, 249], + [ 77, 37, 189], + [137, 94, 131], + [170, 233, 53], + [235, 29, 21], + [ 66, 96, 46], + [ 62, 29, 142], + [ 12, 193, 90], + [224, 151, 242], + [132, 221, 176], + [ 94, 75, 130], + [157, 220, 166], + [156, 47, 225], + [ 76, 176, 108], + [186, 189, 33], + [139, 223, 78], + [ 98, 169, 49], + [ 39, 154, 71], + [ 49, 191, 100], + [128, 170, 25], + [ 90, 127, 185], + [180, 213, 170], + [ 53, 153, 220], + [109, 211, 12], + [ 72, 125, 73], + [126, 220, 193], + [238, 38, 220], + [ 77, 76, 46], + [254, 186, 161], + [126, 226, 187], + [190, 142, 14], + [132, 146, 254], + [ 34, 39, 219], + [ 78, 114, 127], + [248, 145, 165], + [145, 64, 10], + [237, 84, 14], + [ 18, 245, 229], + [246, 40, 125], + [187, 210, 10], + [128, 197, 159], + [152, 179, 221], + [ 18, 159, 88], + [ 17, 205, 133], + [243, 111, 152], + [ 86, 60, 202], + [178, 71, 105], + [ 49, 141, 244], + [238, 169, 59], + [ 91, 190, 81], + [194, 113, 124], + [209, 214, 138], + [ 61, 251, 148], + [113, 75, 124], + [182, 147, 1], + [ 86, 119, 160], + [ 12, 253, 136], + [149, 38, 41], + [183, 161, 19], + [153, 4, 68], + [195, 147, 156], + [165, 30, 189], + [ 82, 55, 244], + [ 33, 25, 248], + [ 71, 193, 228], + [244, 37, 174], + [203, 6, 202], + [118, 209, 136], + [248, 144, 49], + [ 8, 145, 128], + [164, 24, 0], + [ 97, 196, 92], + [243, 146, 179], + [ 77, 144, 104], + [134, 63, 50], + [108, 155, 104], + [200, 124, 251], + [ 70, 35, 156], + [115, 57, 148], + [249, 236, 2], + [119, 245, 43], + [ 49, 101, 88], + [ 27, 188, 88], + [225, 20, 89], + [ 94, 249, 118], + [ 1, 150, 65], + [161, 77, 221], + [144, 227, 134], + [ 28, 231, 69], + [165, 141, 223], + [134, 124, 162], + [151, 18, 210], + [ 15, 39, 228], + [ 88, 192, 62], + [179, 36, 209], + [ 99, 11, 191], + [145, 76, 117], + [183, 212, 247], + [ 10, 52, 119], + [154, 218, 200], + [194, 227, 179], + [ 9, 73, 9], + [ 66, 19, 65], + [ 62, 201, 224], + [ 18, 100, 101], + [ 4, 29, 246], + [ 94, 47, 167], + [ 57, 85, 162], + [196, 245, 113], + [234, 87, 229], + [ 30, 199, 34], + [ 41, 216, 200], + [ 93, 155, 214], + [236, 132, 87], + [193, 191, 13], + [222, 140, 102], + [ 50, 194, 63], + [244, 103, 90], + [ 63, 234, 10], + [ 45, 138, 147], + [107, 11, 164], + [ 93, 196, 79], + [ 85, 20, 227], + [ 2, 74, 5], + [155, 243, 68], + [133, 102, 92], + [ 85, 27, 104], + [ 73, 69, 71], + [176, 159, 175], + [124, 113, 197], + [102, 221, 40], + [167, 164, 166], + [214, 8, 43], + [183, 139, 224], + [130, 21, 83], + [172, 11, 186], + [199, 183, 201], + [180, 166, 98], + [ 28, 22, 177], + [ 4, 227, 64], + [131, 2, 95], + [ 2, 164, 73], + [ 89, 247, 7], + [235, 93, 169], + [ 51, 230, 61], + [144, 144, 234], + [157, 22, 89], + [ 0, 48, 113], + [207, 63, 161], + [200, 3, 166], + [ 25, 92, 209], + [243, 201, 247], + [117, 78, 126], + [229, 99, 105], + [ 52, 184, 198], + [ 29, 127, 174], + [251, 113, 46], + [220, 148, 28], + [ 18, 228, 18], + [216, 178, 
17], + [ 78, 54, 148], + [223, 253, 150], + [105, 69, 50], + [229, 162, 35], + [140, 47, 200], + [103, 195, 216], + [169, 23, 47], + [ 73, 208, 20], + [ 53, 184, 113], + [225, 211, 40], + [135, 163, 142], + [243, 236, 67], + [ 14, 20, 61], + [ 11, 27, 107], + [ 24, 145, 99], + [155, 150, 243], + [254, 153, 114], + [ 91, 182, 222], + [ 71, 216, 39], + [ 9, 55, 216], + [144, 1, 144], + [163, 166, 208], + [149, 53, 64], + [230, 45, 52], + [171, 157, 2], + [191, 43, 172], + [180, 84, 131], + [ 8, 40, 88], + [155, 63, 149], + [196, 150, 149], + [123, 219, 46], + [ 9, 63, 186], + [ 19, 54, 155], + [ 25, 43, 88], + [140, 174, 131], + [ 23, 158, 90], + [152, 141, 207], + [ 28, 160, 67], + [ 17, 54, 220], + [ 12, 186, 7], + [129, 17, 94], + [221, 84, 128], + [142, 172, 202], + [161, 214, 106], + [ 75, 208, 229], + [140, 39, 192], + [183, 116, 110], + [ 73, 104, 186], + [152, 191, 227], + [254, 1, 97], + [193, 189, 73], + [187, 108, 152], + [ 86, 224, 29], + [212, 192, 223], + [130, 109, 55], + [149, 130, 121], + [ 70, 125, 16], + [203, 54, 194], + [ 23, 91, 249], + [ 43, 73, 5], + [ 5, 165, 112], + [189, 148, 214], + [170, 56, 203], + [ 69, 45, 90], + [ 27, 169, 222], + [187, 80, 33] +]) + + + +def crop_scale(motion, scale_range=[1, 1]): + ''' + For input of MotionBERT + Motion: [(M), T, 17, 3]. + Normalize to [-1, 1] + ''' + result = copy.deepcopy(motion) + valid_coords = motion[motion[..., 2]!=0][:,:2] + if len(valid_coords) < 4: + return np.zeros(motion.shape) + xmin = min(valid_coords[:,0]) + xmax = max(valid_coords[:,0]) + ymin = min(valid_coords[:,1]) + ymax = max(valid_coords[:,1]) + ratio = np.random.uniform(low=scale_range[0], high=scale_range[1], size=1)[0] + scale = max(xmax-xmin, ymax-ymin) * ratio + if scale==0: + return np.zeros(motion.shape) + xs = (xmin+xmax-scale) / 2 + ys = (ymin+ymax-scale) / 2 + result[...,:2] = (motion[..., :2]- [xs,ys]) / scale + result[...,:2] = (result[..., :2] - 0.5) * 2 + result = np.clip(result, -1, 1) + return result + + +def coco2h36m(x): + ''' + Input: x ((M )x T x V x C) + + COCO: {0-nose 1-Leye 2-Reye 3-Lear 4Rear 5-Lsho 6-Rsho 7-Lelb 8-Relb 9-Lwri 10-Rwri 11-Lhip 12-Rhip 13-Lkne 14-Rkne 15-Lank 16-Rank} + + H36M: + 0: 'root', + 1: 'rhip', + 2: 'rkne', + 3: 'rank', + 4: 'lhip', + 5: 'lkne', + 6: 'lank', + 7: 'belly', + 8: 'neck', + 9: 'nose', + 10: 'head', + 11: 'lsho', + 12: 'lelb', + 13: 'lwri', + 14: 'rsho', + 15: 'relb', + 16: 'rwri' + ''' + y = np.zeros(x.shape) + y[:,0,:] = (x[:,11,:] + x[:,12,:]) * 0.5 + y[:,1,:] = x[:,12,:] + y[:,2,:] = x[:,14,:] + y[:,3,:] = x[:,16,:] + y[:,4,:] = x[:,11,:] + y[:,5,:] = x[:,13,:] + y[:,6,:] = x[:,15,:] + y[:,8,:] = (x[:,5,:] + x[:,6,:]) * 0.5 + y[:,7,:] = (y[:,0,:] + y[:,8,:]) * 0.5 + y[:,9,:] = x[:,0,:] + y[:,10,:] = (x[:,1,:] + x[:,2,:]) * 0.5 + y[:,11,:] = x[:,5,:] + y[:,12,:] = x[:,7,:] + y[:,13,:] = x[:,9,:] + y[:,14,:] = x[:,6,:] + y[:,15,:] = x[:,8,:] + y[:,16,:] = x[:,10,:] + return y + + +def timeit(func): + @wraps(func) + def wrapper_function(*args, **kwargs): + tic = time.time() + res = func(*args, **kwargs) + tac = time.time() + print("{} {:.0f} ms".format(func.__name__, (tac-tic)*1000), tag = "timer", tag_color = "purple", color = "white") + return res + return wrapper_function diff --git a/visualizer.py b/visualizer.py new file mode 100755 index 0000000..4794746 --- /dev/null +++ b/visualizer.py @@ -0,0 +1,279 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +import rospy +from sensor_msgs.msg import Image, PointCloud2 +from geometry_msgs.msg import Pose, Point +from cv_bridge import 
CvBridge +import cv2 +import os +import numpy as np +import matplotlib.pyplot as plt +from argparse import ArgumentParser +from datetime import datetime +import time +import json +from utils import * + +# remove numpy scientific notation +np.set_printoptions(suppress=True) + +class VisualizerNode(object): + def __init__(self, args): + + self.args = args + + self.rgb = None # Image frame + self.depth = None # Image frame + + self.pcl_array_rgb = None + self.pcl_array_xyz = None + + self.depth_cmap = get_mpl_colormap(args.depth_cmap) + self.depth_array_max_threshold = 3000 + + self.pcl_current_seq = -1 + self.rgb_current_seq = -1 + self.depth_current_seq = -1 + self.current_image_count = 0 + + self.br = CvBridge() + + prInfo("Setting node rate to {} fps".format(args.fps)) + self.loop_rate = rospy.Rate(args.fps) + + # make the output path + now = datetime.now() + timestamp = now.strftime("%Y_%m_%d_%H_%M_%S") + self.save_dir = os.path.join("output", "record_{:s}".format(timestamp)) + self.metadata = os.path.join(self.save_dir, "metadata.json") + self.save_dir_rgb = os.path.join(self.save_dir, "rgb") + self.save_dir_depth = os.path.join(self.save_dir, "depth") + self.save_dir_depth_color = os.path.join(self.save_dir, "depth_color") + self.save_dir_pcl_bin = os.path.join(self.save_dir, "pcl") + + if args.save or args.light_save: + prInfo("Saving to {}/[rgb][depth][depth_color]".format(self.save_dir)) + if not os.path.exists(self.save_dir): + prInfo("Creating directories to {}/[rgb][depth][depth_color]".format(self.save_dir)) + os.makedirs(self.save_dir) + os.makedirs(self.save_dir_rgb) + if not args.no_pcl: + os.makedirs(self.save_dir_pcl_bin) + + if not args.no_depth and args.save: + os.makedirs(self.save_dir_depth) + os.makedirs(self.save_dir_depth_color) + + args_dic = vars(args) + with open(self.metadata, 'w') as fp: + json.dump(args_dic, fp) + + prSuccess("Created directories to {}/[rgb][depth][depth_color][pcl]".format(self.save_dir)) + time.sleep(1) + + + # Subscribers + prInfo("Subscribing to {} for RGB".format(args.rgb_topic)) + self.rgb_sub = rospy.Subscriber(args.rgb_topic, Image, self.callback_rgb) + + if args.no_pcl: + prWarning("No PCL subscriber because option --no_pcl is enabled") + else: + prInfo("Subscribing to {} for PCL".format(args.pcl_topic)) + self.pcl_sub = rospy.Subscriber(args.pcl_topic, PointCloud2, self.callback_pcl) + + if args.no_depth: + prWarning("No depth subscriber because option --no_depth is enabled") + else: + prInfo("Subscribing to {} for depth".format(args.depth_topic)) + self.depth_sub = rospy.Subscriber(args.depth_topic,Image, self.callback_depth) + + def callback_pcl(self, msg): + pcl_array = np.frombuffer(msg.data, dtype=np.float32).reshape((msg.height, msg.width, -1)) + self.pcl_array_xyz = pcl_array[:,:,:3] + self.pcl_array_rgb = pcl_array[:,:,3:] + self.pcl_current_seq = msg.header.seq + rospy.loginfo('pcl received ({})...'.format(msg.header.seq)) + + def callback_rgb(self, msg): + self.rgb = self.br.imgmsg_to_cv2(msg, "bgr8") + self.rgb_current_seq = msg.header.seq + rospy.loginfo('RGB received ({})...'.format(msg.header.seq)) + + def callback_depth(self, msg): + self.depth = self.br.imgmsg_to_cv2(msg, "mono16") + self.depth_current_seq = msg.header.seq + rospy.loginfo('Depth received ({})...'.format(msg.header.seq)) + + def is_ready(self): + ready = (self.rgb is not None) and (self.args.no_depth or self.depth is not None) and (self.args.no_pcl or self.pcl_array_xyz is not None) + return ready + + def start(self): + + if self.args.light_save: + # 
create dict for saving afterwards and avoid losing time + saving_pcl = {} + saving_rgb = {} + + while not rospy.is_shutdown(): + + if self.is_ready(): + + image_count = self.current_image_count + image_seq_unique = self.rgb_current_seq + now = datetime.now() + timestamp = now.strftime("%Y_%m_%d_%H_%M_%S_%f") + + if self.args.save or self.args.light_save: + rgb_path = os.path.join(self.save_dir_rgb, "{:08d}_seq_{:010d}_ts_{}.png".format(image_count, image_seq_unique, timestamp)) + if self.args.save: + cv2.imwrite(rgb_path, self.rgb) + prSuccess("Saved RGB to {}".format(rgb_path)) + else: + saving_rgb[rgb_path] = self.rgb + + rgb_array = np.asarray(self.rgb) + + if not self.args.no_show: + full_display_height = rgb_array.shape[0] if self.args.no_depth else rgb_array.shape[0] * 2 + full_display_width = rgb_array.shape[1] if self.args.no_pcl else rgb_array.shape[1] * 2 + full_display_array = np.zeros((full_display_height, full_display_width, 3), dtype = np.uint8) + + full_display_array[:rgb_array.shape[0], :rgb_array.shape[1] ,:] = rgb_array + + if self.args.no_depth: + depth_array = None + else: + + if self.args.save: + depth_path = os.path.join(self.save_dir_depth, "{:08d}_seq_{:010d}_ts_{}.png".format(image_count, image_seq_unique, timestamp)) + cv2.imwrite(depth_path, self.depth) + prSuccess("Saved depth to {}".format(depth_path)) + + depth_array = np.asarray(self.depth) + depth_array[depth_array > self.depth_array_max_threshold] = self.depth_array_max_threshold + + depth_array_disp = depth_array.copy() + depth_array_disp[depth_array_disp > 3000] = 3000 + depth_array_norm = ((depth_array_disp - depth_array_disp.min())) / (depth_array_disp.max() - depth_array_disp.min()) + # depth_array_norm = ((depth_array - depth_array.min())) / (depth_array.max() - depth_array.min()) + depth_array_norm = depth_array_norm * 255 + depth_array_norm = depth_array_norm.astype(np.uint8) + depth_array_norm_colored = (self.depth_cmap[depth_array_norm] * 255).astype(np.uint8) + + if self.args.save: + depth_color_path = os.path.join(self.save_dir_depth_color, "{:08d}_seq_{:010d}_ts_{}.png".format(image_count, image_seq_unique, timestamp)) + cv2.imwrite(depth_color_path, depth_array_norm_colored) + prSuccess("Saved depth color (scaled) to {}".format(depth_color_path)) + + if not self.args.no_show: + full_display_array[rgb_array.shape[0]:, :rgb_array.shape[1] ,:] = depth_array_norm_colored + + if self.args.no_pcl: + pcl_rgb_norm = None + pcl_xyz_norm = None + else: + if self.args.save or self.args.light_save: + pcl_path = os.path.join(self.save_dir_pcl_bin, "{:08d}_seq_{:010d}_ts_{}.bin".format(image_count, image_seq_unique, timestamp)) + + if self.args.save: + self.pcl_array_xyz.tofile(pcl_path) + prSuccess("Saved pcl to {}".format(pcl_path)) + elif self.args.light_save: + saving_pcl[pcl_path] = self.pcl_array_xyz + + if not self.args.no_show: + pcl_rgb_color = (self.pcl_array_rgb * 255).astype(np.uint8) + max_dist = 3.0 # 3m in any dimension + min_dist = -3.0 # 3m in any dimension + pcl_xyz_crop = self.pcl_array_xyz.copy() + pcl_xyz_crop[pcl_xyz_crop > max_dist] = max_dist + pcl_xyz_crop[pcl_xyz_crop < min_dist] = min_dist + pcl_dist_norm = (pcl_xyz_crop - min_dist) / (max_dist - min_dist) + pcl_dist_color = (pcl_dist_norm * 255).astype(np.uint8) + full_display_array[rgb_array.shape[0]:, rgb_array.shape[1]: ,:] = pcl_rgb_color[:,:,::-1] + full_display_array[:rgb_array.shape[0], rgb_array.shape[1]: ,:] = pcl_dist_color + + if not self.args.no_show: + #format(self.rgb_current_seq, self.depth_current_seq, 
self.pcl_current_seq)
+                    cv2.imshow("RGBD window", full_display_array)
+                    cv2.waitKey(3)
+
+                self.current_image_count += 1
+
+                if self.current_image_count > 1000 and self.args.light_save:
+                    prWarning("Collected 1000 images, closing the subscribers and saving everything!")
+                    self.rgb_sub.unregister()
+                    if not self.args.no_pcl:
+                        self.pcl_sub.unregister()
+                    if not self.args.no_depth:
+                        self.depth_sub.unregister()
+                    break
+                elif self.args.light_save:
+                    prInfo("Collected image {} / 1000 before closing".format(self.current_image_count))
+
+            else:
+                rospy.logwarn("Not displaying/saving images because inputs are not initialized yet (rgb, depth or pcl)")
+
+            self.loop_rate.sleep()
+
+        if self.args.light_save:
+
+            prWarning("Please wait while the buffered images and pcl are saved!")
+
+            if not self.args.no_pcl:
+                for key, value in saving_pcl.items():
+                    value.tofile(key)
+                    prSuccess("Saved pcl to {}".format(key))
+
+            for key, value in saving_rgb.items():
+                cv2.imwrite(key, value)
+                prSuccess("Saved rgb to {}".format(key))
+
+if __name__ == '__main__':
+
+    ## Parser with params
+    parser = ArgumentParser()
+    parser.add_argument('--rgb_topic', default="orbbec/rgb", type=str, help='ROS topic for RGB image')
+    parser.add_argument('--depth_topic', default="orbbec/depth", type=str, help='ROS topic for depth image')
+    parser.add_argument('--pcl_topic', default="orbbec/pcl", type=str, help='ROS topic for pcl')
+    parser.add_argument(
+        '--no_depth',
+        action='store_true',
+        default=False,
+        help='Do not use depth subscriber / recorder / visualizer')
+    parser.add_argument(
+        '--no_pcl',
+        action='store_true',
+        default=False,
+        help='Do not use pcl subscriber / recorder / visualizer')
+    parser.add_argument(
+        '--no_show',
+        action='store_true',
+        default=False,
+        help='Do not show the visualization window')
+    parser.add_argument(
+        '--save',
+        action='store_true',
+        default=False,
+        help='Save rgb, depth and pcl data')
+    parser.add_argument(
+        '--light_save',
+        action='store_true',
+        default=False,
+        help='Save only rgb and pcl (buffered and written at shutdown)')
+    parser.add_argument(
+        '--fps',
+        type=int,
+        default=30,
+        help='Node and recording fps')
+    parser.add_argument('--depth_cmap', default="jet", type=str, help='mpl colormap for depth image')
+
+    args = parser.parse_args()
+    prInfo("Loaded with args: {}".format(args))
+
+    rospy.init_node("python_orbbec_vis_save", anonymous=True)
+    my_node = VisualizerNode(args)
+    my_node.start()
+    cv2.destroyAllWindows()
\ No newline at end of file