From cad1eb55f165768f79acc319dfd7e22952ef7878 Mon Sep 17 00:00:00 2001 From: rlorenzo Date: Tue, 17 Dec 2024 13:37:58 +0100 Subject: [PATCH] Add code without changes --- .gitignore | 7 + Dockerfile | 58 + README.md | 25 +- configs/_base_/datasets/300w.py | 384 ++++ configs/_base_/datasets/aflw.py | 83 + configs/_base_/datasets/aic.py | 140 ++ configs/_base_/datasets/aic_info.py | 140 ++ configs/_base_/datasets/animalpose.py | 166 ++ configs/_base_/datasets/ap10k.py | 142 ++ configs/_base_/datasets/ap10k_info.py | 142 ++ configs/_base_/datasets/atrw.py | 144 ++ configs/_base_/datasets/coco.py | 181 ++ configs/_base_/datasets/coco_wholebody.py | 1154 ++++++++++ .../_base_/datasets/coco_wholebody_face.py | 448 ++++ .../_base_/datasets/coco_wholebody_hand.py | 147 ++ .../_base_/datasets/coco_wholebody_info.py | 1154 ++++++++++ configs/_base_/datasets/cofw.py | 134 ++ configs/_base_/datasets/crowdpose.py | 147 ++ configs/_base_/datasets/deepfashion_full.py | 74 + configs/_base_/datasets/deepfashion_lower.py | 46 + configs/_base_/datasets/deepfashion_upper.py | 60 + configs/_base_/datasets/fly.py | 237 ++ configs/_base_/datasets/freihand2d.py | 144 ++ configs/_base_/datasets/h36m.py | 152 ++ configs/_base_/datasets/halpe.py | 1157 ++++++++++ configs/_base_/datasets/horse10.py | 201 ++ configs/_base_/datasets/interhand2d.py | 142 ++ configs/_base_/datasets/interhand3d.py | 487 ++++ configs/_base_/datasets/jhmdb.py | 129 ++ configs/_base_/datasets/locust.py | 263 +++ configs/_base_/datasets/macaque.py | 183 ++ configs/_base_/datasets/mhp.py | 156 ++ configs/_base_/datasets/mpi_inf_3dhp.py | 132 ++ configs/_base_/datasets/mpii.py | 155 ++ configs/_base_/datasets/mpii_info.py | 155 ++ configs/_base_/datasets/mpii_trb.py | 380 ++++ configs/_base_/datasets/ochuman.py | 181 ++ configs/_base_/datasets/onehand10k.py | 142 ++ configs/_base_/datasets/panoptic_body3d.py | 160 ++ configs/_base_/datasets/panoptic_hand2d.py | 143 ++ configs/_base_/datasets/posetrack18.py | 176 ++ configs/_base_/datasets/rhd2d.py | 141 ++ configs/_base_/datasets/wflw.py | 582 +++++ configs/_base_/datasets/zebra.py | 64 + configs/_base_/default_runtime.py | 19 + configs/_base_/filters/gausian_filter.py | 0 configs/detection/yolo_classes.py | 84 + configs/detection/yolov3_d53_320_273e_coco.py | 140 ++ configs/pose/ViTPose_base_coco_256x192.py | 170 ++ .../pose/ViTPose_base_simple_coco_256x192.py | 171 ++ configs/pose/ViTPose_small_coco_256x192.py | 170 ++ configs/pose3d/MB_ft_h36m.yaml | 50 + gafa_utils.py | 448 ++++ launch.sh | 29 + lib/data/augmentation.py | 99 + lib/data/datareader_h36m.py | 136 ++ lib/data/datareader_mesh.py | 59 + lib/data/dataset_action.py | 206 ++ lib/data/dataset_mesh.py | 97 + lib/data/dataset_motion_2d.py | 148 ++ lib/data/dataset_motion_3d.py | 68 + lib/data/dataset_wild.py | 185 ++ lib/model/DSTformer.py | 362 +++ lib/model/drop.py | 43 + lib/model/loss.py | 204 ++ lib/model/loss_mesh.py | 68 + lib/model/loss_supcon.py | 98 + lib/model/model_action.py | 71 + lib/model/model_mesh.py | 101 + lib/utils/learning.py | 102 + lib/utils/tools.py | 69 + lib/utils/utils_data.py | 112 + lib/utils/utils_mesh.py | 521 +++++ lib/utils/utils_smpl.py | 88 + lib/utils/vismo.py | 347 +++ mmpose/.mim/configs | 1 + mmpose/.mim/demo | 1 + mmpose/.mim/model-index.yml | 1 + mmpose/.mim/tools | 1 + mmpose/__init__.py | 29 + mmpose/apis/__init__.py | 20 + mmpose/apis/inference.py | 833 +++++++ mmpose/apis/inference_3d.py | 791 +++++++ mmpose/apis/inference_tracking.py | 347 +++ mmpose/apis/test.py | 191 ++ 
mmpose/apis/train.py | 200 ++ mmpose/core/__init__.py | 8 + mmpose/core/camera/__init__.py | 6 + mmpose/core/camera/camera_base.py | 45 + mmpose/core/camera/single_camera.py | 123 + mmpose/core/camera/single_camera_torch.py | 118 + mmpose/core/distributed_wrapper.py | 143 ++ mmpose/core/evaluation/__init__.py | 22 + mmpose/core/evaluation/bottom_up_eval.py | 333 +++ mmpose/core/evaluation/eval_hooks.py | 98 + mmpose/core/evaluation/mesh_eval.py | 66 + mmpose/core/evaluation/pose3d_eval.py | 171 ++ mmpose/core/evaluation/top_down_eval.py | 684 ++++++ mmpose/core/fp16/__init__.py | 9 + mmpose/core/fp16/decorators.py | 175 ++ mmpose/core/fp16/hooks.py | 167 ++ mmpose/core/fp16/utils.py | 34 + mmpose/core/optimizer/__init__.py | 4 + mmpose/core/optimizer/builder.py | 56 + mmpose/core/post_processing/__init__.py | 14 + mmpose/core/post_processing/group.py | 410 ++++ mmpose/core/post_processing/nms.py | 207 ++ .../core/post_processing/one_euro_filter.py | 102 + .../core/post_processing/post_transforms.py | 366 +++ mmpose/core/utils/__init__.py | 5 + mmpose/core/utils/dist_utils.py | 51 + mmpose/core/utils/regularizations.py | 86 + mmpose/core/visualization/__init__.py | 13 + mmpose/core/visualization/effects.py | 111 + mmpose/core/visualization/image.py | 442 ++++ mmpose/datasets/__init__.py | 42 + mmpose/datasets/builder.py | 162 ++ mmpose/datasets/dataset_info.py | 104 + mmpose/datasets/dataset_wrappers.py | 31 + mmpose/datasets/datasets/__init__.py | 45 + mmpose/datasets/datasets/animal/__init__.py | 15 + .../datasets/animal/animal_ap10k_dataset.py | 367 +++ .../datasets/animal/animal_atrw_dataset.py | 353 +++ .../datasets/animal/animal_base_dataset.py | 16 + .../datasets/animal/animal_fly_dataset.py | 215 ++ .../datasets/animal/animal_horse10_dataset.py | 220 ++ .../datasets/animal/animal_locust_dataset.py | 218 ++ .../datasets/animal/animal_macaque_dataset.py | 355 +++ .../datasets/animal/animal_pose_dataset.py | 359 +++ .../datasets/animal/animal_zebra_dataset.py | 193 ++ mmpose/datasets/datasets/base/__init__.py | 17 + .../kpt_2d_sview_rgb_img_bottom_up_dataset.py | 188 ++ .../kpt_2d_sview_rgb_img_top_down_dataset.py | 287 +++ .../kpt_2d_sview_rgb_vid_top_down_dataset.py | 200 ++ .../kpt_3d_mview_rgb_img_direct_dataset.py | 143 ++ .../base/kpt_3d_sview_kpt_2d_dataset.py | 226 ++ .../kpt_3d_sview_rgb_img_top_down_dataset.py | 256 +++ mmpose/datasets/datasets/body3d/__init__.py | 11 + .../datasets/body3d/body3d_base_dataset.py | 16 + .../datasets/body3d/body3d_h36m_dataset.py | 343 +++ .../body3d/body3d_mpi_inf_3dhp_dataset.py | 417 ++++ .../body3d_mview_direct_panoptic_dataset.py | 493 ++++ .../body3d/body3d_semi_supervision_dataset.py | 41 + .../datasets/datasets/bottom_up/__init__.py | 11 + .../datasets/bottom_up/bottom_up_aic.py | 105 + .../bottom_up/bottom_up_base_dataset.py | 14 + .../datasets/bottom_up/bottom_up_coco.py | 305 +++ .../bottom_up/bottom_up_coco_wholebody.py | 238 ++ .../datasets/bottom_up/bottom_up_crowdpose.py | 109 + .../datasets/bottom_up/bottom_up_mhp.py | 108 + mmpose/datasets/datasets/face/__init__.py | 11 + .../datasets/face/face_300w_dataset.py | 199 ++ .../datasets/face/face_aflw_dataset.py | 205 ++ .../datasets/face/face_base_dataset.py | 16 + .../face/face_coco_wholebody_dataset.py | 198 ++ .../datasets/face/face_cofw_dataset.py | 198 ++ .../datasets/face/face_wflw_dataset.py | 199 ++ mmpose/datasets/datasets/fashion/__init__.py | 4 + .../datasets/fashion/deepfashion_dataset.py | 225 ++ .../datasets/fashion/fashion_base_dataset.py | 16 + 
mmpose/datasets/datasets/hand/__init__.py | 14 + .../datasets/hand/freihand_dataset.py | 205 ++ .../datasets/hand/hand_base_dataset.py | 16 + .../hand/hand_coco_wholebody_dataset.py | 211 ++ .../datasets/hand/interhand2d_dataset.py | 306 +++ .../datasets/hand/interhand3d_dataset.py | 505 ++++ .../datasets/hand/onehand10k_dataset.py | 205 ++ .../datasets/hand/panoptic_hand2d_dataset.py | 208 ++ .../datasets/datasets/hand/rhd2d_dataset.py | 205 ++ mmpose/datasets/datasets/mesh/__init__.py | 10 + .../datasets/mesh/mesh_adv_dataset.py | 43 + .../datasets/mesh/mesh_base_dataset.py | 155 ++ .../datasets/mesh/mesh_h36m_dataset.py | 101 + .../datasets/mesh/mesh_mix_dataset.py | 73 + mmpose/datasets/datasets/mesh/mosh_dataset.py | 68 + mmpose/datasets/datasets/top_down/__init__.py | 30 + .../datasets/top_down/topdown_aic_dataset.py | 112 + .../datasets/top_down/topdown_base_dataset.py | 16 + .../datasets/top_down/topdown_coco_dataset.py | 405 ++++ .../topdown_coco_wholebody_dataset.py | 274 +++ .../top_down/topdown_crowdpose_dataset.py | 110 + .../datasets/top_down/topdown_h36m_dataset.py | 206 ++ .../top_down/topdown_halpe_dataset.py | 77 + .../top_down/topdown_jhmdb_dataset.py | 361 +++ .../datasets/top_down/topdown_mhp_dataset.py | 125 + .../datasets/top_down/topdown_mpii_dataset.py | 275 +++ .../top_down/topdown_mpii_trb_dataset.py | 310 +++ .../top_down/topdown_ochuman_dataset.py | 97 + .../top_down/topdown_posetrack18_dataset.py | 312 +++ .../topdown_posetrack18_video_dataset.py | 549 +++++ mmpose/datasets/pipelines/__init__.py | 8 + .../datasets/pipelines/bottom_up_transform.py | 816 +++++++ mmpose/datasets/pipelines/hand_transform.py | 63 + mmpose/datasets/pipelines/loading.py | 91 + mmpose/datasets/pipelines/mesh_transform.py | 399 ++++ mmpose/datasets/pipelines/pose3d_transform.py | 643 ++++++ mmpose/datasets/pipelines/shared_transform.py | 527 +++++ .../datasets/pipelines/top_down_transform.py | 736 ++++++ mmpose/datasets/registry.py | 13 + mmpose/datasets/samplers/__init__.py | 4 + .../datasets/samplers/distributed_sampler.py | 41 + mmpose/deprecated.py | 199 ++ mmpose/models/__init__.py | 16 + mmpose/models/backbones/__init__.py | 36 + mmpose/models/backbones/alexnet.py | 56 + mmpose/models/backbones/base_backbone.py | 43 + mmpose/models/backbones/cpm.py | 186 ++ mmpose/models/backbones/hourglass.py | 212 ++ mmpose/models/backbones/hourglass_ae.py | 212 ++ mmpose/models/backbones/hrformer.py | 746 ++++++ mmpose/models/backbones/hrnet.py | 604 +++++ mmpose/models/backbones/litehrnet.py | 984 ++++++++ mmpose/models/backbones/mobilenet_v2.py | 275 +++ mmpose/models/backbones/mobilenet_v3.py | 188 ++ mmpose/models/backbones/mspn.py | 513 +++++ mmpose/models/backbones/regnet.py | 317 +++ mmpose/models/backbones/resnest.py | 338 +++ mmpose/models/backbones/resnet.py | 701 ++++++ mmpose/models/backbones/resnext.py | 162 ++ mmpose/models/backbones/rsn.py | 616 +++++ mmpose/models/backbones/scnet.py | 248 ++ mmpose/models/backbones/seresnet.py | 125 + mmpose/models/backbones/seresnext.py | 168 ++ mmpose/models/backbones/shufflenet_v1.py | 329 +++ mmpose/models/backbones/shufflenet_v2.py | 302 +++ mmpose/models/backbones/tcn.py | 267 +++ mmpose/models/backbones/utils/__init__.py | 11 + .../models/backbones/utils/channel_shuffle.py | 29 + .../backbones/utils/inverted_residual.py | 128 ++ .../models/backbones/utils/make_divisible.py | 25 + mmpose/models/backbones/utils/se_layer.py | 54 + mmpose/models/backbones/utils/utils.py | 87 + mmpose/models/backbones/v2v_net.py | 257 +++ 
mmpose/models/backbones/vgg.py | 193 ++ mmpose/models/backbones/vipnas_mbv3.py | 179 ++ mmpose/models/backbones/vipnas_resnet.py | 589 +++++ mmpose/models/backbones/vit.py | 341 +++ mmpose/models/backbones/vit_moe.py | 385 ++++ mmpose/models/builder.py | 44 + mmpose/models/detectors/__init__.py | 17 + .../models/detectors/associative_embedding.py | 420 ++++ mmpose/models/detectors/base.py | 131 ++ mmpose/models/detectors/interhand_3d.py | 227 ++ mmpose/models/detectors/mesh.py | 438 ++++ mmpose/models/detectors/multi_task.py | 187 ++ mmpose/models/detectors/multiview_pose.py | 889 ++++++++ mmpose/models/detectors/pose_lifter.py | 392 ++++ mmpose/models/detectors/posewarper.py | 244 ++ mmpose/models/detectors/top_down.py | 307 +++ mmpose/models/detectors/top_down_moe.py | 351 +++ mmpose/models/heads/__init__.py | 24 + .../models/heads/ae_higher_resolution_head.py | 249 ++ mmpose/models/heads/ae_multi_stage_head.py | 222 ++ mmpose/models/heads/ae_simple_head.py | 99 + mmpose/models/heads/deconv_head.py | 295 +++ .../models/heads/deeppose_regression_head.py | 176 ++ mmpose/models/heads/hmr_head.py | 94 + mmpose/models/heads/interhand_3d_head.py | 521 +++++ .../models/heads/temporal_regression_head.py | 319 +++ .../models/heads/topdown_heatmap_base_head.py | 120 + .../heads/topdown_heatmap_multi_stage_head.py | 572 +++++ .../heads/topdown_heatmap_simple_head.py | 350 +++ .../heads/vipnas_heatmap_simple_head.py | 349 +++ mmpose/models/heads/voxelpose_head.py | 167 ++ mmpose/models/losses/__init__.py | 16 + mmpose/models/losses/classfication_loss.py | 41 + mmpose/models/losses/heatmap_loss.py | 86 + mmpose/models/losses/mesh_loss.py | 340 +++ mmpose/models/losses/mse_loss.py | 153 ++ mmpose/models/losses/multi_loss_factory.py | 281 +++ mmpose/models/losses/regression_loss.py | 448 ++++ mmpose/models/misc/__init__.py | 1 + mmpose/models/misc/discriminator.py | 307 +++ mmpose/models/necks/__init__.py | 5 + mmpose/models/necks/gap_neck.py | 37 + mmpose/models/necks/posewarper_neck.py | 329 +++ mmpose/models/registry.py | 13 + mmpose/models/utils/__init__.py | 4 + mmpose/models/utils/geometry.py | 68 + mmpose/models/utils/ops.py | 29 + mmpose/models/utils/smpl.py | 184 ++ mmpose/utils/__init__.py | 9 + mmpose/utils/collect_env.py | 16 + mmpose/utils/hooks.py | 60 + mmpose/utils/logger.py | 25 + mmpose/utils/setup_env.py | 47 + mmpose/utils/timer.py | 117 + mmpose/version.py | 19 + nets/nn.py | 278 +++ rgbd_3d.py | 765 +++++++ rgbd_detect.py | 1011 ++++++++ rgbd_detect_3d_dir.py | 2025 +++++++++++++++++ run.sh | 1 + sixdrep/util.py | 442 ++++ sixdrep/utils.py | 8 + utils.py | 501 ++++ visualizer.py | 279 +++ 297 files changed, 63275 insertions(+), 1 deletion(-) create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 configs/_base_/datasets/300w.py create mode 100644 configs/_base_/datasets/aflw.py create mode 100644 configs/_base_/datasets/aic.py create mode 100644 configs/_base_/datasets/aic_info.py create mode 100644 configs/_base_/datasets/animalpose.py create mode 100644 configs/_base_/datasets/ap10k.py create mode 100644 configs/_base_/datasets/ap10k_info.py create mode 100644 configs/_base_/datasets/atrw.py create mode 100644 configs/_base_/datasets/coco.py create mode 100644 configs/_base_/datasets/coco_wholebody.py create mode 100644 configs/_base_/datasets/coco_wholebody_face.py create mode 100644 configs/_base_/datasets/coco_wholebody_hand.py create mode 100644 configs/_base_/datasets/coco_wholebody_info.py create mode 100644 configs/_base_/datasets/cofw.py create 
mode 100644 configs/_base_/datasets/crowdpose.py create mode 100644 configs/_base_/datasets/deepfashion_full.py create mode 100644 configs/_base_/datasets/deepfashion_lower.py create mode 100644 configs/_base_/datasets/deepfashion_upper.py create mode 100644 configs/_base_/datasets/fly.py create mode 100644 configs/_base_/datasets/freihand2d.py create mode 100644 configs/_base_/datasets/h36m.py create mode 100644 configs/_base_/datasets/halpe.py create mode 100644 configs/_base_/datasets/horse10.py create mode 100644 configs/_base_/datasets/interhand2d.py create mode 100644 configs/_base_/datasets/interhand3d.py create mode 100644 configs/_base_/datasets/jhmdb.py create mode 100644 configs/_base_/datasets/locust.py create mode 100644 configs/_base_/datasets/macaque.py create mode 100644 configs/_base_/datasets/mhp.py create mode 100644 configs/_base_/datasets/mpi_inf_3dhp.py create mode 100644 configs/_base_/datasets/mpii.py create mode 100644 configs/_base_/datasets/mpii_info.py create mode 100644 configs/_base_/datasets/mpii_trb.py create mode 100644 configs/_base_/datasets/ochuman.py create mode 100644 configs/_base_/datasets/onehand10k.py create mode 100644 configs/_base_/datasets/panoptic_body3d.py create mode 100644 configs/_base_/datasets/panoptic_hand2d.py create mode 100644 configs/_base_/datasets/posetrack18.py create mode 100644 configs/_base_/datasets/rhd2d.py create mode 100644 configs/_base_/datasets/wflw.py create mode 100644 configs/_base_/datasets/zebra.py create mode 100644 configs/_base_/default_runtime.py create mode 100644 configs/_base_/filters/gausian_filter.py create mode 100644 configs/detection/yolo_classes.py create mode 100644 configs/detection/yolov3_d53_320_273e_coco.py create mode 100644 configs/pose/ViTPose_base_coco_256x192.py create mode 100644 configs/pose/ViTPose_base_simple_coco_256x192.py create mode 100644 configs/pose/ViTPose_small_coco_256x192.py create mode 100644 configs/pose3d/MB_ft_h36m.yaml create mode 100644 gafa_utils.py create mode 100644 launch.sh create mode 100644 lib/data/augmentation.py create mode 100644 lib/data/datareader_h36m.py create mode 100644 lib/data/datareader_mesh.py create mode 100644 lib/data/dataset_action.py create mode 100644 lib/data/dataset_mesh.py create mode 100644 lib/data/dataset_motion_2d.py create mode 100644 lib/data/dataset_motion_3d.py create mode 100644 lib/data/dataset_wild.py create mode 100644 lib/model/DSTformer.py create mode 100644 lib/model/drop.py create mode 100644 lib/model/loss.py create mode 100644 lib/model/loss_mesh.py create mode 100644 lib/model/loss_supcon.py create mode 100644 lib/model/model_action.py create mode 100644 lib/model/model_mesh.py create mode 100644 lib/utils/learning.py create mode 100644 lib/utils/tools.py create mode 100644 lib/utils/utils_data.py create mode 100644 lib/utils/utils_mesh.py create mode 100644 lib/utils/utils_smpl.py create mode 100644 lib/utils/vismo.py create mode 120000 mmpose/.mim/configs create mode 120000 mmpose/.mim/demo create mode 120000 mmpose/.mim/model-index.yml create mode 120000 mmpose/.mim/tools create mode 100644 mmpose/__init__.py create mode 100644 mmpose/apis/__init__.py create mode 100644 mmpose/apis/inference.py create mode 100644 mmpose/apis/inference_3d.py create mode 100644 mmpose/apis/inference_tracking.py create mode 100644 mmpose/apis/test.py create mode 100644 mmpose/apis/train.py create mode 100644 mmpose/core/__init__.py create mode 100644 mmpose/core/camera/__init__.py create mode 100644 mmpose/core/camera/camera_base.py create 
mode 100644 mmpose/core/camera/single_camera.py create mode 100644 mmpose/core/camera/single_camera_torch.py create mode 100644 mmpose/core/distributed_wrapper.py create mode 100644 mmpose/core/evaluation/__init__.py create mode 100644 mmpose/core/evaluation/bottom_up_eval.py create mode 100644 mmpose/core/evaluation/eval_hooks.py create mode 100644 mmpose/core/evaluation/mesh_eval.py create mode 100644 mmpose/core/evaluation/pose3d_eval.py create mode 100644 mmpose/core/evaluation/top_down_eval.py create mode 100644 mmpose/core/fp16/__init__.py create mode 100644 mmpose/core/fp16/decorators.py create mode 100644 mmpose/core/fp16/hooks.py create mode 100644 mmpose/core/fp16/utils.py create mode 100644 mmpose/core/optimizer/__init__.py create mode 100644 mmpose/core/optimizer/builder.py create mode 100644 mmpose/core/post_processing/__init__.py create mode 100644 mmpose/core/post_processing/group.py create mode 100644 mmpose/core/post_processing/nms.py create mode 100644 mmpose/core/post_processing/one_euro_filter.py create mode 100644 mmpose/core/post_processing/post_transforms.py create mode 100644 mmpose/core/utils/__init__.py create mode 100644 mmpose/core/utils/dist_utils.py create mode 100644 mmpose/core/utils/regularizations.py create mode 100644 mmpose/core/visualization/__init__.py create mode 100644 mmpose/core/visualization/effects.py create mode 100644 mmpose/core/visualization/image.py create mode 100644 mmpose/datasets/__init__.py create mode 100644 mmpose/datasets/builder.py create mode 100644 mmpose/datasets/dataset_info.py create mode 100644 mmpose/datasets/dataset_wrappers.py create mode 100644 mmpose/datasets/datasets/__init__.py create mode 100644 mmpose/datasets/datasets/animal/__init__.py create mode 100644 mmpose/datasets/datasets/animal/animal_ap10k_dataset.py create mode 100644 mmpose/datasets/datasets/animal/animal_atrw_dataset.py create mode 100644 mmpose/datasets/datasets/animal/animal_base_dataset.py create mode 100644 mmpose/datasets/datasets/animal/animal_fly_dataset.py create mode 100644 mmpose/datasets/datasets/animal/animal_horse10_dataset.py create mode 100644 mmpose/datasets/datasets/animal/animal_locust_dataset.py create mode 100644 mmpose/datasets/datasets/animal/animal_macaque_dataset.py create mode 100644 mmpose/datasets/datasets/animal/animal_pose_dataset.py create mode 100644 mmpose/datasets/datasets/animal/animal_zebra_dataset.py create mode 100644 mmpose/datasets/datasets/base/__init__.py create mode 100644 mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_bottom_up_dataset.py create mode 100644 mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py create mode 100644 mmpose/datasets/datasets/base/kpt_2d_sview_rgb_vid_top_down_dataset.py create mode 100644 mmpose/datasets/datasets/base/kpt_3d_mview_rgb_img_direct_dataset.py create mode 100644 mmpose/datasets/datasets/base/kpt_3d_sview_kpt_2d_dataset.py create mode 100644 mmpose/datasets/datasets/base/kpt_3d_sview_rgb_img_top_down_dataset.py create mode 100644 mmpose/datasets/datasets/body3d/__init__.py create mode 100644 mmpose/datasets/datasets/body3d/body3d_base_dataset.py create mode 100644 mmpose/datasets/datasets/body3d/body3d_h36m_dataset.py create mode 100644 mmpose/datasets/datasets/body3d/body3d_mpi_inf_3dhp_dataset.py create mode 100644 mmpose/datasets/datasets/body3d/body3d_mview_direct_panoptic_dataset.py create mode 100644 mmpose/datasets/datasets/body3d/body3d_semi_supervision_dataset.py create mode 100644 mmpose/datasets/datasets/bottom_up/__init__.py create mode 
100644 mmpose/datasets/datasets/bottom_up/bottom_up_aic.py create mode 100644 mmpose/datasets/datasets/bottom_up/bottom_up_base_dataset.py create mode 100644 mmpose/datasets/datasets/bottom_up/bottom_up_coco.py create mode 100644 mmpose/datasets/datasets/bottom_up/bottom_up_coco_wholebody.py create mode 100644 mmpose/datasets/datasets/bottom_up/bottom_up_crowdpose.py create mode 100644 mmpose/datasets/datasets/bottom_up/bottom_up_mhp.py create mode 100644 mmpose/datasets/datasets/face/__init__.py create mode 100644 mmpose/datasets/datasets/face/face_300w_dataset.py create mode 100644 mmpose/datasets/datasets/face/face_aflw_dataset.py create mode 100644 mmpose/datasets/datasets/face/face_base_dataset.py create mode 100644 mmpose/datasets/datasets/face/face_coco_wholebody_dataset.py create mode 100644 mmpose/datasets/datasets/face/face_cofw_dataset.py create mode 100644 mmpose/datasets/datasets/face/face_wflw_dataset.py create mode 100644 mmpose/datasets/datasets/fashion/__init__.py create mode 100644 mmpose/datasets/datasets/fashion/deepfashion_dataset.py create mode 100644 mmpose/datasets/datasets/fashion/fashion_base_dataset.py create mode 100644 mmpose/datasets/datasets/hand/__init__.py create mode 100644 mmpose/datasets/datasets/hand/freihand_dataset.py create mode 100644 mmpose/datasets/datasets/hand/hand_base_dataset.py create mode 100644 mmpose/datasets/datasets/hand/hand_coco_wholebody_dataset.py create mode 100644 mmpose/datasets/datasets/hand/interhand2d_dataset.py create mode 100644 mmpose/datasets/datasets/hand/interhand3d_dataset.py create mode 100644 mmpose/datasets/datasets/hand/onehand10k_dataset.py create mode 100644 mmpose/datasets/datasets/hand/panoptic_hand2d_dataset.py create mode 100644 mmpose/datasets/datasets/hand/rhd2d_dataset.py create mode 100644 mmpose/datasets/datasets/mesh/__init__.py create mode 100644 mmpose/datasets/datasets/mesh/mesh_adv_dataset.py create mode 100644 mmpose/datasets/datasets/mesh/mesh_base_dataset.py create mode 100644 mmpose/datasets/datasets/mesh/mesh_h36m_dataset.py create mode 100644 mmpose/datasets/datasets/mesh/mesh_mix_dataset.py create mode 100644 mmpose/datasets/datasets/mesh/mosh_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/__init__.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_aic_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_base_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_coco_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_coco_wholebody_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_crowdpose_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_h36m_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_halpe_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_jhmdb_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_mhp_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_mpii_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_mpii_trb_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_ochuman_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_posetrack18_dataset.py create mode 100644 mmpose/datasets/datasets/top_down/topdown_posetrack18_video_dataset.py create mode 100644 mmpose/datasets/pipelines/__init__.py create mode 100644 mmpose/datasets/pipelines/bottom_up_transform.py create mode 100644 
mmpose/datasets/pipelines/hand_transform.py create mode 100644 mmpose/datasets/pipelines/loading.py create mode 100644 mmpose/datasets/pipelines/mesh_transform.py create mode 100644 mmpose/datasets/pipelines/pose3d_transform.py create mode 100644 mmpose/datasets/pipelines/shared_transform.py create mode 100644 mmpose/datasets/pipelines/top_down_transform.py create mode 100644 mmpose/datasets/registry.py create mode 100644 mmpose/datasets/samplers/__init__.py create mode 100644 mmpose/datasets/samplers/distributed_sampler.py create mode 100644 mmpose/deprecated.py create mode 100644 mmpose/models/__init__.py create mode 100644 mmpose/models/backbones/__init__.py create mode 100644 mmpose/models/backbones/alexnet.py create mode 100644 mmpose/models/backbones/base_backbone.py create mode 100644 mmpose/models/backbones/cpm.py create mode 100644 mmpose/models/backbones/hourglass.py create mode 100644 mmpose/models/backbones/hourglass_ae.py create mode 100644 mmpose/models/backbones/hrformer.py create mode 100644 mmpose/models/backbones/hrnet.py create mode 100644 mmpose/models/backbones/litehrnet.py create mode 100644 mmpose/models/backbones/mobilenet_v2.py create mode 100644 mmpose/models/backbones/mobilenet_v3.py create mode 100644 mmpose/models/backbones/mspn.py create mode 100644 mmpose/models/backbones/regnet.py create mode 100644 mmpose/models/backbones/resnest.py create mode 100644 mmpose/models/backbones/resnet.py create mode 100644 mmpose/models/backbones/resnext.py create mode 100644 mmpose/models/backbones/rsn.py create mode 100644 mmpose/models/backbones/scnet.py create mode 100644 mmpose/models/backbones/seresnet.py create mode 100644 mmpose/models/backbones/seresnext.py create mode 100644 mmpose/models/backbones/shufflenet_v1.py create mode 100644 mmpose/models/backbones/shufflenet_v2.py create mode 100644 mmpose/models/backbones/tcn.py create mode 100644 mmpose/models/backbones/utils/__init__.py create mode 100644 mmpose/models/backbones/utils/channel_shuffle.py create mode 100644 mmpose/models/backbones/utils/inverted_residual.py create mode 100644 mmpose/models/backbones/utils/make_divisible.py create mode 100644 mmpose/models/backbones/utils/se_layer.py create mode 100644 mmpose/models/backbones/utils/utils.py create mode 100644 mmpose/models/backbones/v2v_net.py create mode 100644 mmpose/models/backbones/vgg.py create mode 100644 mmpose/models/backbones/vipnas_mbv3.py create mode 100644 mmpose/models/backbones/vipnas_resnet.py create mode 100644 mmpose/models/backbones/vit.py create mode 100644 mmpose/models/backbones/vit_moe.py create mode 100644 mmpose/models/builder.py create mode 100644 mmpose/models/detectors/__init__.py create mode 100644 mmpose/models/detectors/associative_embedding.py create mode 100644 mmpose/models/detectors/base.py create mode 100644 mmpose/models/detectors/interhand_3d.py create mode 100644 mmpose/models/detectors/mesh.py create mode 100644 mmpose/models/detectors/multi_task.py create mode 100644 mmpose/models/detectors/multiview_pose.py create mode 100644 mmpose/models/detectors/pose_lifter.py create mode 100644 mmpose/models/detectors/posewarper.py create mode 100644 mmpose/models/detectors/top_down.py create mode 100644 mmpose/models/detectors/top_down_moe.py create mode 100644 mmpose/models/heads/__init__.py create mode 100644 mmpose/models/heads/ae_higher_resolution_head.py create mode 100644 mmpose/models/heads/ae_multi_stage_head.py create mode 100644 mmpose/models/heads/ae_simple_head.py create mode 100644 
mmpose/models/heads/deconv_head.py create mode 100644 mmpose/models/heads/deeppose_regression_head.py create mode 100644 mmpose/models/heads/hmr_head.py create mode 100644 mmpose/models/heads/interhand_3d_head.py create mode 100644 mmpose/models/heads/temporal_regression_head.py create mode 100644 mmpose/models/heads/topdown_heatmap_base_head.py create mode 100644 mmpose/models/heads/topdown_heatmap_multi_stage_head.py create mode 100644 mmpose/models/heads/topdown_heatmap_simple_head.py create mode 100644 mmpose/models/heads/vipnas_heatmap_simple_head.py create mode 100644 mmpose/models/heads/voxelpose_head.py create mode 100644 mmpose/models/losses/__init__.py create mode 100644 mmpose/models/losses/classfication_loss.py create mode 100644 mmpose/models/losses/heatmap_loss.py create mode 100644 mmpose/models/losses/mesh_loss.py create mode 100644 mmpose/models/losses/mse_loss.py create mode 100644 mmpose/models/losses/multi_loss_factory.py create mode 100644 mmpose/models/losses/regression_loss.py create mode 100644 mmpose/models/misc/__init__.py create mode 100644 mmpose/models/misc/discriminator.py create mode 100644 mmpose/models/necks/__init__.py create mode 100644 mmpose/models/necks/gap_neck.py create mode 100644 mmpose/models/necks/posewarper_neck.py create mode 100644 mmpose/models/registry.py create mode 100644 mmpose/models/utils/__init__.py create mode 100644 mmpose/models/utils/geometry.py create mode 100644 mmpose/models/utils/ops.py create mode 100644 mmpose/models/utils/smpl.py create mode 100644 mmpose/utils/__init__.py create mode 100644 mmpose/utils/collect_env.py create mode 100644 mmpose/utils/hooks.py create mode 100644 mmpose/utils/logger.py create mode 100644 mmpose/utils/setup_env.py create mode 100644 mmpose/utils/timer.py create mode 100644 mmpose/version.py create mode 100644 nets/nn.py create mode 100755 rgbd_3d.py create mode 100644 rgbd_detect.py create mode 100644 rgbd_detect_3d_dir.py create mode 100644 run.sh create mode 100644 sixdrep/util.py create mode 100644 sixdrep/utils.py create mode 100644 utils.py create mode 100755 visualizer.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0caa42d --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +*.pyc +*.pth +*.pt +*.zip +*.deb +*.bin +*.onnx \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..2c4778a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,58 @@ +ARG PYTORCH="1.8.0" +ARG CUDA="11.1" +ARG CUDNN="8" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel +COPY ./cuda-keyring_1.0-1_all.deb cuda-keyring_1.0-1_all.deb +RUN rm /etc/apt/sources.list.d/cuda.list && rm /etc/apt/sources.list.d/nvidia-ml.list && dpkg -i cuda-keyring_1.0-1_all.deb +RUN apt-get update + +RUN apt-get install -y software-properties-common +RUN apt-get update +RUN add-apt-repository ppa:ubuntu-toolchain-r/test +RUN apt install -y gcc-9 +RUN apt-get install libstdc++6 +RUN apt-get update + + +ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX" +ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" +ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" + +# Refer a advise in the issues page +#RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +#RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + + +RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx\ + && apt-get clean 
\ + && rm -rf /var/lib/apt/lists/* + +# Install xtcocotools +RUN pip install cython +RUN pip install xtcocotools + +# Install MMCV +RUN pip install mmcv-full==1.3.17 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html + + +#System full upgrade +RUN apt-get update && apt-get --with-new-pkgs upgrade -y + + +# Install Python dependencies for rgbd_detect +RUN pip install mmdet==2.28.2 +RUN git clone https://github.com/ViTAE-Transformer/ViTPose/ +WORKDIR /workspace/ViTPose +RUN pip install -v -e . +RUN pip install timm==0.4.9 einops +RUN pip install print-color +RUN pip install --extra-index-url https://rospypi.github.io/simple/ rospy +RUN pip install -U --extra-index-url https://rospypi.github.io/simple/_pre sensor_msgs tf2_ros tf2_sensor_msgs tf tf2-py +RUN pip install -U --extra-index-url https://rospypi.github.io/simple/_pre cv_bridge +RUN pip install albumentations==1.1.0 +RUN pip install onnxruntime +RUN pip uninstall opencv-python -y +RUN pip install opencv-python==4.10.0.84 + +WORKDIR /workspace/rgbd_pose_and_depth diff --git a/README.md b/README.md index 09c38c3..316dc86 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,19 @@ The pipeline consists of the following main components : - Gaze estimation ## Installation -How to install with docker +How to install with docker and launch interactively + +- Build docker + +```bash +docker build -t inria_docker:rgbd_detect . +``` + +- Launch docker + +```bash +sh launch.sh +``` ## ROS Interface Input / output topics and format @@ -21,3 +33,14 @@ Flags to run with options ## Acknowledgments MMDet, MMPose, 6DRepNet... + + +## TODO +- Clean the code of unnecessary options (keep only sixdrep) +- Remove unnecessary dependencies +- Keep only rgbd_detect_3d_dir.py +- Add models in external links, LFS or directly from their source in the corresponding repos +- Add a description of all options +- Add ROS interface description +- Add illustration +- Add instructions and `requirements.txt` for local installation \ No newline at end of file diff --git a/configs/_base_/datasets/300w.py b/configs/_base_/datasets/300w.py new file mode 100644 index 0000000..10c343a --- /dev/null +++ b/configs/_base_/datasets/300w.py @@ -0,0 +1,384 @@ +dataset_info = dict( + dataset_name='300w', + paper_info=dict( + author='Sagonas, Christos and Antonakos, Epameinondas ' + 'and Tzimiropoulos, Georgios and Zafeiriou, Stefanos ' + 'and Pantic, Maja', + title='300 faces in-the-wild challenge: ' + 'Database and results', + container='Image and vision computing', + year='2016', + homepage='https://ibug.doc.ic.ac.uk/resources/300-W/', + ), + keypoint_info={ + 0: + dict( + name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-16'), + 1: + dict( + name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-15'), + 2: + dict( + name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-14'), + 3: + dict( + name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-13'), + 4: + dict( + name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-12'), + 5: + dict( + name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-11'), + 6: + dict( + name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-10'), + 7: + dict(name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-9'), + 8: + dict(name='kpt-8', id=8, color=[255, 255, 255], type='', swap=''), + 9: + dict(name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-7'), + 10: + dict( + name='kpt-10', id=10, color=[255, 255, 255], type='', + swap='kpt-6'), + 11: + dict( + name='kpt-11', id=11,
color=[255, 255, 255], type='', + swap='kpt-5'), + 12: + dict( + name='kpt-12', id=12, color=[255, 255, 255], type='', + swap='kpt-4'), + 13: + dict( + name='kpt-13', id=13, color=[255, 255, 255], type='', + swap='kpt-3'), + 14: + dict( + name='kpt-14', id=14, color=[255, 255, 255], type='', + swap='kpt-2'), + 15: + dict( + name='kpt-15', id=15, color=[255, 255, 255], type='', + swap='kpt-1'), + 16: + dict( + name='kpt-16', id=16, color=[255, 255, 255], type='', + swap='kpt-0'), + 17: + dict( + name='kpt-17', + id=17, + color=[255, 255, 255], + type='', + swap='kpt-26'), + 18: + dict( + name='kpt-18', + id=18, + color=[255, 255, 255], + type='', + swap='kpt-25'), + 19: + dict( + name='kpt-19', + id=19, + color=[255, 255, 255], + type='', + swap='kpt-24'), + 20: + dict( + name='kpt-20', + id=20, + color=[255, 255, 255], + type='', + swap='kpt-23'), + 21: + dict( + name='kpt-21', + id=21, + color=[255, 255, 255], + type='', + swap='kpt-22'), + 22: + dict( + name='kpt-22', + id=22, + color=[255, 255, 255], + type='', + swap='kpt-21'), + 23: + dict( + name='kpt-23', + id=23, + color=[255, 255, 255], + type='', + swap='kpt-20'), + 24: + dict( + name='kpt-24', + id=24, + color=[255, 255, 255], + type='', + swap='kpt-19'), + 25: + dict( + name='kpt-25', + id=25, + color=[255, 255, 255], + type='', + swap='kpt-18'), + 26: + dict( + name='kpt-26', + id=26, + color=[255, 255, 255], + type='', + swap='kpt-17'), + 27: + dict(name='kpt-27', id=27, color=[255, 255, 255], type='', swap=''), + 28: + dict(name='kpt-28', id=28, color=[255, 255, 255], type='', swap=''), + 29: + dict(name='kpt-29', id=29, color=[255, 255, 255], type='', swap=''), + 30: + dict(name='kpt-30', id=30, color=[255, 255, 255], type='', swap=''), + 31: + dict( + name='kpt-31', + id=31, + color=[255, 255, 255], + type='', + swap='kpt-35'), + 32: + dict( + name='kpt-32', + id=32, + color=[255, 255, 255], + type='', + swap='kpt-34'), + 33: + dict(name='kpt-33', id=33, color=[255, 255, 255], type='', swap=''), + 34: + dict( + name='kpt-34', + id=34, + color=[255, 255, 255], + type='', + swap='kpt-32'), + 35: + dict( + name='kpt-35', + id=35, + color=[255, 255, 255], + type='', + swap='kpt-31'), + 36: + dict( + name='kpt-36', + id=36, + color=[255, 255, 255], + type='', + swap='kpt-45'), + 37: + dict( + name='kpt-37', + id=37, + color=[255, 255, 255], + type='', + swap='kpt-44'), + 38: + dict( + name='kpt-38', + id=38, + color=[255, 255, 255], + type='', + swap='kpt-43'), + 39: + dict( + name='kpt-39', + id=39, + color=[255, 255, 255], + type='', + swap='kpt-42'), + 40: + dict( + name='kpt-40', + id=40, + color=[255, 255, 255], + type='', + swap='kpt-47'), + 41: + dict( + name='kpt-41', + id=41, + color=[255, 255, 255], + type='', + swap='kpt-46'), + 42: + dict( + name='kpt-42', + id=42, + color=[255, 255, 255], + type='', + swap='kpt-39'), + 43: + dict( + name='kpt-43', + id=43, + color=[255, 255, 255], + type='', + swap='kpt-38'), + 44: + dict( + name='kpt-44', + id=44, + color=[255, 255, 255], + type='', + swap='kpt-37'), + 45: + dict( + name='kpt-45', + id=45, + color=[255, 255, 255], + type='', + swap='kpt-36'), + 46: + dict( + name='kpt-46', + id=46, + color=[255, 255, 255], + type='', + swap='kpt-41'), + 47: + dict( + name='kpt-47', + id=47, + color=[255, 255, 255], + type='', + swap='kpt-40'), + 48: + dict( + name='kpt-48', + id=48, + color=[255, 255, 255], + type='', + swap='kpt-54'), + 49: + dict( + name='kpt-49', + id=49, + color=[255, 255, 255], + type='', + swap='kpt-53'), + 50: + dict( + name='kpt-50', + id=50, + color=[255, 
255, 255], + type='', + swap='kpt-52'), + 51: + dict(name='kpt-51', id=51, color=[255, 255, 255], type='', swap=''), + 52: + dict( + name='kpt-52', + id=52, + color=[255, 255, 255], + type='', + swap='kpt-50'), + 53: + dict( + name='kpt-53', + id=53, + color=[255, 255, 255], + type='', + swap='kpt-49'), + 54: + dict( + name='kpt-54', + id=54, + color=[255, 255, 255], + type='', + swap='kpt-48'), + 55: + dict( + name='kpt-55', + id=55, + color=[255, 255, 255], + type='', + swap='kpt-59'), + 56: + dict( + name='kpt-56', + id=56, + color=[255, 255, 255], + type='', + swap='kpt-58'), + 57: + dict(name='kpt-57', id=57, color=[255, 255, 255], type='', swap=''), + 58: + dict( + name='kpt-58', + id=58, + color=[255, 255, 255], + type='', + swap='kpt-56'), + 59: + dict( + name='kpt-59', + id=59, + color=[255, 255, 255], + type='', + swap='kpt-55'), + 60: + dict( + name='kpt-60', + id=60, + color=[255, 255, 255], + type='', + swap='kpt-64'), + 61: + dict( + name='kpt-61', + id=61, + color=[255, 255, 255], + type='', + swap='kpt-63'), + 62: + dict(name='kpt-62', id=62, color=[255, 255, 255], type='', swap=''), + 63: + dict( + name='kpt-63', + id=63, + color=[255, 255, 255], + type='', + swap='kpt-61'), + 64: + dict( + name='kpt-64', + id=64, + color=[255, 255, 255], + type='', + swap='kpt-60'), + 65: + dict( + name='kpt-65', + id=65, + color=[255, 255, 255], + type='', + swap='kpt-67'), + 66: + dict(name='kpt-66', id=66, color=[255, 255, 255], type='', swap=''), + 67: + dict( + name='kpt-67', + id=67, + color=[255, 255, 255], + type='', + swap='kpt-65'), + }, + skeleton_info={}, + joint_weights=[1.] * 68, + sigmas=[]) diff --git a/configs/_base_/datasets/aflw.py b/configs/_base_/datasets/aflw.py new file mode 100644 index 0000000..bf534cb --- /dev/null +++ b/configs/_base_/datasets/aflw.py @@ -0,0 +1,83 @@ +dataset_info = dict( + dataset_name='aflw', + paper_info=dict( + author='Koestinger, Martin and Wohlhart, Paul and ' + 'Roth, Peter M and Bischof, Horst', + title='Annotated facial landmarks in the wild: ' + 'A large-scale, real-world database for facial ' + 'landmark localization', + container='2011 IEEE international conference on computer ' + 'vision workshops (ICCV workshops)', + year='2011', + homepage='https://www.tugraz.at/institute/icg/research/' + 'team-bischof/lrs/downloads/aflw/', + ), + keypoint_info={ + 0: + dict(name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-5'), + 1: + dict(name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-4'), + 2: + dict(name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-3'), + 3: + dict(name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-2'), + 4: + dict(name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-1'), + 5: + dict(name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-0'), + 6: + dict( + name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-11'), + 7: + dict( + name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-10'), + 8: + dict(name='kpt-8', id=8, color=[255, 255, 255], type='', swap='kpt-9'), + 9: + dict(name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-8'), + 10: + dict( + name='kpt-10', id=10, color=[255, 255, 255], type='', + swap='kpt-7'), + 11: + dict( + name='kpt-11', id=11, color=[255, 255, 255], type='', + swap='kpt-6'), + 12: + dict( + name='kpt-12', + id=12, + color=[255, 255, 255], + type='', + swap='kpt-14'), + 13: + dict(name='kpt-13', id=13, color=[255, 255, 255], type='', swap=''), + 14: + dict( + name='kpt-14', + id=14, + color=[255, 255, 
255], + type='', + swap='kpt-12'), + 15: + dict( + name='kpt-15', + id=15, + color=[255, 255, 255], + type='', + swap='kpt-17'), + 16: + dict(name='kpt-16', id=16, color=[255, 255, 255], type='', swap=''), + 17: + dict( + name='kpt-17', + id=17, + color=[255, 255, 255], + type='', + swap='kpt-15'), + 18: + dict(name='kpt-18', id=18, color=[255, 255, 255], type='', swap='') + }, + skeleton_info={}, + joint_weights=[1.] * 19, + sigmas=[]) diff --git a/configs/_base_/datasets/aic.py b/configs/_base_/datasets/aic.py new file mode 100644 index 0000000..9ecdbe3 --- /dev/null +++ b/configs/_base_/datasets/aic.py @@ -0,0 +1,140 @@ +dataset_info = dict( + dataset_name='aic', + paper_info=dict( + author='Wu, Jiahong and Zheng, He and Zhao, Bo and ' + 'Li, Yixin and Yan, Baoming and Liang, Rui and ' + 'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and ' + 'Fu, Yanwei and others', + title='Ai challenger: A large-scale dataset for going ' + 'deeper in image understanding', + container='arXiv', + year='2017', + homepage='https://github.com/AIChallenger/AI_Challenger_2017', + ), + keypoint_info={ + 0: + dict( + name='right_shoulder', + id=0, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 1: + dict( + name='right_elbow', + id=1, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 2: + dict( + name='right_wrist', + id=2, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 3: + dict( + name='left_shoulder', + id=3, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 4: + dict( + name='left_elbow', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 5: + dict( + name='left_wrist', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 6: + dict( + name='right_hip', + id=6, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 7: + dict( + name='right_knee', + id=7, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 8: + dict( + name='right_ankle', + id=8, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 9: + dict( + name='left_hip', + id=9, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 10: + dict( + name='left_knee', + id=10, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 11: + dict( + name='left_ankle', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 12: + dict( + name='head_top', + id=12, + color=[51, 153, 255], + type='upper', + swap=''), + 13: + dict(name='neck', id=13, color=[51, 153, 255], type='upper', swap='') + }, + skeleton_info={ + 0: + dict(link=('right_wrist', 'right_elbow'), id=0, color=[255, 128, 0]), + 1: dict( + link=('right_elbow', 'right_shoulder'), id=1, color=[255, 128, 0]), + 2: dict(link=('right_shoulder', 'neck'), id=2, color=[51, 153, 255]), + 3: dict(link=('neck', 'left_shoulder'), id=3, color=[51, 153, 255]), + 4: dict(link=('left_shoulder', 'left_elbow'), id=4, color=[0, 255, 0]), + 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]), + 6: dict(link=('right_ankle', 'right_knee'), id=6, color=[255, 128, 0]), + 7: dict(link=('right_knee', 'right_hip'), id=7, color=[255, 128, 0]), + 8: dict(link=('right_hip', 'left_hip'), id=8, color=[51, 153, 255]), + 9: dict(link=('left_hip', 'left_knee'), id=9, color=[0, 255, 0]), + 10: dict(link=('left_knee', 'left_ankle'), id=10, color=[0, 255, 0]), + 11: dict(link=('head_top', 'neck'), id=11, color=[51, 153, 255]), + 12: dict( + link=('right_shoulder', 'right_hip'), id=12, color=[51, 153, 255]), + 13: + dict(link=('left_shoulder', 'left_hip'), id=13, color=[51, 
153, 255]) + }, + joint_weights=[ + 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1. + ], + + # 'https://github.com/AIChallenger/AI_Challenger_2017/blob/master/' + # 'Evaluation/keypoint_eval/keypoint_eval.py#L50' + # delta = 2 x sigma + sigmas=[ + 0.01388152, 0.01515228, 0.01057665, 0.01417709, 0.01497891, 0.01402144, + 0.03909642, 0.03686941, 0.01981803, 0.03843971, 0.03412318, 0.02415081, + 0.01291456, 0.01236173 + ]) diff --git a/configs/_base_/datasets/aic_info.py b/configs/_base_/datasets/aic_info.py new file mode 100644 index 0000000..f143fd8 --- /dev/null +++ b/configs/_base_/datasets/aic_info.py @@ -0,0 +1,140 @@ +aic_info = dict( + dataset_name='aic', + paper_info=dict( + author='Wu, Jiahong and Zheng, He and Zhao, Bo and ' + 'Li, Yixin and Yan, Baoming and Liang, Rui and ' + 'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and ' + 'Fu, Yanwei and others', + title='Ai challenger: A large-scale dataset for going ' + 'deeper in image understanding', + container='arXiv', + year='2017', + homepage='https://github.com/AIChallenger/AI_Challenger_2017', + ), + keypoint_info={ + 0: + dict( + name='right_shoulder', + id=0, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 1: + dict( + name='right_elbow', + id=1, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 2: + dict( + name='right_wrist', + id=2, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 3: + dict( + name='left_shoulder', + id=3, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 4: + dict( + name='left_elbow', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 5: + dict( + name='left_wrist', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 6: + dict( + name='right_hip', + id=6, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 7: + dict( + name='right_knee', + id=7, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 8: + dict( + name='right_ankle', + id=8, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 9: + dict( + name='left_hip', + id=9, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 10: + dict( + name='left_knee', + id=10, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 11: + dict( + name='left_ankle', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 12: + dict( + name='head_top', + id=12, + color=[51, 153, 255], + type='upper', + swap=''), + 13: + dict(name='neck', id=13, color=[51, 153, 255], type='upper', swap='') + }, + skeleton_info={ + 0: + dict(link=('right_wrist', 'right_elbow'), id=0, color=[255, 128, 0]), + 1: dict( + link=('right_elbow', 'right_shoulder'), id=1, color=[255, 128, 0]), + 2: dict(link=('right_shoulder', 'neck'), id=2, color=[51, 153, 255]), + 3: dict(link=('neck', 'left_shoulder'), id=3, color=[51, 153, 255]), + 4: dict(link=('left_shoulder', 'left_elbow'), id=4, color=[0, 255, 0]), + 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]), + 6: dict(link=('right_ankle', 'right_knee'), id=6, color=[255, 128, 0]), + 7: dict(link=('right_knee', 'right_hip'), id=7, color=[255, 128, 0]), + 8: dict(link=('right_hip', 'left_hip'), id=8, color=[51, 153, 255]), + 9: dict(link=('left_hip', 'left_knee'), id=9, color=[0, 255, 0]), + 10: dict(link=('left_knee', 'left_ankle'), id=10, color=[0, 255, 0]), + 11: dict(link=('head_top', 'neck'), id=11, color=[51, 153, 255]), + 12: dict( + link=('right_shoulder', 'right_hip'), id=12, color=[51, 153, 255]), + 13: + dict(link=('left_shoulder', 'left_hip'), id=13, 
color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1. + ], + + # 'https://github.com/AIChallenger/AI_Challenger_2017/blob/master/' + # 'Evaluation/keypoint_eval/keypoint_eval.py#L50' + # delta = 2 x sigma + sigmas=[ + 0.01388152, 0.01515228, 0.01057665, 0.01417709, 0.01497891, 0.01402144, + 0.03909642, 0.03686941, 0.01981803, 0.03843971, 0.03412318, 0.02415081, + 0.01291456, 0.01236173 + ]) diff --git a/configs/_base_/datasets/animalpose.py b/configs/_base_/datasets/animalpose.py new file mode 100644 index 0000000..d5bb62d --- /dev/null +++ b/configs/_base_/datasets/animalpose.py @@ -0,0 +1,166 @@ +dataset_info = dict( + dataset_name='animalpose', + paper_info=dict( + author='Cao, Jinkun and Tang, Hongyang and Fang, Hao-Shu and ' + 'Shen, Xiaoyong and Lu, Cewu and Tai, Yu-Wing', + title='Cross-Domain Adaptation for Animal Pose Estimation', + container='The IEEE International Conference on ' + 'Computer Vision (ICCV)', + year='2019', + homepage='https://sites.google.com/view/animal-pose/', + ), + keypoint_info={ + 0: + dict( + name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'), + 1: + dict( + name='R_Eye', + id=1, + color=[255, 128, 0], + type='upper', + swap='L_Eye'), + 2: + dict( + name='L_EarBase', + id=2, + color=[0, 255, 0], + type='upper', + swap='R_EarBase'), + 3: + dict( + name='R_EarBase', + id=3, + color=[255, 128, 0], + type='upper', + swap='L_EarBase'), + 4: + dict(name='Nose', id=4, color=[51, 153, 255], type='upper', swap=''), + 5: + dict(name='Throat', id=5, color=[51, 153, 255], type='upper', swap=''), + 6: + dict( + name='TailBase', id=6, color=[51, 153, 255], type='lower', + swap=''), + 7: + dict( + name='Withers', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict( + name='L_F_Elbow', + id=8, + color=[0, 255, 0], + type='upper', + swap='R_F_Elbow'), + 9: + dict( + name='R_F_Elbow', + id=9, + color=[255, 128, 0], + type='upper', + swap='L_F_Elbow'), + 10: + dict( + name='L_B_Elbow', + id=10, + color=[0, 255, 0], + type='lower', + swap='R_B_Elbow'), + 11: + dict( + name='R_B_Elbow', + id=11, + color=[255, 128, 0], + type='lower', + swap='L_B_Elbow'), + 12: + dict( + name='L_F_Knee', + id=12, + color=[0, 255, 0], + type='upper', + swap='R_F_Knee'), + 13: + dict( + name='R_F_Knee', + id=13, + color=[255, 128, 0], + type='upper', + swap='L_F_Knee'), + 14: + dict( + name='L_B_Knee', + id=14, + color=[0, 255, 0], + type='lower', + swap='R_B_Knee'), + 15: + dict( + name='R_B_Knee', + id=15, + color=[255, 128, 0], + type='lower', + swap='L_B_Knee'), + 16: + dict( + name='L_F_Paw', + id=16, + color=[0, 255, 0], + type='upper', + swap='R_F_Paw'), + 17: + dict( + name='R_F_Paw', + id=17, + color=[255, 128, 0], + type='upper', + swap='L_F_Paw'), + 18: + dict( + name='L_B_Paw', + id=18, + color=[0, 255, 0], + type='lower', + swap='R_B_Paw'), + 19: + dict( + name='R_B_Paw', + id=19, + color=[255, 128, 0], + type='lower', + swap='L_B_Paw') + }, + skeleton_info={ + 0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[51, 153, 255]), + 1: dict(link=('L_Eye', 'L_EarBase'), id=1, color=[0, 255, 0]), + 2: dict(link=('R_Eye', 'R_EarBase'), id=2, color=[255, 128, 0]), + 3: dict(link=('L_Eye', 'Nose'), id=3, color=[0, 255, 0]), + 4: dict(link=('R_Eye', 'Nose'), id=4, color=[255, 128, 0]), + 5: dict(link=('Nose', 'Throat'), id=5, color=[51, 153, 255]), + 6: dict(link=('Throat', 'Withers'), id=6, color=[51, 153, 255]), + 7: dict(link=('TailBase', 'Withers'), id=7, color=[51, 153, 255]), + 8: dict(link=('Throat', 
'L_F_Elbow'), id=8, color=[0, 255, 0]), + 9: dict(link=('L_F_Elbow', 'L_F_Knee'), id=9, color=[0, 255, 0]), + 10: dict(link=('L_F_Knee', 'L_F_Paw'), id=10, color=[0, 255, 0]), + 11: dict(link=('Throat', 'R_F_Elbow'), id=11, color=[255, 128, 0]), + 12: dict(link=('R_F_Elbow', 'R_F_Knee'), id=12, color=[255, 128, 0]), + 13: dict(link=('R_F_Knee', 'R_F_Paw'), id=13, color=[255, 128, 0]), + 14: dict(link=('TailBase', 'L_B_Elbow'), id=14, color=[0, 255, 0]), + 15: dict(link=('L_B_Elbow', 'L_B_Knee'), id=15, color=[0, 255, 0]), + 16: dict(link=('L_B_Knee', 'L_B_Paw'), id=16, color=[0, 255, 0]), + 17: dict(link=('TailBase', 'R_B_Elbow'), id=17, color=[255, 128, 0]), + 18: dict(link=('R_B_Elbow', 'R_B_Knee'), id=18, color=[255, 128, 0]), + 19: dict(link=('R_B_Knee', 'R_B_Paw'), id=19, color=[255, 128, 0]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.2, 1.2, + 1.5, 1.5, 1.5, 1.5 + ], + + # Note: The original paper did not provide enough information about + # the sigmas. We modified from 'https://github.com/cocodataset/' + # 'cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py#L523' + sigmas=[ + 0.025, 0.025, 0.026, 0.035, 0.035, 0.10, 0.10, 0.10, 0.107, 0.107, + 0.107, 0.107, 0.087, 0.087, 0.087, 0.087, 0.089, 0.089, 0.089, 0.089 + ]) diff --git a/configs/_base_/datasets/ap10k.py b/configs/_base_/datasets/ap10k.py new file mode 100644 index 0000000..c0df579 --- /dev/null +++ b/configs/_base_/datasets/ap10k.py @@ -0,0 +1,142 @@ +dataset_info = dict( + dataset_name='ap10k', + paper_info=dict( + author='Yu, Hang and Xu, Yufei and Zhang, Jing and ' + 'Zhao, Wei and Guan, Ziyu and Tao, Dacheng', + title='AP-10K: A Benchmark for Animal Pose Estimation in the Wild', + container='35th Conference on Neural Information Processing Systems ' + '(NeurIPS 2021) Track on Datasets and Bench-marks.', + year='2021', + homepage='https://github.com/AlexTheBad/AP-10K', + ), + keypoint_info={ + 0: + dict( + name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'), + 1: + dict( + name='R_Eye', + id=1, + color=[255, 128, 0], + type='upper', + swap='L_Eye'), + 2: + dict(name='Nose', id=2, color=[51, 153, 255], type='upper', swap=''), + 3: + dict(name='Neck', id=3, color=[51, 153, 255], type='upper', swap=''), + 4: + dict( + name='Root of tail', + id=4, + color=[51, 153, 255], + type='lower', + swap=''), + 5: + dict( + name='L_Shoulder', + id=5, + color=[51, 153, 255], + type='upper', + swap='R_Shoulder'), + 6: + dict( + name='L_Elbow', + id=6, + color=[51, 153, 255], + type='upper', + swap='R_Elbow'), + 7: + dict( + name='L_F_Paw', + id=7, + color=[0, 255, 0], + type='upper', + swap='R_F_Paw'), + 8: + dict( + name='R_Shoulder', + id=8, + color=[0, 255, 0], + type='upper', + swap='L_Shoulder'), + 9: + dict( + name='R_Elbow', + id=9, + color=[255, 128, 0], + type='upper', + swap='L_Elbow'), + 10: + dict( + name='R_F_Paw', + id=10, + color=[0, 255, 0], + type='lower', + swap='L_F_Paw'), + 11: + dict( + name='L_Hip', + id=11, + color=[255, 128, 0], + type='lower', + swap='R_Hip'), + 12: + dict( + name='L_Knee', + id=12, + color=[255, 128, 0], + type='lower', + swap='R_Knee'), + 13: + dict( + name='L_B_Paw', + id=13, + color=[0, 255, 0], + type='lower', + swap='R_B_Paw'), + 14: + dict( + name='R_Hip', id=14, color=[0, 255, 0], type='lower', + swap='L_Hip'), + 15: + dict( + name='R_Knee', + id=15, + color=[0, 255, 0], + type='lower', + swap='L_Knee'), + 16: + dict( + name='R_B_Paw', + id=16, + color=[0, 255, 0], + type='lower', + swap='L_B_Paw'), + }, + skeleton_info={ + 0: 
dict(link=('L_Eye', 'R_Eye'), id=0, color=[0, 0, 255]), + 1: dict(link=('L_Eye', 'Nose'), id=1, color=[0, 0, 255]), + 2: dict(link=('R_Eye', 'Nose'), id=2, color=[0, 0, 255]), + 3: dict(link=('Nose', 'Neck'), id=3, color=[0, 255, 0]), + 4: dict(link=('Neck', 'Root of tail'), id=4, color=[0, 255, 0]), + 5: dict(link=('Neck', 'L_Shoulder'), id=5, color=[0, 255, 255]), + 6: dict(link=('L_Shoulder', 'L_Elbow'), id=6, color=[0, 255, 255]), + 7: dict(link=('L_Elbow', 'L_F_Paw'), id=6, color=[0, 255, 255]), + 8: dict(link=('Neck', 'R_Shoulder'), id=7, color=[6, 156, 250]), + 9: dict(link=('R_Shoulder', 'R_Elbow'), id=8, color=[6, 156, 250]), + 10: dict(link=('R_Elbow', 'R_F_Paw'), id=9, color=[6, 156, 250]), + 11: dict(link=('Root of tail', 'L_Hip'), id=10, color=[0, 255, 255]), + 12: dict(link=('L_Hip', 'L_Knee'), id=11, color=[0, 255, 255]), + 13: dict(link=('L_Knee', 'L_B_Paw'), id=12, color=[0, 255, 255]), + 14: dict(link=('Root of tail', 'R_Hip'), id=13, color=[6, 156, 250]), + 15: dict(link=('R_Hip', 'R_Knee'), id=14, color=[6, 156, 250]), + 16: dict(link=('R_Knee', 'R_B_Paw'), id=15, color=[6, 156, 250]), + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.025, 0.025, 0.026, 0.035, 0.035, 0.079, 0.072, 0.062, 0.079, 0.072, + 0.062, 0.107, 0.087, 0.089, 0.107, 0.087, 0.089 + ]) diff --git a/configs/_base_/datasets/ap10k_info.py b/configs/_base_/datasets/ap10k_info.py new file mode 100644 index 0000000..af2461c --- /dev/null +++ b/configs/_base_/datasets/ap10k_info.py @@ -0,0 +1,142 @@ +ap10k_info = dict( + dataset_name='ap10k', + paper_info=dict( + author='Yu, Hang and Xu, Yufei and Zhang, Jing and ' + 'Zhao, Wei and Guan, Ziyu and Tao, Dacheng', + title='AP-10K: A Benchmark for Animal Pose Estimation in the Wild', + container='35th Conference on Neural Information Processing Systems ' + '(NeurIPS 2021) Track on Datasets and Bench-marks.', + year='2021', + homepage='https://github.com/AlexTheBad/AP-10K', + ), + keypoint_info={ + 0: + dict( + name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'), + 1: + dict( + name='R_Eye', + id=1, + color=[255, 128, 0], + type='upper', + swap='L_Eye'), + 2: + dict(name='Nose', id=2, color=[51, 153, 255], type='upper', swap=''), + 3: + dict(name='Neck', id=3, color=[51, 153, 255], type='upper', swap=''), + 4: + dict( + name='Root of tail', + id=4, + color=[51, 153, 255], + type='lower', + swap=''), + 5: + dict( + name='L_Shoulder', + id=5, + color=[51, 153, 255], + type='upper', + swap='R_Shoulder'), + 6: + dict( + name='L_Elbow', + id=6, + color=[51, 153, 255], + type='upper', + swap='R_Elbow'), + 7: + dict( + name='L_F_Paw', + id=7, + color=[0, 255, 0], + type='upper', + swap='R_F_Paw'), + 8: + dict( + name='R_Shoulder', + id=8, + color=[0, 255, 0], + type='upper', + swap='L_Shoulder'), + 9: + dict( + name='R_Elbow', + id=9, + color=[255, 128, 0], + type='upper', + swap='L_Elbow'), + 10: + dict( + name='R_F_Paw', + id=10, + color=[0, 255, 0], + type='lower', + swap='L_F_Paw'), + 11: + dict( + name='L_Hip', + id=11, + color=[255, 128, 0], + type='lower', + swap='R_Hip'), + 12: + dict( + name='L_Knee', + id=12, + color=[255, 128, 0], + type='lower', + swap='R_Knee'), + 13: + dict( + name='L_B_Paw', + id=13, + color=[0, 255, 0], + type='lower', + swap='R_B_Paw'), + 14: + dict( + name='R_Hip', id=14, color=[0, 255, 0], type='lower', + swap='L_Hip'), + 15: + dict( + name='R_Knee', + id=15, + color=[0, 255, 0], + type='lower', + swap='L_Knee'), + 16: + dict( + name='R_B_Paw', + 
id=16, + color=[0, 255, 0], + type='lower', + swap='L_B_Paw'), + }, + skeleton_info={ + 0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[0, 0, 255]), + 1: dict(link=('L_Eye', 'Nose'), id=1, color=[0, 0, 255]), + 2: dict(link=('R_Eye', 'Nose'), id=2, color=[0, 0, 255]), + 3: dict(link=('Nose', 'Neck'), id=3, color=[0, 255, 0]), + 4: dict(link=('Neck', 'Root of tail'), id=4, color=[0, 255, 0]), + 5: dict(link=('Neck', 'L_Shoulder'), id=5, color=[0, 255, 255]), + 6: dict(link=('L_Shoulder', 'L_Elbow'), id=6, color=[0, 255, 255]), + 7: dict(link=('L_Elbow', 'L_F_Paw'), id=6, color=[0, 255, 255]), + 8: dict(link=('Neck', 'R_Shoulder'), id=7, color=[6, 156, 250]), + 9: dict(link=('R_Shoulder', 'R_Elbow'), id=8, color=[6, 156, 250]), + 10: dict(link=('R_Elbow', 'R_F_Paw'), id=9, color=[6, 156, 250]), + 11: dict(link=('Root of tail', 'L_Hip'), id=10, color=[0, 255, 255]), + 12: dict(link=('L_Hip', 'L_Knee'), id=11, color=[0, 255, 255]), + 13: dict(link=('L_Knee', 'L_B_Paw'), id=12, color=[0, 255, 255]), + 14: dict(link=('Root of tail', 'R_Hip'), id=13, color=[6, 156, 250]), + 15: dict(link=('R_Hip', 'R_Knee'), id=14, color=[6, 156, 250]), + 16: dict(link=('R_Knee', 'R_B_Paw'), id=15, color=[6, 156, 250]), + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.025, 0.025, 0.026, 0.035, 0.035, 0.079, 0.072, 0.062, 0.079, 0.072, + 0.062, 0.107, 0.087, 0.089, 0.107, 0.087, 0.089 + ]) diff --git a/configs/_base_/datasets/atrw.py b/configs/_base_/datasets/atrw.py new file mode 100644 index 0000000..7ec71c8 --- /dev/null +++ b/configs/_base_/datasets/atrw.py @@ -0,0 +1,144 @@ +dataset_info = dict( + dataset_name='atrw', + paper_info=dict( + author='Li, Shuyuan and Li, Jianguo and Tang, Hanlin ' + 'and Qian, Rui and Lin, Weiyao', + title='ATRW: A Benchmark for Amur Tiger ' + 'Re-identification in the Wild', + container='Proceedings of the 28th ACM ' + 'International Conference on Multimedia', + year='2020', + homepage='https://cvwc2019.github.io/challenge.html', + ), + keypoint_info={ + 0: + dict( + name='left_ear', + id=0, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 1: + dict( + name='right_ear', + id=1, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 2: + dict(name='nose', id=2, color=[51, 153, 255], type='upper', swap=''), + 3: + dict( + name='right_shoulder', + id=3, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 4: + dict( + name='right_front_paw', + id=4, + color=[255, 128, 0], + type='upper', + swap='left_front_paw'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='left_front_paw', + id=6, + color=[0, 255, 0], + type='upper', + swap='right_front_paw'), + 7: + dict( + name='right_hip', + id=7, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 8: + dict( + name='right_knee', + id=8, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 9: + dict( + name='right_back_paw', + id=9, + color=[255, 128, 0], + type='lower', + swap='left_back_paw'), + 10: + dict( + name='left_hip', + id=10, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 11: + dict( + name='left_knee', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 12: + dict( + name='left_back_paw', + id=12, + color=[0, 255, 0], + type='lower', + swap='right_back_paw'), + 13: + dict(name='tail', id=13, color=[51, 153, 255], type='lower', swap=''), + 14: + dict( + name='center', id=14, color=[51, 153, 255], 
type='lower', swap=''), + }, + skeleton_info={ + 0: + dict(link=('left_ear', 'nose'), id=0, color=[51, 153, 255]), + 1: + dict(link=('right_ear', 'nose'), id=1, color=[51, 153, 255]), + 2: + dict(link=('nose', 'center'), id=2, color=[51, 153, 255]), + 3: + dict( + link=('left_shoulder', 'left_front_paw'), id=3, color=[0, 255, 0]), + 4: + dict(link=('left_shoulder', 'center'), id=4, color=[0, 255, 0]), + 5: + dict( + link=('right_shoulder', 'right_front_paw'), + id=5, + color=[255, 128, 0]), + 6: + dict(link=('right_shoulder', 'center'), id=6, color=[255, 128, 0]), + 7: + dict(link=('tail', 'center'), id=7, color=[51, 153, 255]), + 8: + dict(link=('right_back_paw', 'right_knee'), id=8, color=[255, 128, 0]), + 9: + dict(link=('right_knee', 'right_hip'), id=9, color=[255, 128, 0]), + 10: + dict(link=('right_hip', 'tail'), id=10, color=[255, 128, 0]), + 11: + dict(link=('left_back_paw', 'left_knee'), id=11, color=[0, 255, 0]), + 12: + dict(link=('left_knee', 'left_hip'), id=12, color=[0, 255, 0]), + 13: + dict(link=('left_hip', 'tail'), id=13, color=[0, 255, 0]), + }, + joint_weights=[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], + sigmas=[ + 0.0277, 0.0823, 0.0831, 0.0202, 0.0716, 0.0263, 0.0646, 0.0302, 0.0440, + 0.0316, 0.0333, 0.0547, 0.0263, 0.0683, 0.0539 + ]) diff --git a/configs/_base_/datasets/coco.py b/configs/_base_/datasets/coco.py new file mode 100644 index 0000000..865a95b --- /dev/null +++ b/configs/_base_/datasets/coco.py @@ -0,0 +1,181 @@ +dataset_info = dict( + dataset_name='coco', + paper_info=dict( + author='Lin, Tsung-Yi and Maire, Michael and ' + 'Belongie, Serge and Hays, James and ' + 'Perona, Pietro and Ramanan, Deva and ' + r'Doll{\'a}r, Piotr and Zitnick, C Lawrence', + title='Microsoft coco: Common objects in context', + container='European conference on computer vision', + year='2014', + homepage='http://cocodataset.org/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + 
dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/configs/_base_/datasets/coco_wholebody.py b/configs/_base_/datasets/coco_wholebody.py new file mode 100644 index 0000000..ef9b707 --- /dev/null +++ b/configs/_base_/datasets/coco_wholebody.py @@ -0,0 +1,1154 @@ +dataset_info = dict( + dataset_name='coco_wholebody', + paper_info=dict( + author='Jin, Sheng and Xu, Lumin and Xu, Jin and ' + 'Wang, Can and Liu, Wentao and ' + 'Qian, Chen and Ouyang, Wanli and Luo, Ping', + title='Whole-Body Human Pose Estimation in the Wild', + container='Proceedings of the European ' + 'Conference on Computer Vision (ECCV)', + year='2020', + homepage='https://github.com/jin-s13/COCO-WholeBody/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 
128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict( + name='left_big_toe', + id=17, + color=[255, 128, 0], + type='lower', + swap='right_big_toe'), + 18: + dict( + name='left_small_toe', + id=18, + color=[255, 128, 0], + type='lower', + swap='right_small_toe'), + 19: + dict( + name='left_heel', + id=19, + color=[255, 128, 0], + type='lower', + swap='right_heel'), + 20: + dict( + name='right_big_toe', + id=20, + color=[255, 128, 0], + type='lower', + swap='left_big_toe'), + 21: + dict( + name='right_small_toe', + id=21, + color=[255, 128, 0], + type='lower', + swap='left_small_toe'), + 22: + dict( + name='right_heel', + id=22, + color=[255, 128, 0], + type='lower', + swap='left_heel'), + 23: + dict( + name='face-0', + id=23, + color=[255, 255, 255], + type='', + swap='face-16'), + 24: + dict( + name='face-1', + id=24, + color=[255, 255, 255], + type='', + swap='face-15'), + 25: + dict( + name='face-2', + id=25, + color=[255, 255, 255], + type='', + swap='face-14'), + 26: + dict( + name='face-3', + id=26, + color=[255, 255, 255], + type='', + swap='face-13'), + 27: + dict( + name='face-4', + id=27, + color=[255, 255, 255], + type='', + swap='face-12'), + 28: + dict( + name='face-5', + id=28, + color=[255, 255, 255], + type='', + swap='face-11'), + 29: + dict( + name='face-6', + id=29, + color=[255, 255, 255], + type='', + swap='face-10'), + 30: + dict( + name='face-7', + id=30, + color=[255, 255, 255], + type='', + swap='face-9'), + 31: + dict(name='face-8', id=31, color=[255, 255, 255], type='', swap=''), + 32: + dict( + name='face-9', + id=32, + color=[255, 255, 255], + type='', + swap='face-7'), + 33: + dict( + name='face-10', + id=33, + color=[255, 255, 255], + type='', + swap='face-6'), + 34: + dict( + name='face-11', + id=34, + color=[255, 255, 255], + type='', + swap='face-5'), + 35: + dict( + name='face-12', + id=35, + color=[255, 255, 255], + type='', + swap='face-4'), + 36: + dict( + name='face-13', + id=36, + color=[255, 255, 255], + type='', + swap='face-3'), + 37: + dict( + name='face-14', + id=37, + color=[255, 255, 255], + type='', + swap='face-2'), + 38: + dict( + name='face-15', + id=38, + color=[255, 255, 255], + type='', + swap='face-1'), + 39: + dict( + name='face-16', + id=39, + color=[255, 255, 255], + type='', + swap='face-0'), + 40: + dict( + name='face-17', + id=40, + color=[255, 255, 255], + type='', + swap='face-26'), + 41: + dict( + name='face-18', + id=41, + color=[255, 255, 255], + type='', + swap='face-25'), + 42: + dict( + name='face-19', + id=42, + color=[255, 255, 255], + type='', + swap='face-24'), + 43: + dict( + name='face-20', + id=43, + color=[255, 255, 255], + type='', + swap='face-23'), + 44: + dict( + name='face-21', + id=44, + color=[255, 255, 255], + type='', + swap='face-22'), + 45: + dict( + name='face-22', + id=45, + color=[255, 255, 255], + type='', + swap='face-21'), + 46: + dict( + name='face-23', + id=46, + color=[255, 255, 
255], + type='', + swap='face-20'), + 47: + dict( + name='face-24', + id=47, + color=[255, 255, 255], + type='', + swap='face-19'), + 48: + dict( + name='face-25', + id=48, + color=[255, 255, 255], + type='', + swap='face-18'), + 49: + dict( + name='face-26', + id=49, + color=[255, 255, 255], + type='', + swap='face-17'), + 50: + dict(name='face-27', id=50, color=[255, 255, 255], type='', swap=''), + 51: + dict(name='face-28', id=51, color=[255, 255, 255], type='', swap=''), + 52: + dict(name='face-29', id=52, color=[255, 255, 255], type='', swap=''), + 53: + dict(name='face-30', id=53, color=[255, 255, 255], type='', swap=''), + 54: + dict( + name='face-31', + id=54, + color=[255, 255, 255], + type='', + swap='face-35'), + 55: + dict( + name='face-32', + id=55, + color=[255, 255, 255], + type='', + swap='face-34'), + 56: + dict(name='face-33', id=56, color=[255, 255, 255], type='', swap=''), + 57: + dict( + name='face-34', + id=57, + color=[255, 255, 255], + type='', + swap='face-32'), + 58: + dict( + name='face-35', + id=58, + color=[255, 255, 255], + type='', + swap='face-31'), + 59: + dict( + name='face-36', + id=59, + color=[255, 255, 255], + type='', + swap='face-45'), + 60: + dict( + name='face-37', + id=60, + color=[255, 255, 255], + type='', + swap='face-44'), + 61: + dict( + name='face-38', + id=61, + color=[255, 255, 255], + type='', + swap='face-43'), + 62: + dict( + name='face-39', + id=62, + color=[255, 255, 255], + type='', + swap='face-42'), + 63: + dict( + name='face-40', + id=63, + color=[255, 255, 255], + type='', + swap='face-47'), + 64: + dict( + name='face-41', + id=64, + color=[255, 255, 255], + type='', + swap='face-46'), + 65: + dict( + name='face-42', + id=65, + color=[255, 255, 255], + type='', + swap='face-39'), + 66: + dict( + name='face-43', + id=66, + color=[255, 255, 255], + type='', + swap='face-38'), + 67: + dict( + name='face-44', + id=67, + color=[255, 255, 255], + type='', + swap='face-37'), + 68: + dict( + name='face-45', + id=68, + color=[255, 255, 255], + type='', + swap='face-36'), + 69: + dict( + name='face-46', + id=69, + color=[255, 255, 255], + type='', + swap='face-41'), + 70: + dict( + name='face-47', + id=70, + color=[255, 255, 255], + type='', + swap='face-40'), + 71: + dict( + name='face-48', + id=71, + color=[255, 255, 255], + type='', + swap='face-54'), + 72: + dict( + name='face-49', + id=72, + color=[255, 255, 255], + type='', + swap='face-53'), + 73: + dict( + name='face-50', + id=73, + color=[255, 255, 255], + type='', + swap='face-52'), + 74: + dict(name='face-51', id=74, color=[255, 255, 255], type='', swap=''), + 75: + dict( + name='face-52', + id=75, + color=[255, 255, 255], + type='', + swap='face-50'), + 76: + dict( + name='face-53', + id=76, + color=[255, 255, 255], + type='', + swap='face-49'), + 77: + dict( + name='face-54', + id=77, + color=[255, 255, 255], + type='', + swap='face-48'), + 78: + dict( + name='face-55', + id=78, + color=[255, 255, 255], + type='', + swap='face-59'), + 79: + dict( + name='face-56', + id=79, + color=[255, 255, 255], + type='', + swap='face-58'), + 80: + dict(name='face-57', id=80, color=[255, 255, 255], type='', swap=''), + 81: + dict( + name='face-58', + id=81, + color=[255, 255, 255], + type='', + swap='face-56'), + 82: + dict( + name='face-59', + id=82, + color=[255, 255, 255], + type='', + swap='face-55'), + 83: + dict( + name='face-60', + id=83, + color=[255, 255, 255], + type='', + swap='face-64'), + 84: + dict( + name='face-61', + id=84, + color=[255, 255, 255], + type='', + 
swap='face-63'), + 85: + dict(name='face-62', id=85, color=[255, 255, 255], type='', swap=''), + 86: + dict( + name='face-63', + id=86, + color=[255, 255, 255], + type='', + swap='face-61'), + 87: + dict( + name='face-64', + id=87, + color=[255, 255, 255], + type='', + swap='face-60'), + 88: + dict( + name='face-65', + id=88, + color=[255, 255, 255], + type='', + swap='face-67'), + 89: + dict(name='face-66', id=89, color=[255, 255, 255], type='', swap=''), + 90: + dict( + name='face-67', + id=90, + color=[255, 255, 255], + type='', + swap='face-65'), + 91: + dict( + name='left_hand_root', + id=91, + color=[255, 255, 255], + type='', + swap='right_hand_root'), + 92: + dict( + name='left_thumb1', + id=92, + color=[255, 128, 0], + type='', + swap='right_thumb1'), + 93: + dict( + name='left_thumb2', + id=93, + color=[255, 128, 0], + type='', + swap='right_thumb2'), + 94: + dict( + name='left_thumb3', + id=94, + color=[255, 128, 0], + type='', + swap='right_thumb3'), + 95: + dict( + name='left_thumb4', + id=95, + color=[255, 128, 0], + type='', + swap='right_thumb4'), + 96: + dict( + name='left_forefinger1', + id=96, + color=[255, 153, 255], + type='', + swap='right_forefinger1'), + 97: + dict( + name='left_forefinger2', + id=97, + color=[255, 153, 255], + type='', + swap='right_forefinger2'), + 98: + dict( + name='left_forefinger3', + id=98, + color=[255, 153, 255], + type='', + swap='right_forefinger3'), + 99: + dict( + name='left_forefinger4', + id=99, + color=[255, 153, 255], + type='', + swap='right_forefinger4'), + 100: + dict( + name='left_middle_finger1', + id=100, + color=[102, 178, 255], + type='', + swap='right_middle_finger1'), + 101: + dict( + name='left_middle_finger2', + id=101, + color=[102, 178, 255], + type='', + swap='right_middle_finger2'), + 102: + dict( + name='left_middle_finger3', + id=102, + color=[102, 178, 255], + type='', + swap='right_middle_finger3'), + 103: + dict( + name='left_middle_finger4', + id=103, + color=[102, 178, 255], + type='', + swap='right_middle_finger4'), + 104: + dict( + name='left_ring_finger1', + id=104, + color=[255, 51, 51], + type='', + swap='right_ring_finger1'), + 105: + dict( + name='left_ring_finger2', + id=105, + color=[255, 51, 51], + type='', + swap='right_ring_finger2'), + 106: + dict( + name='left_ring_finger3', + id=106, + color=[255, 51, 51], + type='', + swap='right_ring_finger3'), + 107: + dict( + name='left_ring_finger4', + id=107, + color=[255, 51, 51], + type='', + swap='right_ring_finger4'), + 108: + dict( + name='left_pinky_finger1', + id=108, + color=[0, 255, 0], + type='', + swap='right_pinky_finger1'), + 109: + dict( + name='left_pinky_finger2', + id=109, + color=[0, 255, 0], + type='', + swap='right_pinky_finger2'), + 110: + dict( + name='left_pinky_finger3', + id=110, + color=[0, 255, 0], + type='', + swap='right_pinky_finger3'), + 111: + dict( + name='left_pinky_finger4', + id=111, + color=[0, 255, 0], + type='', + swap='right_pinky_finger4'), + 112: + dict( + name='right_hand_root', + id=112, + color=[255, 255, 255], + type='', + swap='left_hand_root'), + 113: + dict( + name='right_thumb1', + id=113, + color=[255, 128, 0], + type='', + swap='left_thumb1'), + 114: + dict( + name='right_thumb2', + id=114, + color=[255, 128, 0], + type='', + swap='left_thumb2'), + 115: + dict( + name='right_thumb3', + id=115, + color=[255, 128, 0], + type='', + swap='left_thumb3'), + 116: + dict( + name='right_thumb4', + id=116, + color=[255, 128, 0], + type='', + swap='left_thumb4'), + 117: + dict( + name='right_forefinger1', + id=117, 
+ color=[255, 153, 255], + type='', + swap='left_forefinger1'), + 118: + dict( + name='right_forefinger2', + id=118, + color=[255, 153, 255], + type='', + swap='left_forefinger2'), + 119: + dict( + name='right_forefinger3', + id=119, + color=[255, 153, 255], + type='', + swap='left_forefinger3'), + 120: + dict( + name='right_forefinger4', + id=120, + color=[255, 153, 255], + type='', + swap='left_forefinger4'), + 121: + dict( + name='right_middle_finger1', + id=121, + color=[102, 178, 255], + type='', + swap='left_middle_finger1'), + 122: + dict( + name='right_middle_finger2', + id=122, + color=[102, 178, 255], + type='', + swap='left_middle_finger2'), + 123: + dict( + name='right_middle_finger3', + id=123, + color=[102, 178, 255], + type='', + swap='left_middle_finger3'), + 124: + dict( + name='right_middle_finger4', + id=124, + color=[102, 178, 255], + type='', + swap='left_middle_finger4'), + 125: + dict( + name='right_ring_finger1', + id=125, + color=[255, 51, 51], + type='', + swap='left_ring_finger1'), + 126: + dict( + name='right_ring_finger2', + id=126, + color=[255, 51, 51], + type='', + swap='left_ring_finger2'), + 127: + dict( + name='right_ring_finger3', + id=127, + color=[255, 51, 51], + type='', + swap='left_ring_finger3'), + 128: + dict( + name='right_ring_finger4', + id=128, + color=[255, 51, 51], + type='', + swap='left_ring_finger4'), + 129: + dict( + name='right_pinky_finger1', + id=129, + color=[0, 255, 0], + type='', + swap='left_pinky_finger1'), + 130: + dict( + name='right_pinky_finger2', + id=130, + color=[0, 255, 0], + type='', + swap='left_pinky_finger2'), + 131: + dict( + name='right_pinky_finger3', + id=131, + color=[0, 255, 0], + type='', + swap='left_pinky_finger3'), + 132: + dict( + name='right_pinky_finger4', + id=132, + color=[0, 255, 0], + type='', + swap='left_pinky_finger4') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]), + 19: + dict(link=('left_ankle', 'left_big_toe'), id=19, color=[0, 255, 0]), + 20: + dict(link=('left_ankle', 'left_small_toe'), id=20, color=[0, 255, 0]), + 21: + dict(link=('left_ankle', 'left_heel'), id=21, color=[0, 255, 0]), + 22: 
+ dict( + link=('right_ankle', 'right_big_toe'), id=22, color=[255, 128, 0]), + 23: + dict( + link=('right_ankle', 'right_small_toe'), + id=23, + color=[255, 128, 0]), + 24: + dict(link=('right_ankle', 'right_heel'), id=24, color=[255, 128, 0]), + 25: + dict( + link=('left_hand_root', 'left_thumb1'), id=25, color=[255, 128, + 0]), + 26: + dict(link=('left_thumb1', 'left_thumb2'), id=26, color=[255, 128, 0]), + 27: + dict(link=('left_thumb2', 'left_thumb3'), id=27, color=[255, 128, 0]), + 28: + dict(link=('left_thumb3', 'left_thumb4'), id=28, color=[255, 128, 0]), + 29: + dict( + link=('left_hand_root', 'left_forefinger1'), + id=29, + color=[255, 153, 255]), + 30: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=30, + color=[255, 153, 255]), + 31: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=31, + color=[255, 153, 255]), + 32: + dict( + link=('left_forefinger3', 'left_forefinger4'), + id=32, + color=[255, 153, 255]), + 33: + dict( + link=('left_hand_root', 'left_middle_finger1'), + id=33, + color=[102, 178, 255]), + 34: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=34, + color=[102, 178, 255]), + 35: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=35, + color=[102, 178, 255]), + 36: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=36, + color=[102, 178, 255]), + 37: + dict( + link=('left_hand_root', 'left_ring_finger1'), + id=37, + color=[255, 51, 51]), + 38: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=38, + color=[255, 51, 51]), + 39: + dict( + link=('left_ring_finger2', 'left_ring_finger3'), + id=39, + color=[255, 51, 51]), + 40: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=40, + color=[255, 51, 51]), + 41: + dict( + link=('left_hand_root', 'left_pinky_finger1'), + id=41, + color=[0, 255, 0]), + 42: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=42, + color=[0, 255, 0]), + 43: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=43, + color=[0, 255, 0]), + 44: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=44, + color=[0, 255, 0]), + 45: + dict( + link=('right_hand_root', 'right_thumb1'), + id=45, + color=[255, 128, 0]), + 46: + dict( + link=('right_thumb1', 'right_thumb2'), id=46, color=[255, 128, 0]), + 47: + dict( + link=('right_thumb2', 'right_thumb3'), id=47, color=[255, 128, 0]), + 48: + dict( + link=('right_thumb3', 'right_thumb4'), id=48, color=[255, 128, 0]), + 49: + dict( + link=('right_hand_root', 'right_forefinger1'), + id=49, + color=[255, 153, 255]), + 50: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=50, + color=[255, 153, 255]), + 51: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=51, + color=[255, 153, 255]), + 52: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=52, + color=[255, 153, 255]), + 53: + dict( + link=('right_hand_root', 'right_middle_finger1'), + id=53, + color=[102, 178, 255]), + 54: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=54, + color=[102, 178, 255]), + 55: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=55, + color=[102, 178, 255]), + 56: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=56, + color=[102, 178, 255]), + 57: + dict( + link=('right_hand_root', 'right_ring_finger1'), + id=57, + color=[255, 51, 51]), + 58: + dict( + link=('right_ring_finger1', 'right_ring_finger2'), + id=58, + color=[255, 51, 51]), + 59: + dict( + link=('right_ring_finger2', 
'right_ring_finger3'), + id=59, + color=[255, 51, 51]), + 60: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=60, + color=[255, 51, 51]), + 61: + dict( + link=('right_hand_root', 'right_pinky_finger1'), + id=61, + color=[0, 255, 0]), + 62: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=62, + color=[0, 255, 0]), + 63: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=63, + color=[0, 255, 0]), + 64: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=64, + color=[0, 255, 0]) + }, + joint_weights=[1.] * 133, + # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/' + # 'evaluation/myeval_wholebody.py#L175' + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.068, 0.066, 0.066, + 0.092, 0.094, 0.094, 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, 0.031, + 0.025, 0.020, 0.023, 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, 0.045, + 0.013, 0.012, 0.011, 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, 0.015, + 0.009, 0.007, 0.007, 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, 0.017, + 0.011, 0.009, 0.011, 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010, + 0.034, 0.008, 0.008, 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009, + 0.009, 0.009, 0.007, 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01, + 0.008, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035, + 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019, + 0.022, 0.031, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, + 0.035, 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, + 0.019, 0.022, 0.031 + ]) diff --git a/configs/_base_/datasets/coco_wholebody_face.py b/configs/_base_/datasets/coco_wholebody_face.py new file mode 100644 index 0000000..7c9ee33 --- /dev/null +++ b/configs/_base_/datasets/coco_wholebody_face.py @@ -0,0 +1,448 @@ +dataset_info = dict( + dataset_name='coco_wholebody_face', + paper_info=dict( + author='Jin, Sheng and Xu, Lumin and Xu, Jin and ' + 'Wang, Can and Liu, Wentao and ' + 'Qian, Chen and Ouyang, Wanli and Luo, Ping', + title='Whole-Body Human Pose Estimation in the Wild', + container='Proceedings of the European ' + 'Conference on Computer Vision (ECCV)', + year='2020', + homepage='https://github.com/jin-s13/COCO-WholeBody/', + ), + keypoint_info={ + 0: + dict( + name='face-0', + id=0, + color=[255, 255, 255], + type='', + swap='face-16'), + 1: + dict( + name='face-1', + id=1, + color=[255, 255, 255], + type='', + swap='face-15'), + 2: + dict( + name='face-2', + id=2, + color=[255, 255, 255], + type='', + swap='face-14'), + 3: + dict( + name='face-3', + id=3, + color=[255, 255, 255], + type='', + swap='face-13'), + 4: + dict( + name='face-4', + id=4, + color=[255, 255, 255], + type='', + swap='face-12'), + 5: + dict( + name='face-5', + id=5, + color=[255, 255, 255], + type='', + swap='face-11'), + 6: + dict( + name='face-6', + id=6, + color=[255, 255, 255], + type='', + swap='face-10'), + 7: + dict( + name='face-7', id=7, color=[255, 255, 255], type='', + swap='face-9'), + 8: + dict(name='face-8', id=8, color=[255, 255, 255], type='', swap=''), + 9: + dict( + name='face-9', id=9, color=[255, 255, 255], type='', + swap='face-7'), + 10: + dict( + name='face-10', + id=10, + color=[255, 255, 255], + type='', + swap='face-6'), + 11: + dict( + name='face-11', + id=11, + color=[255, 255, 255], + type='', + swap='face-5'), + 12: + dict( + name='face-12', + id=12, + color=[255, 255, 255], + type='', + swap='face-4'), + 13: + dict( + 
name='face-13', + id=13, + color=[255, 255, 255], + type='', + swap='face-3'), + 14: + dict( + name='face-14', + id=14, + color=[255, 255, 255], + type='', + swap='face-2'), + 15: + dict( + name='face-15', + id=15, + color=[255, 255, 255], + type='', + swap='face-1'), + 16: + dict( + name='face-16', + id=16, + color=[255, 255, 255], + type='', + swap='face-0'), + 17: + dict( + name='face-17', + id=17, + color=[255, 255, 255], + type='', + swap='face-26'), + 18: + dict( + name='face-18', + id=18, + color=[255, 255, 255], + type='', + swap='face-25'), + 19: + dict( + name='face-19', + id=19, + color=[255, 255, 255], + type='', + swap='face-24'), + 20: + dict( + name='face-20', + id=20, + color=[255, 255, 255], + type='', + swap='face-23'), + 21: + dict( + name='face-21', + id=21, + color=[255, 255, 255], + type='', + swap='face-22'), + 22: + dict( + name='face-22', + id=22, + color=[255, 255, 255], + type='', + swap='face-21'), + 23: + dict( + name='face-23', + id=23, + color=[255, 255, 255], + type='', + swap='face-20'), + 24: + dict( + name='face-24', + id=24, + color=[255, 255, 255], + type='', + swap='face-19'), + 25: + dict( + name='face-25', + id=25, + color=[255, 255, 255], + type='', + swap='face-18'), + 26: + dict( + name='face-26', + id=26, + color=[255, 255, 255], + type='', + swap='face-17'), + 27: + dict(name='face-27', id=27, color=[255, 255, 255], type='', swap=''), + 28: + dict(name='face-28', id=28, color=[255, 255, 255], type='', swap=''), + 29: + dict(name='face-29', id=29, color=[255, 255, 255], type='', swap=''), + 30: + dict(name='face-30', id=30, color=[255, 255, 255], type='', swap=''), + 31: + dict( + name='face-31', + id=31, + color=[255, 255, 255], + type='', + swap='face-35'), + 32: + dict( + name='face-32', + id=32, + color=[255, 255, 255], + type='', + swap='face-34'), + 33: + dict(name='face-33', id=33, color=[255, 255, 255], type='', swap=''), + 34: + dict( + name='face-34', + id=34, + color=[255, 255, 255], + type='', + swap='face-32'), + 35: + dict( + name='face-35', + id=35, + color=[255, 255, 255], + type='', + swap='face-31'), + 36: + dict( + name='face-36', + id=36, + color=[255, 255, 255], + type='', + swap='face-45'), + 37: + dict( + name='face-37', + id=37, + color=[255, 255, 255], + type='', + swap='face-44'), + 38: + dict( + name='face-38', + id=38, + color=[255, 255, 255], + type='', + swap='face-43'), + 39: + dict( + name='face-39', + id=39, + color=[255, 255, 255], + type='', + swap='face-42'), + 40: + dict( + name='face-40', + id=40, + color=[255, 255, 255], + type='', + swap='face-47'), + 41: + dict( + name='face-41', + id=41, + color=[255, 255, 255], + type='', + swap='face-46'), + 42: + dict( + name='face-42', + id=42, + color=[255, 255, 255], + type='', + swap='face-39'), + 43: + dict( + name='face-43', + id=43, + color=[255, 255, 255], + type='', + swap='face-38'), + 44: + dict( + name='face-44', + id=44, + color=[255, 255, 255], + type='', + swap='face-37'), + 45: + dict( + name='face-45', + id=45, + color=[255, 255, 255], + type='', + swap='face-36'), + 46: + dict( + name='face-46', + id=46, + color=[255, 255, 255], + type='', + swap='face-41'), + 47: + dict( + name='face-47', + id=47, + color=[255, 255, 255], + type='', + swap='face-40'), + 48: + dict( + name='face-48', + id=48, + color=[255, 255, 255], + type='', + swap='face-54'), + 49: + dict( + name='face-49', + id=49, + color=[255, 255, 255], + type='', + swap='face-53'), + 50: + dict( + name='face-50', + id=50, + color=[255, 255, 255], + type='', + swap='face-52'), + 51: + 
dict(name='face-51', id=52, color=[255, 255, 255], type='', swap=''), + 52: + dict( + name='face-52', + id=52, + color=[255, 255, 255], + type='', + swap='face-50'), + 53: + dict( + name='face-53', + id=53, + color=[255, 255, 255], + type='', + swap='face-49'), + 54: + dict( + name='face-54', + id=54, + color=[255, 255, 255], + type='', + swap='face-48'), + 55: + dict( + name='face-55', + id=55, + color=[255, 255, 255], + type='', + swap='face-59'), + 56: + dict( + name='face-56', + id=56, + color=[255, 255, 255], + type='', + swap='face-58'), + 57: + dict(name='face-57', id=57, color=[255, 255, 255], type='', swap=''), + 58: + dict( + name='face-58', + id=58, + color=[255, 255, 255], + type='', + swap='face-56'), + 59: + dict( + name='face-59', + id=59, + color=[255, 255, 255], + type='', + swap='face-55'), + 60: + dict( + name='face-60', + id=60, + color=[255, 255, 255], + type='', + swap='face-64'), + 61: + dict( + name='face-61', + id=61, + color=[255, 255, 255], + type='', + swap='face-63'), + 62: + dict(name='face-62', id=62, color=[255, 255, 255], type='', swap=''), + 63: + dict( + name='face-63', + id=63, + color=[255, 255, 255], + type='', + swap='face-61'), + 64: + dict( + name='face-64', + id=64, + color=[255, 255, 255], + type='', + swap='face-60'), + 65: + dict( + name='face-65', + id=65, + color=[255, 255, 255], + type='', + swap='face-67'), + 66: + dict(name='face-66', id=66, color=[255, 255, 255], type='', swap=''), + 67: + dict( + name='face-67', + id=67, + color=[255, 255, 255], + type='', + swap='face-65') + }, + skeleton_info={}, + joint_weights=[1.] * 68, + + # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/' + # 'evaluation/myeval_wholebody.py#L177' + sigmas=[ + 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, 0.031, 0.025, 0.020, 0.023, + 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, 0.045, 0.013, 0.012, 0.011, + 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, 0.015, 0.009, 0.007, 0.007, + 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, 0.017, 0.011, 0.009, 0.011, + 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010, 0.034, 0.008, 0.008, + 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009, 0.009, 0.009, 0.007, + 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01, 0.008 + ]) diff --git a/configs/_base_/datasets/coco_wholebody_hand.py b/configs/_base_/datasets/coco_wholebody_hand.py new file mode 100644 index 0000000..1910b2c --- /dev/null +++ b/configs/_base_/datasets/coco_wholebody_hand.py @@ -0,0 +1,147 @@ +dataset_info = dict( + dataset_name='coco_wholebody_hand', + paper_info=dict( + author='Jin, Sheng and Xu, Lumin and Xu, Jin and ' + 'Wang, Can and Liu, Wentao and ' + 'Qian, Chen and Ouyang, Wanli and Luo, Ping', + title='Whole-Body Human Pose Estimation in the Wild', + container='Proceedings of the European ' + 'Conference on Computer Vision (ECCV)', + year='2020', + homepage='https://github.com/jin-s13/COCO-WholeBody/', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger4', 
id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] 
* 21, + sigmas=[ + 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035, 0.018, + 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019, 0.022, + 0.031 + ]) diff --git a/configs/_base_/datasets/coco_wholebody_info.py b/configs/_base_/datasets/coco_wholebody_info.py new file mode 100644 index 0000000..50ac8fe --- /dev/null +++ b/configs/_base_/datasets/coco_wholebody_info.py @@ -0,0 +1,1154 @@ +cocowholebody_info = dict( + dataset_name='coco_wholebody', + paper_info=dict( + author='Jin, Sheng and Xu, Lumin and Xu, Jin and ' + 'Wang, Can and Liu, Wentao and ' + 'Qian, Chen and Ouyang, Wanli and Luo, Ping', + title='Whole-Body Human Pose Estimation in the Wild', + container='Proceedings of the European ' + 'Conference on Computer Vision (ECCV)', + year='2020', + homepage='https://github.com/jin-s13/COCO-WholeBody/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict( + name='left_big_toe', + id=17, + color=[255, 128, 0], + type='lower', + swap='right_big_toe'), + 18: + dict( + name='left_small_toe', + id=18, + color=[255, 128, 0], + type='lower', + swap='right_small_toe'), + 19: + dict( + name='left_heel', + id=19, + color=[255, 128, 0], + type='lower', + swap='right_heel'), + 20: + dict( + name='right_big_toe', + id=20, + color=[255, 128, 0], + type='lower', + swap='left_big_toe'), + 21: + dict( + name='right_small_toe', + id=21, + color=[255, 128, 0], + type='lower', + swap='left_small_toe'), + 22: + dict( + name='right_heel', + id=22, + color=[255, 128, 0], + type='lower', + swap='left_heel'), + 23: + dict( + name='face-0', + id=23, + color=[255, 255, 255], + type='', + swap='face-16'), + 24: + dict( + name='face-1', + id=24, + color=[255, 255, 255], + type='', + swap='face-15'), + 25: + dict( + name='face-2', + id=25, + color=[255, 255, 255], + type='', + swap='face-14'), + 26: + dict( + 
name='face-3', + id=26, + color=[255, 255, 255], + type='', + swap='face-13'), + 27: + dict( + name='face-4', + id=27, + color=[255, 255, 255], + type='', + swap='face-12'), + 28: + dict( + name='face-5', + id=28, + color=[255, 255, 255], + type='', + swap='face-11'), + 29: + dict( + name='face-6', + id=29, + color=[255, 255, 255], + type='', + swap='face-10'), + 30: + dict( + name='face-7', + id=30, + color=[255, 255, 255], + type='', + swap='face-9'), + 31: + dict(name='face-8', id=31, color=[255, 255, 255], type='', swap=''), + 32: + dict( + name='face-9', + id=32, + color=[255, 255, 255], + type='', + swap='face-7'), + 33: + dict( + name='face-10', + id=33, + color=[255, 255, 255], + type='', + swap='face-6'), + 34: + dict( + name='face-11', + id=34, + color=[255, 255, 255], + type='', + swap='face-5'), + 35: + dict( + name='face-12', + id=35, + color=[255, 255, 255], + type='', + swap='face-4'), + 36: + dict( + name='face-13', + id=36, + color=[255, 255, 255], + type='', + swap='face-3'), + 37: + dict( + name='face-14', + id=37, + color=[255, 255, 255], + type='', + swap='face-2'), + 38: + dict( + name='face-15', + id=38, + color=[255, 255, 255], + type='', + swap='face-1'), + 39: + dict( + name='face-16', + id=39, + color=[255, 255, 255], + type='', + swap='face-0'), + 40: + dict( + name='face-17', + id=40, + color=[255, 255, 255], + type='', + swap='face-26'), + 41: + dict( + name='face-18', + id=41, + color=[255, 255, 255], + type='', + swap='face-25'), + 42: + dict( + name='face-19', + id=42, + color=[255, 255, 255], + type='', + swap='face-24'), + 43: + dict( + name='face-20', + id=43, + color=[255, 255, 255], + type='', + swap='face-23'), + 44: + dict( + name='face-21', + id=44, + color=[255, 255, 255], + type='', + swap='face-22'), + 45: + dict( + name='face-22', + id=45, + color=[255, 255, 255], + type='', + swap='face-21'), + 46: + dict( + name='face-23', + id=46, + color=[255, 255, 255], + type='', + swap='face-20'), + 47: + dict( + name='face-24', + id=47, + color=[255, 255, 255], + type='', + swap='face-19'), + 48: + dict( + name='face-25', + id=48, + color=[255, 255, 255], + type='', + swap='face-18'), + 49: + dict( + name='face-26', + id=49, + color=[255, 255, 255], + type='', + swap='face-17'), + 50: + dict(name='face-27', id=50, color=[255, 255, 255], type='', swap=''), + 51: + dict(name='face-28', id=51, color=[255, 255, 255], type='', swap=''), + 52: + dict(name='face-29', id=52, color=[255, 255, 255], type='', swap=''), + 53: + dict(name='face-30', id=53, color=[255, 255, 255], type='', swap=''), + 54: + dict( + name='face-31', + id=54, + color=[255, 255, 255], + type='', + swap='face-35'), + 55: + dict( + name='face-32', + id=55, + color=[255, 255, 255], + type='', + swap='face-34'), + 56: + dict(name='face-33', id=56, color=[255, 255, 255], type='', swap=''), + 57: + dict( + name='face-34', + id=57, + color=[255, 255, 255], + type='', + swap='face-32'), + 58: + dict( + name='face-35', + id=58, + color=[255, 255, 255], + type='', + swap='face-31'), + 59: + dict( + name='face-36', + id=59, + color=[255, 255, 255], + type='', + swap='face-45'), + 60: + dict( + name='face-37', + id=60, + color=[255, 255, 255], + type='', + swap='face-44'), + 61: + dict( + name='face-38', + id=61, + color=[255, 255, 255], + type='', + swap='face-43'), + 62: + dict( + name='face-39', + id=62, + color=[255, 255, 255], + type='', + swap='face-42'), + 63: + dict( + name='face-40', + id=63, + color=[255, 255, 255], + type='', + swap='face-47'), + 64: + dict( + name='face-41', + id=64, + 
color=[255, 255, 255], + type='', + swap='face-46'), + 65: + dict( + name='face-42', + id=65, + color=[255, 255, 255], + type='', + swap='face-39'), + 66: + dict( + name='face-43', + id=66, + color=[255, 255, 255], + type='', + swap='face-38'), + 67: + dict( + name='face-44', + id=67, + color=[255, 255, 255], + type='', + swap='face-37'), + 68: + dict( + name='face-45', + id=68, + color=[255, 255, 255], + type='', + swap='face-36'), + 69: + dict( + name='face-46', + id=69, + color=[255, 255, 255], + type='', + swap='face-41'), + 70: + dict( + name='face-47', + id=70, + color=[255, 255, 255], + type='', + swap='face-40'), + 71: + dict( + name='face-48', + id=71, + color=[255, 255, 255], + type='', + swap='face-54'), + 72: + dict( + name='face-49', + id=72, + color=[255, 255, 255], + type='', + swap='face-53'), + 73: + dict( + name='face-50', + id=73, + color=[255, 255, 255], + type='', + swap='face-52'), + 74: + dict(name='face-51', id=74, color=[255, 255, 255], type='', swap=''), + 75: + dict( + name='face-52', + id=75, + color=[255, 255, 255], + type='', + swap='face-50'), + 76: + dict( + name='face-53', + id=76, + color=[255, 255, 255], + type='', + swap='face-49'), + 77: + dict( + name='face-54', + id=77, + color=[255, 255, 255], + type='', + swap='face-48'), + 78: + dict( + name='face-55', + id=78, + color=[255, 255, 255], + type='', + swap='face-59'), + 79: + dict( + name='face-56', + id=79, + color=[255, 255, 255], + type='', + swap='face-58'), + 80: + dict(name='face-57', id=80, color=[255, 255, 255], type='', swap=''), + 81: + dict( + name='face-58', + id=81, + color=[255, 255, 255], + type='', + swap='face-56'), + 82: + dict( + name='face-59', + id=82, + color=[255, 255, 255], + type='', + swap='face-55'), + 83: + dict( + name='face-60', + id=83, + color=[255, 255, 255], + type='', + swap='face-64'), + 84: + dict( + name='face-61', + id=84, + color=[255, 255, 255], + type='', + swap='face-63'), + 85: + dict(name='face-62', id=85, color=[255, 255, 255], type='', swap=''), + 86: + dict( + name='face-63', + id=86, + color=[255, 255, 255], + type='', + swap='face-61'), + 87: + dict( + name='face-64', + id=87, + color=[255, 255, 255], + type='', + swap='face-60'), + 88: + dict( + name='face-65', + id=88, + color=[255, 255, 255], + type='', + swap='face-67'), + 89: + dict(name='face-66', id=89, color=[255, 255, 255], type='', swap=''), + 90: + dict( + name='face-67', + id=90, + color=[255, 255, 255], + type='', + swap='face-65'), + 91: + dict( + name='left_hand_root', + id=91, + color=[255, 255, 255], + type='', + swap='right_hand_root'), + 92: + dict( + name='left_thumb1', + id=92, + color=[255, 128, 0], + type='', + swap='right_thumb1'), + 93: + dict( + name='left_thumb2', + id=93, + color=[255, 128, 0], + type='', + swap='right_thumb2'), + 94: + dict( + name='left_thumb3', + id=94, + color=[255, 128, 0], + type='', + swap='right_thumb3'), + 95: + dict( + name='left_thumb4', + id=95, + color=[255, 128, 0], + type='', + swap='right_thumb4'), + 96: + dict( + name='left_forefinger1', + id=96, + color=[255, 153, 255], + type='', + swap='right_forefinger1'), + 97: + dict( + name='left_forefinger2', + id=97, + color=[255, 153, 255], + type='', + swap='right_forefinger2'), + 98: + dict( + name='left_forefinger3', + id=98, + color=[255, 153, 255], + type='', + swap='right_forefinger3'), + 99: + dict( + name='left_forefinger4', + id=99, + color=[255, 153, 255], + type='', + swap='right_forefinger4'), + 100: + dict( + name='left_middle_finger1', + id=100, + color=[102, 178, 255], + type='', + 
swap='right_middle_finger1'), + 101: + dict( + name='left_middle_finger2', + id=101, + color=[102, 178, 255], + type='', + swap='right_middle_finger2'), + 102: + dict( + name='left_middle_finger3', + id=102, + color=[102, 178, 255], + type='', + swap='right_middle_finger3'), + 103: + dict( + name='left_middle_finger4', + id=103, + color=[102, 178, 255], + type='', + swap='right_middle_finger4'), + 104: + dict( + name='left_ring_finger1', + id=104, + color=[255, 51, 51], + type='', + swap='right_ring_finger1'), + 105: + dict( + name='left_ring_finger2', + id=105, + color=[255, 51, 51], + type='', + swap='right_ring_finger2'), + 106: + dict( + name='left_ring_finger3', + id=106, + color=[255, 51, 51], + type='', + swap='right_ring_finger3'), + 107: + dict( + name='left_ring_finger4', + id=107, + color=[255, 51, 51], + type='', + swap='right_ring_finger4'), + 108: + dict( + name='left_pinky_finger1', + id=108, + color=[0, 255, 0], + type='', + swap='right_pinky_finger1'), + 109: + dict( + name='left_pinky_finger2', + id=109, + color=[0, 255, 0], + type='', + swap='right_pinky_finger2'), + 110: + dict( + name='left_pinky_finger3', + id=110, + color=[0, 255, 0], + type='', + swap='right_pinky_finger3'), + 111: + dict( + name='left_pinky_finger4', + id=111, + color=[0, 255, 0], + type='', + swap='right_pinky_finger4'), + 112: + dict( + name='right_hand_root', + id=112, + color=[255, 255, 255], + type='', + swap='left_hand_root'), + 113: + dict( + name='right_thumb1', + id=113, + color=[255, 128, 0], + type='', + swap='left_thumb1'), + 114: + dict( + name='right_thumb2', + id=114, + color=[255, 128, 0], + type='', + swap='left_thumb2'), + 115: + dict( + name='right_thumb3', + id=115, + color=[255, 128, 0], + type='', + swap='left_thumb3'), + 116: + dict( + name='right_thumb4', + id=116, + color=[255, 128, 0], + type='', + swap='left_thumb4'), + 117: + dict( + name='right_forefinger1', + id=117, + color=[255, 153, 255], + type='', + swap='left_forefinger1'), + 118: + dict( + name='right_forefinger2', + id=118, + color=[255, 153, 255], + type='', + swap='left_forefinger2'), + 119: + dict( + name='right_forefinger3', + id=119, + color=[255, 153, 255], + type='', + swap='left_forefinger3'), + 120: + dict( + name='right_forefinger4', + id=120, + color=[255, 153, 255], + type='', + swap='left_forefinger4'), + 121: + dict( + name='right_middle_finger1', + id=121, + color=[102, 178, 255], + type='', + swap='left_middle_finger1'), + 122: + dict( + name='right_middle_finger2', + id=122, + color=[102, 178, 255], + type='', + swap='left_middle_finger2'), + 123: + dict( + name='right_middle_finger3', + id=123, + color=[102, 178, 255], + type='', + swap='left_middle_finger3'), + 124: + dict( + name='right_middle_finger4', + id=124, + color=[102, 178, 255], + type='', + swap='left_middle_finger4'), + 125: + dict( + name='right_ring_finger1', + id=125, + color=[255, 51, 51], + type='', + swap='left_ring_finger1'), + 126: + dict( + name='right_ring_finger2', + id=126, + color=[255, 51, 51], + type='', + swap='left_ring_finger2'), + 127: + dict( + name='right_ring_finger3', + id=127, + color=[255, 51, 51], + type='', + swap='left_ring_finger3'), + 128: + dict( + name='right_ring_finger4', + id=128, + color=[255, 51, 51], + type='', + swap='left_ring_finger4'), + 129: + dict( + name='right_pinky_finger1', + id=129, + color=[0, 255, 0], + type='', + swap='left_pinky_finger1'), + 130: + dict( + name='right_pinky_finger2', + id=130, + color=[0, 255, 0], + type='', + swap='left_pinky_finger2'), + 131: + dict( + 
name='right_pinky_finger3', + id=131, + color=[0, 255, 0], + type='', + swap='left_pinky_finger3'), + 132: + dict( + name='right_pinky_finger4', + id=132, + color=[0, 255, 0], + type='', + swap='left_pinky_finger4') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]), + 19: + dict(link=('left_ankle', 'left_big_toe'), id=19, color=[0, 255, 0]), + 20: + dict(link=('left_ankle', 'left_small_toe'), id=20, color=[0, 255, 0]), + 21: + dict(link=('left_ankle', 'left_heel'), id=21, color=[0, 255, 0]), + 22: + dict( + link=('right_ankle', 'right_big_toe'), id=22, color=[255, 128, 0]), + 23: + dict( + link=('right_ankle', 'right_small_toe'), + id=23, + color=[255, 128, 0]), + 24: + dict(link=('right_ankle', 'right_heel'), id=24, color=[255, 128, 0]), + 25: + dict( + link=('left_hand_root', 'left_thumb1'), id=25, color=[255, 128, + 0]), + 26: + dict(link=('left_thumb1', 'left_thumb2'), id=26, color=[255, 128, 0]), + 27: + dict(link=('left_thumb2', 'left_thumb3'), id=27, color=[255, 128, 0]), + 28: + dict(link=('left_thumb3', 'left_thumb4'), id=28, color=[255, 128, 0]), + 29: + dict( + link=('left_hand_root', 'left_forefinger1'), + id=29, + color=[255, 153, 255]), + 30: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=30, + color=[255, 153, 255]), + 31: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=31, + color=[255, 153, 255]), + 32: + dict( + link=('left_forefinger3', 'left_forefinger4'), + id=32, + color=[255, 153, 255]), + 33: + dict( + link=('left_hand_root', 'left_middle_finger1'), + id=33, + color=[102, 178, 255]), + 34: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=34, + color=[102, 178, 255]), + 35: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=35, + color=[102, 178, 255]), + 36: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=36, + color=[102, 178, 255]), + 37: + dict( + link=('left_hand_root', 'left_ring_finger1'), + id=37, + color=[255, 51, 51]), + 38: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=38, + color=[255, 51, 51]), + 39: + dict( + link=('left_ring_finger2', 
'left_ring_finger3'), + id=39, + color=[255, 51, 51]), + 40: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=40, + color=[255, 51, 51]), + 41: + dict( + link=('left_hand_root', 'left_pinky_finger1'), + id=41, + color=[0, 255, 0]), + 42: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=42, + color=[0, 255, 0]), + 43: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=43, + color=[0, 255, 0]), + 44: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=44, + color=[0, 255, 0]), + 45: + dict( + link=('right_hand_root', 'right_thumb1'), + id=45, + color=[255, 128, 0]), + 46: + dict( + link=('right_thumb1', 'right_thumb2'), id=46, color=[255, 128, 0]), + 47: + dict( + link=('right_thumb2', 'right_thumb3'), id=47, color=[255, 128, 0]), + 48: + dict( + link=('right_thumb3', 'right_thumb4'), id=48, color=[255, 128, 0]), + 49: + dict( + link=('right_hand_root', 'right_forefinger1'), + id=49, + color=[255, 153, 255]), + 50: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=50, + color=[255, 153, 255]), + 51: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=51, + color=[255, 153, 255]), + 52: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=52, + color=[255, 153, 255]), + 53: + dict( + link=('right_hand_root', 'right_middle_finger1'), + id=53, + color=[102, 178, 255]), + 54: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=54, + color=[102, 178, 255]), + 55: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=55, + color=[102, 178, 255]), + 56: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=56, + color=[102, 178, 255]), + 57: + dict( + link=('right_hand_root', 'right_ring_finger1'), + id=57, + color=[255, 51, 51]), + 58: + dict( + link=('right_ring_finger1', 'right_ring_finger2'), + id=58, + color=[255, 51, 51]), + 59: + dict( + link=('right_ring_finger2', 'right_ring_finger3'), + id=59, + color=[255, 51, 51]), + 60: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=60, + color=[255, 51, 51]), + 61: + dict( + link=('right_hand_root', 'right_pinky_finger1'), + id=61, + color=[0, 255, 0]), + 62: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=62, + color=[0, 255, 0]), + 63: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=63, + color=[0, 255, 0]), + 64: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=64, + color=[0, 255, 0]) + }, + joint_weights=[1.] 
* 133, + # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/' + # 'evaluation/myeval_wholebody.py#L175' + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.068, 0.066, 0.066, + 0.092, 0.094, 0.094, 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, 0.031, + 0.025, 0.020, 0.023, 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, 0.045, + 0.013, 0.012, 0.011, 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, 0.015, + 0.009, 0.007, 0.007, 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, 0.017, + 0.011, 0.009, 0.011, 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010, + 0.034, 0.008, 0.008, 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009, + 0.009, 0.009, 0.007, 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01, + 0.008, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035, + 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019, + 0.022, 0.031, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, + 0.035, 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, + 0.019, 0.022, 0.031 + ]) diff --git a/configs/_base_/datasets/cofw.py b/configs/_base_/datasets/cofw.py new file mode 100644 index 0000000..2fb7ad2 --- /dev/null +++ b/configs/_base_/datasets/cofw.py @@ -0,0 +1,134 @@ +dataset_info = dict( + dataset_name='cofw', + paper_info=dict( + author='Burgos-Artizzu, Xavier P and Perona, ' + r'Pietro and Doll{\'a}r, Piotr', + title='Robust face landmark estimation under occlusion', + container='Proceedings of the IEEE international ' + 'conference on computer vision', + year='2013', + homepage='http://www.vision.caltech.edu/xpburgos/ICCV13/', + ), + keypoint_info={ + 0: + dict(name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-1'), + 1: + dict(name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-0'), + 2: + dict(name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-3'), + 3: + dict(name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-2'), + 4: + dict(name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-6'), + 5: + dict(name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-7'), + 6: + dict(name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-4'), + 7: + dict(name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-5'), + 8: + dict(name='kpt-8', id=8, color=[255, 255, 255], type='', swap='kpt-9'), + 9: + dict(name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-8'), + 10: + dict( + name='kpt-10', + id=10, + color=[255, 255, 255], + type='', + swap='kpt-11'), + 11: + dict( + name='kpt-11', + id=11, + color=[255, 255, 255], + type='', + swap='kpt-10'), + 12: + dict( + name='kpt-12', + id=12, + color=[255, 255, 255], + type='', + swap='kpt-14'), + 13: + dict( + name='kpt-13', + id=13, + color=[255, 255, 255], + type='', + swap='kpt-15'), + 14: + dict( + name='kpt-14', + id=14, + color=[255, 255, 255], + type='', + swap='kpt-12'), + 15: + dict( + name='kpt-15', + id=15, + color=[255, 255, 255], + type='', + swap='kpt-13'), + 16: + dict( + name='kpt-16', + id=16, + color=[255, 255, 255], + type='', + swap='kpt-17'), + 17: + dict( + name='kpt-17', + id=17, + color=[255, 255, 255], + type='', + swap='kpt-16'), + 18: + dict( + name='kpt-18', + id=18, + color=[255, 255, 255], + type='', + swap='kpt-19'), + 19: + dict( + name='kpt-19', + id=19, + color=[255, 255, 255], + type='', + swap='kpt-18'), + 20: + dict(name='kpt-20', id=20, color=[255, 255, 255], type='', swap=''), + 21: + dict(name='kpt-21', id=21, color=[255, 255, 255], type='', swap=''), + 
22: + dict( + name='kpt-22', + id=22, + color=[255, 255, 255], + type='', + swap='kpt-23'), + 23: + dict( + name='kpt-23', + id=23, + color=[255, 255, 255], + type='', + swap='kpt-22'), + 24: + dict(name='kpt-24', id=24, color=[255, 255, 255], type='', swap=''), + 25: + dict(name='kpt-25', id=25, color=[255, 255, 255], type='', swap=''), + 26: + dict(name='kpt-26', id=26, color=[255, 255, 255], type='', swap=''), + 27: + dict(name='kpt-27', id=27, color=[255, 255, 255], type='', swap=''), + 28: + dict(name='kpt-28', id=28, color=[255, 255, 255], type='', swap='') + }, + skeleton_info={}, + joint_weights=[1.] * 29, + sigmas=[]) diff --git a/configs/_base_/datasets/crowdpose.py b/configs/_base_/datasets/crowdpose.py new file mode 100644 index 0000000..4508653 --- /dev/null +++ b/configs/_base_/datasets/crowdpose.py @@ -0,0 +1,147 @@ +dataset_info = dict( + dataset_name='crowdpose', + paper_info=dict( + author='Li, Jiefeng and Wang, Can and Zhu, Hao and ' + 'Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu', + title='CrowdPose: Efficient Crowded Scenes Pose Estimation ' + 'and A New Benchmark', + container='Proceedings of IEEE Conference on Computer ' + 'Vision and Pattern Recognition (CVPR)', + year='2019', + homepage='https://github.com/Jeff-sjtu/CrowdPose', + ), + keypoint_info={ + 0: + dict( + name='left_shoulder', + id=0, + color=[51, 153, 255], + type='upper', + swap='right_shoulder'), + 1: + dict( + name='right_shoulder', + id=1, + color=[51, 153, 255], + type='upper', + swap='left_shoulder'), + 2: + dict( + name='left_elbow', + id=2, + color=[51, 153, 255], + type='upper', + swap='right_elbow'), + 3: + dict( + name='right_elbow', + id=3, + color=[51, 153, 255], + type='upper', + swap='left_elbow'), + 4: + dict( + name='left_wrist', + id=4, + color=[51, 153, 255], + type='upper', + swap='right_wrist'), + 5: + dict( + name='right_wrist', + id=5, + color=[0, 255, 0], + type='upper', + swap='left_wrist'), + 6: + dict( + name='left_hip', + id=6, + color=[255, 128, 0], + type='lower', + swap='right_hip'), + 7: + dict( + name='right_hip', + id=7, + color=[0, 255, 0], + type='lower', + swap='left_hip'), + 8: + dict( + name='left_knee', + id=8, + color=[255, 128, 0], + type='lower', + swap='right_knee'), + 9: + dict( + name='right_knee', + id=9, + color=[0, 255, 0], + type='lower', + swap='left_knee'), + 10: + dict( + name='left_ankle', + id=10, + color=[255, 128, 0], + type='lower', + swap='right_ankle'), + 11: + dict( + name='right_ankle', + id=11, + color=[0, 255, 0], + type='lower', + swap='left_ankle'), + 12: + dict( + name='top_head', id=12, color=[255, 128, 0], type='upper', + swap=''), + 13: + dict(name='neck', id=13, color=[0, 255, 0], type='upper', swap='') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, 
color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('top_head', 'neck'), id=12, color=[51, 153, 255]), + 13: + dict(link=('right_shoulder', 'neck'), id=13, color=[51, 153, 255]), + 14: + dict(link=('left_shoulder', 'neck'), id=14, color=[51, 153, 255]) + }, + joint_weights=[ + 0.2, 0.2, 0.2, 1.3, 1.5, 0.2, 1.3, 1.5, 0.2, 0.2, 0.5, 0.2, 0.2, 0.5 + ], + sigmas=[ + 0.079, 0.079, 0.072, 0.072, 0.062, 0.062, 0.107, 0.107, 0.087, 0.087, + 0.089, 0.089, 0.079, 0.079 + ]) diff --git a/configs/_base_/datasets/deepfashion_full.py b/configs/_base_/datasets/deepfashion_full.py new file mode 100644 index 0000000..4d98906 --- /dev/null +++ b/configs/_base_/datasets/deepfashion_full.py @@ -0,0 +1,74 @@ +dataset_info = dict( + dataset_name='deepfashion_full', + paper_info=dict( + author='Liu, Ziwei and Luo, Ping and Qiu, Shi ' + 'and Wang, Xiaogang and Tang, Xiaoou', + title='DeepFashion: Powering Robust Clothes Recognition ' + 'and Retrieval with Rich Annotations', + container='Proceedings of IEEE Conference on Computer ' + 'Vision and Pattern Recognition (CVPR)', + year='2016', + homepage='http://mmlab.ie.cuhk.edu.hk/projects/' + 'DeepFashion/LandmarkDetection.html', + ), + keypoint_info={ + 0: + dict( + name='left collar', + id=0, + color=[255, 255, 255], + type='', + swap='right collar'), + 1: + dict( + name='right collar', + id=1, + color=[255, 255, 255], + type='', + swap='left collar'), + 2: + dict( + name='left sleeve', + id=2, + color=[255, 255, 255], + type='', + swap='right sleeve'), + 3: + dict( + name='right sleeve', + id=3, + color=[255, 255, 255], + type='', + swap='left sleeve'), + 4: + dict( + name='left waistline', + id=4, + color=[255, 255, 255], + type='', + swap='right waistline'), + 5: + dict( + name='right waistline', + id=5, + color=[255, 255, 255], + type='', + swap='left waistline'), + 6: + dict( + name='left hem', + id=6, + color=[255, 255, 255], + type='', + swap='right hem'), + 7: + dict( + name='right hem', + id=7, + color=[255, 255, 255], + type='', + swap='left hem'), + }, + skeleton_info={}, + joint_weights=[1.] * 8, + sigmas=[]) diff --git a/configs/_base_/datasets/deepfashion_lower.py b/configs/_base_/datasets/deepfashion_lower.py new file mode 100644 index 0000000..db014a1 --- /dev/null +++ b/configs/_base_/datasets/deepfashion_lower.py @@ -0,0 +1,46 @@ +dataset_info = dict( + dataset_name='deepfashion_lower', + paper_info=dict( + author='Liu, Ziwei and Luo, Ping and Qiu, Shi ' + 'and Wang, Xiaogang and Tang, Xiaoou', + title='DeepFashion: Powering Robust Clothes Recognition ' + 'and Retrieval with Rich Annotations', + container='Proceedings of IEEE Conference on Computer ' + 'Vision and Pattern Recognition (CVPR)', + year='2016', + homepage='http://mmlab.ie.cuhk.edu.hk/projects/' + 'DeepFashion/LandmarkDetection.html', + ), + keypoint_info={ + 0: + dict( + name='left waistline', + id=0, + color=[255, 255, 255], + type='', + swap='right waistline'), + 1: + dict( + name='right waistline', + id=1, + color=[255, 255, 255], + type='', + swap='left waistline'), + 2: + dict( + name='left hem', + id=2, + color=[255, 255, 255], + type='', + swap='right hem'), + 3: + dict( + name='right hem', + id=3, + color=[255, 255, 255], + type='', + swap='left hem'), + }, + skeleton_info={}, + joint_weights=[1.] 
* 4, + sigmas=[]) diff --git a/configs/_base_/datasets/deepfashion_upper.py b/configs/_base_/datasets/deepfashion_upper.py new file mode 100644 index 0000000..f0b012f --- /dev/null +++ b/configs/_base_/datasets/deepfashion_upper.py @@ -0,0 +1,60 @@ +dataset_info = dict( + dataset_name='deepfashion_upper', + paper_info=dict( + author='Liu, Ziwei and Luo, Ping and Qiu, Shi ' + 'and Wang, Xiaogang and Tang, Xiaoou', + title='DeepFashion: Powering Robust Clothes Recognition ' + 'and Retrieval with Rich Annotations', + container='Proceedings of IEEE Conference on Computer ' + 'Vision and Pattern Recognition (CVPR)', + year='2016', + homepage='http://mmlab.ie.cuhk.edu.hk/projects/' + 'DeepFashion/LandmarkDetection.html', + ), + keypoint_info={ + 0: + dict( + name='left collar', + id=0, + color=[255, 255, 255], + type='', + swap='right collar'), + 1: + dict( + name='right collar', + id=1, + color=[255, 255, 255], + type='', + swap='left collar'), + 2: + dict( + name='left sleeve', + id=2, + color=[255, 255, 255], + type='', + swap='right sleeve'), + 3: + dict( + name='right sleeve', + id=3, + color=[255, 255, 255], + type='', + swap='left sleeve'), + 4: + dict( + name='left hem', + id=4, + color=[255, 255, 255], + type='', + swap='right hem'), + 5: + dict( + name='right hem', + id=5, + color=[255, 255, 255], + type='', + swap='left hem'), + }, + skeleton_info={}, + joint_weights=[1.] * 6, + sigmas=[]) diff --git a/configs/_base_/datasets/fly.py b/configs/_base_/datasets/fly.py new file mode 100644 index 0000000..5f94ff5 --- /dev/null +++ b/configs/_base_/datasets/fly.py @@ -0,0 +1,237 @@ +dataset_info = dict( + dataset_name='fly', + paper_info=dict( + author='Pereira, Talmo D and Aldarondo, Diego E and ' + 'Willmore, Lindsay and Kislin, Mikhail and ' + 'Wang, Samuel S-H and Murthy, Mala and Shaevitz, Joshua W', + title='Fast animal pose estimation using deep neural networks', + container='Nature methods', + year='2019', + homepage='https://github.com/jgraving/DeepPoseKit-Data', + ), + keypoint_info={ + 0: + dict(name='head', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='eyeL', id=1, color=[255, 255, 255], type='', swap='eyeR'), + 2: + dict(name='eyeR', id=2, color=[255, 255, 255], type='', swap='eyeL'), + 3: + dict(name='neck', id=3, color=[255, 255, 255], type='', swap=''), + 4: + dict(name='thorax', id=4, color=[255, 255, 255], type='', swap=''), + 5: + dict(name='abdomen', id=5, color=[255, 255, 255], type='', swap=''), + 6: + dict( + name='forelegR1', + id=6, + color=[255, 255, 255], + type='', + swap='forelegL1'), + 7: + dict( + name='forelegR2', + id=7, + color=[255, 255, 255], + type='', + swap='forelegL2'), + 8: + dict( + name='forelegR3', + id=8, + color=[255, 255, 255], + type='', + swap='forelegL3'), + 9: + dict( + name='forelegR4', + id=9, + color=[255, 255, 255], + type='', + swap='forelegL4'), + 10: + dict( + name='midlegR1', + id=10, + color=[255, 255, 255], + type='', + swap='midlegL1'), + 11: + dict( + name='midlegR2', + id=11, + color=[255, 255, 255], + type='', + swap='midlegL2'), + 12: + dict( + name='midlegR3', + id=12, + color=[255, 255, 255], + type='', + swap='midlegL3'), + 13: + dict( + name='midlegR4', + id=13, + color=[255, 255, 255], + type='', + swap='midlegL4'), + 14: + dict( + name='hindlegR1', + id=14, + color=[255, 255, 255], + type='', + swap='hindlegL1'), + 15: + dict( + name='hindlegR2', + id=15, + color=[255, 255, 255], + type='', + swap='hindlegL2'), + 16: + dict( + name='hindlegR3', + id=16, + color=[255, 255, 255], + type='', + 
swap='hindlegL3'), + 17: + dict( + name='hindlegR4', + id=17, + color=[255, 255, 255], + type='', + swap='hindlegL4'), + 18: + dict( + name='forelegL1', + id=18, + color=[255, 255, 255], + type='', + swap='forelegR1'), + 19: + dict( + name='forelegL2', + id=19, + color=[255, 255, 255], + type='', + swap='forelegR2'), + 20: + dict( + name='forelegL3', + id=20, + color=[255, 255, 255], + type='', + swap='forelegR3'), + 21: + dict( + name='forelegL4', + id=21, + color=[255, 255, 255], + type='', + swap='forelegR4'), + 22: + dict( + name='midlegL1', + id=22, + color=[255, 255, 255], + type='', + swap='midlegR1'), + 23: + dict( + name='midlegL2', + id=23, + color=[255, 255, 255], + type='', + swap='midlegR2'), + 24: + dict( + name='midlegL3', + id=24, + color=[255, 255, 255], + type='', + swap='midlegR3'), + 25: + dict( + name='midlegL4', + id=25, + color=[255, 255, 255], + type='', + swap='midlegR4'), + 26: + dict( + name='hindlegL1', + id=26, + color=[255, 255, 255], + type='', + swap='hindlegR1'), + 27: + dict( + name='hindlegL2', + id=27, + color=[255, 255, 255], + type='', + swap='hindlegR2'), + 28: + dict( + name='hindlegL3', + id=28, + color=[255, 255, 255], + type='', + swap='hindlegR3'), + 29: + dict( + name='hindlegL4', + id=29, + color=[255, 255, 255], + type='', + swap='hindlegR4'), + 30: + dict( + name='wingL', id=30, color=[255, 255, 255], type='', swap='wingR'), + 31: + dict( + name='wingR', id=31, color=[255, 255, 255], type='', swap='wingL'), + }, + skeleton_info={ + 0: dict(link=('eyeL', 'head'), id=0, color=[255, 255, 255]), + 1: dict(link=('eyeR', 'head'), id=1, color=[255, 255, 255]), + 2: dict(link=('neck', 'head'), id=2, color=[255, 255, 255]), + 3: dict(link=('thorax', 'neck'), id=3, color=[255, 255, 255]), + 4: dict(link=('abdomen', 'thorax'), id=4, color=[255, 255, 255]), + 5: dict(link=('forelegR2', 'forelegR1'), id=5, color=[255, 255, 255]), + 6: dict(link=('forelegR3', 'forelegR2'), id=6, color=[255, 255, 255]), + 7: dict(link=('forelegR4', 'forelegR3'), id=7, color=[255, 255, 255]), + 8: dict(link=('midlegR2', 'midlegR1'), id=8, color=[255, 255, 255]), + 9: dict(link=('midlegR3', 'midlegR2'), id=9, color=[255, 255, 255]), + 10: dict(link=('midlegR4', 'midlegR3'), id=10, color=[255, 255, 255]), + 11: + dict(link=('hindlegR2', 'hindlegR1'), id=11, color=[255, 255, 255]), + 12: + dict(link=('hindlegR3', 'hindlegR2'), id=12, color=[255, 255, 255]), + 13: + dict(link=('hindlegR4', 'hindlegR3'), id=13, color=[255, 255, 255]), + 14: + dict(link=('forelegL2', 'forelegL1'), id=14, color=[255, 255, 255]), + 15: + dict(link=('forelegL3', 'forelegL2'), id=15, color=[255, 255, 255]), + 16: + dict(link=('forelegL4', 'forelegL3'), id=16, color=[255, 255, 255]), + 17: dict(link=('midlegL2', 'midlegL1'), id=17, color=[255, 255, 255]), + 18: dict(link=('midlegL3', 'midlegL2'), id=18, color=[255, 255, 255]), + 19: dict(link=('midlegL4', 'midlegL3'), id=19, color=[255, 255, 255]), + 20: + dict(link=('hindlegL2', 'hindlegL1'), id=20, color=[255, 255, 255]), + 21: + dict(link=('hindlegL3', 'hindlegL2'), id=21, color=[255, 255, 255]), + 22: + dict(link=('hindlegL4', 'hindlegL3'), id=22, color=[255, 255, 255]), + 23: dict(link=('wingL', 'neck'), id=23, color=[255, 255, 255]), + 24: dict(link=('wingR', 'neck'), id=24, color=[255, 255, 255]) + }, + joint_weights=[1.] 
* 32, + sigmas=[]) diff --git a/configs/_base_/datasets/freihand2d.py b/configs/_base_/datasets/freihand2d.py new file mode 100644 index 0000000..8b960d1 --- /dev/null +++ b/configs/_base_/datasets/freihand2d.py @@ -0,0 +1,144 @@ +dataset_info = dict( + dataset_name='freihand', + paper_info=dict( + author='Zimmermann, Christian and Ceylan, Duygu and ' + 'Yang, Jimei and Russell, Bryan and ' + 'Argus, Max and Brox, Thomas', + title='Freihand: A dataset for markerless capture of hand pose ' + 'and shape from single rgb images', + container='Proceedings of the IEEE International ' + 'Conference on Computer Vision', + year='2019', + homepage='https://lmb.informatik.uni-freiburg.de/projects/freihand/', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), 
id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] * 21, + sigmas=[]) diff --git a/configs/_base_/datasets/h36m.py b/configs/_base_/datasets/h36m.py new file mode 100644 index 0000000..00a719d --- /dev/null +++ b/configs/_base_/datasets/h36m.py @@ -0,0 +1,152 @@ +dataset_info = dict( + dataset_name='h36m', + paper_info=dict( + author='Ionescu, Catalin and Papava, Dragos and ' + 'Olaru, Vlad and Sminchisescu, Cristian', + title='Human3.6M: Large Scale Datasets and Predictive ' + 'Methods for 3D Human Sensing in Natural Environments', + container='IEEE Transactions on Pattern Analysis and ' + 'Machine Intelligence', + year='2014', + homepage='http://vision.imar.ro/human3.6m/description.php', + ), + keypoint_info={ + 0: + dict(name='root', id=0, color=[51, 153, 255], type='lower', swap=''), + 1: + dict( + name='right_hip', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 2: + dict( + name='right_knee', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 3: + dict( + name='right_foot', + id=3, + color=[255, 128, 0], + type='lower', + swap='left_foot'), + 4: + dict( + name='left_hip', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 5: + dict( + name='left_knee', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 6: + dict( + name='left_foot', + id=6, + color=[0, 255, 0], + type='lower', + swap='right_foot'), + 7: + dict(name='spine', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict(name='thorax', id=8, color=[51, 153, 255], type='upper', swap=''), + 9: + dict( + name='neck_base', + id=9, + color=[51, 153, 255], + type='upper', + swap=''), + 10: + dict(name='head', id=10, color=[51, 153, 255], type='upper', swap=''), + 11: + dict( + name='left_shoulder', + id=11, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 12: + dict( + name='left_elbow', + id=12, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 13: + dict( + name='left_wrist', + id=13, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 14: + dict( + name='right_shoulder', + id=14, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 15: + dict( + name='right_elbow', + id=15, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 16: + dict( + name='right_wrist', + id=16, + color=[255, 128, 0], + type='upper', + swap='left_wrist') + }, + skeleton_info={ + 0: + dict(link=('root', 'left_hip'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_hip', 'left_knee'), id=1, color=[0, 255, 0]), + 2: + dict(link=('left_knee', 'left_foot'), id=2, color=[0, 255, 0]), + 3: + dict(link=('root', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('right_hip', 'right_knee'), id=4, color=[255, 128, 0]), + 5: + dict(link=('right_knee', 'right_foot'), id=5, color=[255, 128, 0]), + 6: + dict(link=('root', 'spine'), id=6, color=[51, 153, 255]), + 7: + dict(link=('spine', 'thorax'), id=7, color=[51, 153, 255]), + 8: + dict(link=('thorax', 'neck_base'), id=8, color=[51, 153, 255]), + 9: + dict(link=('neck_base', 
'head'), id=9, color=[51, 153, 255]), + 10: + dict(link=('thorax', 'left_shoulder'), id=10, color=[0, 255, 0]), + 11: + dict(link=('left_shoulder', 'left_elbow'), id=11, color=[0, 255, 0]), + 12: + dict(link=('left_elbow', 'left_wrist'), id=12, color=[0, 255, 0]), + 13: + dict(link=('thorax', 'right_shoulder'), id=13, color=[255, 128, 0]), + 14: + dict( + link=('right_shoulder', 'right_elbow'), id=14, color=[255, 128, + 0]), + 15: + dict(link=('right_elbow', 'right_wrist'), id=15, color=[255, 128, 0]) + }, + joint_weights=[1.] * 17, + sigmas=[], + stats_info=dict(bbox_center=(528., 427.), bbox_scale=400.)) diff --git a/configs/_base_/datasets/halpe.py b/configs/_base_/datasets/halpe.py new file mode 100644 index 0000000..1385fe8 --- /dev/null +++ b/configs/_base_/datasets/halpe.py @@ -0,0 +1,1157 @@ +dataset_info = dict( + dataset_name='halpe', + paper_info=dict( + author='Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie' + ' and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu' + ' and Ma, Ze and Chen, Mingyang and Lu, Cewu', + title='PaStaNet: Toward Human Activity Knowledge Engine', + container='CVPR', + year='2020', + homepage='https://github.com/Fang-Haoshu/Halpe-FullBody/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict(name='head', id=17, color=[255, 128, 0], type='upper', swap=''), + 18: + dict(name='neck', id=18, color=[255, 128, 0], type='upper', swap=''), + 19: + dict(name='hip', id=19, color=[255, 128, 0], type='lower', swap=''), + 20: + dict( + name='left_big_toe', + id=20, + color=[255, 128, 0], + type='lower', + swap='right_big_toe'), + 21: + dict( + name='right_big_toe', + id=21, + color=[255, 128, 0], + type='lower', + swap='left_big_toe'), + 22: + dict( + name='left_small_toe', + id=22, + color=[255, 128, 0], + type='lower', + swap='right_small_toe'), + 23: + dict( + 
name='right_small_toe', + id=23, + color=[255, 128, 0], + type='lower', + swap='left_small_toe'), + 24: + dict( + name='left_heel', + id=24, + color=[255, 128, 0], + type='lower', + swap='right_heel'), + 25: + dict( + name='right_heel', + id=25, + color=[255, 128, 0], + type='lower', + swap='left_heel'), + 26: + dict( + name='face-0', + id=26, + color=[255, 255, 255], + type='', + swap='face-16'), + 27: + dict( + name='face-1', + id=27, + color=[255, 255, 255], + type='', + swap='face-15'), + 28: + dict( + name='face-2', + id=28, + color=[255, 255, 255], + type='', + swap='face-14'), + 29: + dict( + name='face-3', + id=29, + color=[255, 255, 255], + type='', + swap='face-13'), + 30: + dict( + name='face-4', + id=30, + color=[255, 255, 255], + type='', + swap='face-12'), + 31: + dict( + name='face-5', + id=31, + color=[255, 255, 255], + type='', + swap='face-11'), + 32: + dict( + name='face-6', + id=32, + color=[255, 255, 255], + type='', + swap='face-10'), + 33: + dict( + name='face-7', + id=33, + color=[255, 255, 255], + type='', + swap='face-9'), + 34: + dict(name='face-8', id=34, color=[255, 255, 255], type='', swap=''), + 35: + dict( + name='face-9', + id=35, + color=[255, 255, 255], + type='', + swap='face-7'), + 36: + dict( + name='face-10', + id=36, + color=[255, 255, 255], + type='', + swap='face-6'), + 37: + dict( + name='face-11', + id=37, + color=[255, 255, 255], + type='', + swap='face-5'), + 38: + dict( + name='face-12', + id=38, + color=[255, 255, 255], + type='', + swap='face-4'), + 39: + dict( + name='face-13', + id=39, + color=[255, 255, 255], + type='', + swap='face-3'), + 40: + dict( + name='face-14', + id=40, + color=[255, 255, 255], + type='', + swap='face-2'), + 41: + dict( + name='face-15', + id=41, + color=[255, 255, 255], + type='', + swap='face-1'), + 42: + dict( + name='face-16', + id=42, + color=[255, 255, 255], + type='', + swap='face-0'), + 43: + dict( + name='face-17', + id=43, + color=[255, 255, 255], + type='', + swap='face-26'), + 44: + dict( + name='face-18', + id=44, + color=[255, 255, 255], + type='', + swap='face-25'), + 45: + dict( + name='face-19', + id=45, + color=[255, 255, 255], + type='', + swap='face-24'), + 46: + dict( + name='face-20', + id=46, + color=[255, 255, 255], + type='', + swap='face-23'), + 47: + dict( + name='face-21', + id=47, + color=[255, 255, 255], + type='', + swap='face-22'), + 48: + dict( + name='face-22', + id=48, + color=[255, 255, 255], + type='', + swap='face-21'), + 49: + dict( + name='face-23', + id=49, + color=[255, 255, 255], + type='', + swap='face-20'), + 50: + dict( + name='face-24', + id=50, + color=[255, 255, 255], + type='', + swap='face-19'), + 51: + dict( + name='face-25', + id=51, + color=[255, 255, 255], + type='', + swap='face-18'), + 52: + dict( + name='face-26', + id=52, + color=[255, 255, 255], + type='', + swap='face-17'), + 53: + dict(name='face-27', id=53, color=[255, 255, 255], type='', swap=''), + 54: + dict(name='face-28', id=54, color=[255, 255, 255], type='', swap=''), + 55: + dict(name='face-29', id=55, color=[255, 255, 255], type='', swap=''), + 56: + dict(name='face-30', id=56, color=[255, 255, 255], type='', swap=''), + 57: + dict( + name='face-31', + id=57, + color=[255, 255, 255], + type='', + swap='face-35'), + 58: + dict( + name='face-32', + id=58, + color=[255, 255, 255], + type='', + swap='face-34'), + 59: + dict(name='face-33', id=59, color=[255, 255, 255], type='', swap=''), + 60: + dict( + name='face-34', + id=60, + color=[255, 255, 255], + type='', + swap='face-32'), + 61: + dict( + 
name='face-35', + id=61, + color=[255, 255, 255], + type='', + swap='face-31'), + 62: + dict( + name='face-36', + id=62, + color=[255, 255, 255], + type='', + swap='face-45'), + 63: + dict( + name='face-37', + id=63, + color=[255, 255, 255], + type='', + swap='face-44'), + 64: + dict( + name='face-38', + id=64, + color=[255, 255, 255], + type='', + swap='face-43'), + 65: + dict( + name='face-39', + id=65, + color=[255, 255, 255], + type='', + swap='face-42'), + 66: + dict( + name='face-40', + id=66, + color=[255, 255, 255], + type='', + swap='face-47'), + 67: + dict( + name='face-41', + id=67, + color=[255, 255, 255], + type='', + swap='face-46'), + 68: + dict( + name='face-42', + id=68, + color=[255, 255, 255], + type='', + swap='face-39'), + 69: + dict( + name='face-43', + id=69, + color=[255, 255, 255], + type='', + swap='face-38'), + 70: + dict( + name='face-44', + id=70, + color=[255, 255, 255], + type='', + swap='face-37'), + 71: + dict( + name='face-45', + id=71, + color=[255, 255, 255], + type='', + swap='face-36'), + 72: + dict( + name='face-46', + id=72, + color=[255, 255, 255], + type='', + swap='face-41'), + 73: + dict( + name='face-47', + id=73, + color=[255, 255, 255], + type='', + swap='face-40'), + 74: + dict( + name='face-48', + id=74, + color=[255, 255, 255], + type='', + swap='face-54'), + 75: + dict( + name='face-49', + id=75, + color=[255, 255, 255], + type='', + swap='face-53'), + 76: + dict( + name='face-50', + id=76, + color=[255, 255, 255], + type='', + swap='face-52'), + 77: + dict(name='face-51', id=77, color=[255, 255, 255], type='', swap=''), + 78: + dict( + name='face-52', + id=78, + color=[255, 255, 255], + type='', + swap='face-50'), + 79: + dict( + name='face-53', + id=79, + color=[255, 255, 255], + type='', + swap='face-49'), + 80: + dict( + name='face-54', + id=80, + color=[255, 255, 255], + type='', + swap='face-48'), + 81: + dict( + name='face-55', + id=81, + color=[255, 255, 255], + type='', + swap='face-59'), + 82: + dict( + name='face-56', + id=82, + color=[255, 255, 255], + type='', + swap='face-58'), + 83: + dict(name='face-57', id=83, color=[255, 255, 255], type='', swap=''), + 84: + dict( + name='face-58', + id=84, + color=[255, 255, 255], + type='', + swap='face-56'), + 85: + dict( + name='face-59', + id=85, + color=[255, 255, 255], + type='', + swap='face-55'), + 86: + dict( + name='face-60', + id=86, + color=[255, 255, 255], + type='', + swap='face-64'), + 87: + dict( + name='face-61', + id=87, + color=[255, 255, 255], + type='', + swap='face-63'), + 88: + dict(name='face-62', id=88, color=[255, 255, 255], type='', swap=''), + 89: + dict( + name='face-63', + id=89, + color=[255, 255, 255], + type='', + swap='face-61'), + 90: + dict( + name='face-64', + id=90, + color=[255, 255, 255], + type='', + swap='face-60'), + 91: + dict( + name='face-65', + id=91, + color=[255, 255, 255], + type='', + swap='face-67'), + 92: + dict(name='face-66', id=92, color=[255, 255, 255], type='', swap=''), + 93: + dict( + name='face-67', + id=93, + color=[255, 255, 255], + type='', + swap='face-65'), + 94: + dict( + name='left_hand_root', + id=94, + color=[255, 255, 255], + type='', + swap='right_hand_root'), + 95: + dict( + name='left_thumb1', + id=95, + color=[255, 128, 0], + type='', + swap='right_thumb1'), + 96: + dict( + name='left_thumb2', + id=96, + color=[255, 128, 0], + type='', + swap='right_thumb2'), + 97: + dict( + name='left_thumb3', + id=97, + color=[255, 128, 0], + type='', + swap='right_thumb3'), + 98: + dict( + name='left_thumb4', + id=98, + 
color=[255, 128, 0], + type='', + swap='right_thumb4'), + 99: + dict( + name='left_forefinger1', + id=99, + color=[255, 153, 255], + type='', + swap='right_forefinger1'), + 100: + dict( + name='left_forefinger2', + id=100, + color=[255, 153, 255], + type='', + swap='right_forefinger2'), + 101: + dict( + name='left_forefinger3', + id=101, + color=[255, 153, 255], + type='', + swap='right_forefinger3'), + 102: + dict( + name='left_forefinger4', + id=102, + color=[255, 153, 255], + type='', + swap='right_forefinger4'), + 103: + dict( + name='left_middle_finger1', + id=103, + color=[102, 178, 255], + type='', + swap='right_middle_finger1'), + 104: + dict( + name='left_middle_finger2', + id=104, + color=[102, 178, 255], + type='', + swap='right_middle_finger2'), + 105: + dict( + name='left_middle_finger3', + id=105, + color=[102, 178, 255], + type='', + swap='right_middle_finger3'), + 106: + dict( + name='left_middle_finger4', + id=106, + color=[102, 178, 255], + type='', + swap='right_middle_finger4'), + 107: + dict( + name='left_ring_finger1', + id=107, + color=[255, 51, 51], + type='', + swap='right_ring_finger1'), + 108: + dict( + name='left_ring_finger2', + id=108, + color=[255, 51, 51], + type='', + swap='right_ring_finger2'), + 109: + dict( + name='left_ring_finger3', + id=109, + color=[255, 51, 51], + type='', + swap='right_ring_finger3'), + 110: + dict( + name='left_ring_finger4', + id=110, + color=[255, 51, 51], + type='', + swap='right_ring_finger4'), + 111: + dict( + name='left_pinky_finger1', + id=111, + color=[0, 255, 0], + type='', + swap='right_pinky_finger1'), + 112: + dict( + name='left_pinky_finger2', + id=112, + color=[0, 255, 0], + type='', + swap='right_pinky_finger2'), + 113: + dict( + name='left_pinky_finger3', + id=113, + color=[0, 255, 0], + type='', + swap='right_pinky_finger3'), + 114: + dict( + name='left_pinky_finger4', + id=114, + color=[0, 255, 0], + type='', + swap='right_pinky_finger4'), + 115: + dict( + name='right_hand_root', + id=115, + color=[255, 255, 255], + type='', + swap='left_hand_root'), + 116: + dict( + name='right_thumb1', + id=116, + color=[255, 128, 0], + type='', + swap='left_thumb1'), + 117: + dict( + name='right_thumb2', + id=117, + color=[255, 128, 0], + type='', + swap='left_thumb2'), + 118: + dict( + name='right_thumb3', + id=118, + color=[255, 128, 0], + type='', + swap='left_thumb3'), + 119: + dict( + name='right_thumb4', + id=119, + color=[255, 128, 0], + type='', + swap='left_thumb4'), + 120: + dict( + name='right_forefinger1', + id=120, + color=[255, 153, 255], + type='', + swap='left_forefinger1'), + 121: + dict( + name='right_forefinger2', + id=121, + color=[255, 153, 255], + type='', + swap='left_forefinger2'), + 122: + dict( + name='right_forefinger3', + id=122, + color=[255, 153, 255], + type='', + swap='left_forefinger3'), + 123: + dict( + name='right_forefinger4', + id=123, + color=[255, 153, 255], + type='', + swap='left_forefinger4'), + 124: + dict( + name='right_middle_finger1', + id=124, + color=[102, 178, 255], + type='', + swap='left_middle_finger1'), + 125: + dict( + name='right_middle_finger2', + id=125, + color=[102, 178, 255], + type='', + swap='left_middle_finger2'), + 126: + dict( + name='right_middle_finger3', + id=126, + color=[102, 178, 255], + type='', + swap='left_middle_finger3'), + 127: + dict( + name='right_middle_finger4', + id=127, + color=[102, 178, 255], + type='', + swap='left_middle_finger4'), + 128: + dict( + name='right_ring_finger1', + id=128, + color=[255, 51, 51], + type='', + 
swap='left_ring_finger1'), + 129: + dict( + name='right_ring_finger2', + id=129, + color=[255, 51, 51], + type='', + swap='left_ring_finger2'), + 130: + dict( + name='right_ring_finger3', + id=130, + color=[255, 51, 51], + type='', + swap='left_ring_finger3'), + 131: + dict( + name='right_ring_finger4', + id=131, + color=[255, 51, 51], + type='', + swap='left_ring_finger4'), + 132: + dict( + name='right_pinky_finger1', + id=132, + color=[0, 255, 0], + type='', + swap='left_pinky_finger1'), + 133: + dict( + name='right_pinky_finger2', + id=133, + color=[0, 255, 0], + type='', + swap='left_pinky_finger2'), + 134: + dict( + name='right_pinky_finger3', + id=134, + color=[0, 255, 0], + type='', + swap='left_pinky_finger3'), + 135: + dict( + name='right_pinky_finger4', + id=135, + color=[0, 255, 0], + type='', + swap='left_pinky_finger4') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('left_hip', 'hip'), id=2, color=[0, 255, 0]), + 3: + dict(link=('right_ankle', 'right_knee'), id=3, color=[255, 128, 0]), + 4: + dict(link=('right_knee', 'right_hip'), id=4, color=[255, 128, 0]), + 5: + dict(link=('right_hip', 'hip'), id=5, color=[255, 128, 0]), + 6: + dict(link=('head', 'neck'), id=6, color=[51, 153, 255]), + 7: + dict(link=('neck', 'hip'), id=7, color=[51, 153, 255]), + 8: + dict(link=('neck', 'left_shoulder'), id=8, color=[0, 255, 0]), + 9: + dict(link=('left_shoulder', 'left_elbow'), id=9, color=[0, 255, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('neck', 'right_shoulder'), id=11, color=[255, 128, 0]), + 12: + dict( + link=('right_shoulder', 'right_elbow'), id=12, color=[255, 128, + 0]), + 13: + dict(link=('right_elbow', 'right_wrist'), id=13, color=[255, 128, 0]), + 14: + dict(link=('left_eye', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('nose', 'left_eye'), id=15, color=[51, 153, 255]), + 16: + dict(link=('nose', 'right_eye'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_eye', 'left_ear'), id=17, color=[51, 153, 255]), + 18: + dict(link=('right_eye', 'right_ear'), id=18, color=[51, 153, 255]), + 19: + dict(link=('left_ear', 'left_shoulder'), id=19, color=[51, 153, 255]), + 20: + dict( + link=('right_ear', 'right_shoulder'), id=20, color=[51, 153, 255]), + 21: + dict(link=('left_ankle', 'left_big_toe'), id=21, color=[0, 255, 0]), + 22: + dict(link=('left_ankle', 'left_small_toe'), id=22, color=[0, 255, 0]), + 23: + dict(link=('left_ankle', 'left_heel'), id=23, color=[0, 255, 0]), + 24: + dict( + link=('right_ankle', 'right_big_toe'), id=24, color=[255, 128, 0]), + 25: + dict( + link=('right_ankle', 'right_small_toe'), + id=25, + color=[255, 128, 0]), + 26: + dict(link=('right_ankle', 'right_heel'), id=26, color=[255, 128, 0]), + 27: + dict(link=('left_wrist', 'left_thumb1'), id=27, color=[255, 128, 0]), + 28: + dict(link=('left_thumb1', 'left_thumb2'), id=28, color=[255, 128, 0]), + 29: + dict(link=('left_thumb2', 'left_thumb3'), id=29, color=[255, 128, 0]), + 30: + dict(link=('left_thumb3', 'left_thumb4'), id=30, color=[255, 128, 0]), + 31: + dict( + link=('left_wrist', 'left_forefinger1'), + id=31, + color=[255, 153, 255]), + 32: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=32, + color=[255, 153, 255]), + 33: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=33, + color=[255, 153, 255]), + 34: + dict( + link=('left_forefinger3', 
'left_forefinger4'), + id=34, + color=[255, 153, 255]), + 35: + dict( + link=('left_wrist', 'left_middle_finger1'), + id=35, + color=[102, 178, 255]), + 36: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=36, + color=[102, 178, 255]), + 37: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=37, + color=[102, 178, 255]), + 38: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=38, + color=[102, 178, 255]), + 39: + dict( + link=('left_wrist', 'left_ring_finger1'), + id=39, + color=[255, 51, 51]), + 40: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=40, + color=[255, 51, 51]), + 41: + dict( + link=('left_ring_finger2', 'left_ring_finger3'), + id=41, + color=[255, 51, 51]), + 42: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=42, + color=[255, 51, 51]), + 43: + dict( + link=('left_wrist', 'left_pinky_finger1'), + id=43, + color=[0, 255, 0]), + 44: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=44, + color=[0, 255, 0]), + 45: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=45, + color=[0, 255, 0]), + 46: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=46, + color=[0, 255, 0]), + 47: + dict(link=('right_wrist', 'right_thumb1'), id=47, color=[255, 128, 0]), + 48: + dict( + link=('right_thumb1', 'right_thumb2'), id=48, color=[255, 128, 0]), + 49: + dict( + link=('right_thumb2', 'right_thumb3'), id=49, color=[255, 128, 0]), + 50: + dict( + link=('right_thumb3', 'right_thumb4'), id=50, color=[255, 128, 0]), + 51: + dict( + link=('right_wrist', 'right_forefinger1'), + id=51, + color=[255, 153, 255]), + 52: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=52, + color=[255, 153, 255]), + 53: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=53, + color=[255, 153, 255]), + 54: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=54, + color=[255, 153, 255]), + 55: + dict( + link=('right_wrist', 'right_middle_finger1'), + id=55, + color=[102, 178, 255]), + 56: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=56, + color=[102, 178, 255]), + 57: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=57, + color=[102, 178, 255]), + 58: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=58, + color=[102, 178, 255]), + 59: + dict( + link=('right_wrist', 'right_ring_finger1'), + id=59, + color=[255, 51, 51]), + 60: + dict( + link=('right_ring_finger1', 'right_ring_finger2'), + id=60, + color=[255, 51, 51]), + 61: + dict( + link=('right_ring_finger2', 'right_ring_finger3'), + id=61, + color=[255, 51, 51]), + 62: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=62, + color=[255, 51, 51]), + 63: + dict( + link=('right_wrist', 'right_pinky_finger1'), + id=63, + color=[0, 255, 0]), + 64: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=64, + color=[0, 255, 0]), + 65: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=65, + color=[0, 255, 0]), + 66: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=66, + color=[0, 255, 0]) + }, + joint_weights=[1.] 
* 136, + + # 'https://github.com/Fang-Haoshu/Halpe-FullBody/blob/master/' + # 'HalpeCOCOAPI/PythonAPI/halpecocotools/cocoeval.py#L245' + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.08, 0.08, 0.08, + 0.089, 0.089, 0.089, 0.089, 0.089, 0.089, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015 + ]) diff --git a/configs/_base_/datasets/horse10.py b/configs/_base_/datasets/horse10.py new file mode 100644 index 0000000..a485bf1 --- /dev/null +++ b/configs/_base_/datasets/horse10.py @@ -0,0 +1,201 @@ +dataset_info = dict( + dataset_name='horse10', + paper_info=dict( + author='Mathis, Alexander and Biasi, Thomas and ' + 'Schneider, Steffen and ' + 'Yuksekgonul, Mert and Rogers, Byron and ' + 'Bethge, Matthias and ' + 'Mathis, Mackenzie W', + title='Pretraining boosts out-of-domain robustness ' + 'for pose estimation', + container='Proceedings of the IEEE/CVF Winter Conference on ' + 'Applications of Computer Vision', + year='2021', + homepage='http://www.mackenziemathislab.org/horse10', + ), + keypoint_info={ + 0: + dict(name='Nose', id=0, color=[255, 153, 255], type='upper', swap=''), + 1: + dict(name='Eye', id=1, color=[255, 153, 255], type='upper', swap=''), + 2: + dict( + name='Nearknee', + id=2, + color=[255, 102, 255], + type='upper', + swap=''), + 3: + dict( + name='Nearfrontfetlock', + id=3, + color=[255, 102, 255], + type='upper', + swap=''), + 4: + dict( + name='Nearfrontfoot', + id=4, + color=[255, 102, 255], + type='upper', + swap=''), + 5: + dict( + name='Offknee', id=5, color=[255, 102, 255], type='upper', + swap=''), + 6: + dict( + name='Offfrontfetlock', + id=6, + color=[255, 102, 255], + type='upper', + swap=''), + 7: + dict( + name='Offfrontfoot', + id=7, + color=[255, 102, 255], + type='upper', + swap=''), + 8: + dict( + name='Shoulder', + id=8, + color=[255, 153, 255], + type='upper', + swap=''), + 9: + dict( + name='Midshoulder', + id=9, + color=[255, 153, 255], + type='upper', + swap=''), + 10: + dict( + name='Elbow', id=10, color=[255, 153, 255], type='upper', swap=''), + 11: + dict( + name='Girth', id=11, color=[255, 153, 255], type='upper', swap=''), + 12: + dict( + name='Wither', id=12, color=[255, 153, 255], type='upper', + swap=''), + 13: + dict( + name='Nearhindhock', + id=13, + color=[255, 51, 255], + type='lower', + swap=''), + 14: + dict( + name='Nearhindfetlock', + id=14, + color=[255, 51, 255], + type='lower', + swap=''), + 15: + dict( + name='Nearhindfoot', + id=15, + color=[255, 51, 255], + type='lower', + swap=''), + 16: + dict(name='Hip', id=16, color=[255, 153, 255], type='lower', swap=''), + 17: + dict( + name='Stifle', id=17, color=[255, 153, 255], type='lower', + swap=''), + 18: + dict( + name='Offhindhock', + id=18, + color=[255, 51, 255], + 
type='lower', + swap=''), + 19: + dict( + name='Offhindfetlock', + id=19, + color=[255, 51, 255], + type='lower', + swap=''), + 20: + dict( + name='Offhindfoot', + id=20, + color=[255, 51, 255], + type='lower', + swap=''), + 21: + dict( + name='Ischium', + id=21, + color=[255, 153, 255], + type='lower', + swap='') + }, + skeleton_info={ + 0: + dict(link=('Nose', 'Eye'), id=0, color=[255, 153, 255]), + 1: + dict(link=('Eye', 'Wither'), id=1, color=[255, 153, 255]), + 2: + dict(link=('Wither', 'Hip'), id=2, color=[255, 153, 255]), + 3: + dict(link=('Hip', 'Ischium'), id=3, color=[255, 153, 255]), + 4: + dict(link=('Ischium', 'Stifle'), id=4, color=[255, 153, 255]), + 5: + dict(link=('Stifle', 'Girth'), id=5, color=[255, 153, 255]), + 6: + dict(link=('Girth', 'Elbow'), id=6, color=[255, 153, 255]), + 7: + dict(link=('Elbow', 'Shoulder'), id=7, color=[255, 153, 255]), + 8: + dict(link=('Shoulder', 'Midshoulder'), id=8, color=[255, 153, 255]), + 9: + dict(link=('Midshoulder', 'Wither'), id=9, color=[255, 153, 255]), + 10: + dict( + link=('Nearknee', 'Nearfrontfetlock'), + id=10, + color=[255, 102, 255]), + 11: + dict( + link=('Nearfrontfetlock', 'Nearfrontfoot'), + id=11, + color=[255, 102, 255]), + 12: + dict( + link=('Offknee', 'Offfrontfetlock'), id=12, color=[255, 102, 255]), + 13: + dict( + link=('Offfrontfetlock', 'Offfrontfoot'), + id=13, + color=[255, 102, 255]), + 14: + dict( + link=('Nearhindhock', 'Nearhindfetlock'), + id=14, + color=[255, 51, 255]), + 15: + dict( + link=('Nearhindfetlock', 'Nearhindfoot'), + id=15, + color=[255, 51, 255]), + 16: + dict( + link=('Offhindhock', 'Offhindfetlock'), + id=16, + color=[255, 51, 255]), + 17: + dict( + link=('Offhindfetlock', 'Offhindfoot'), + id=17, + color=[255, 51, 255]) + }, + joint_weights=[1.] * 22, + sigmas=[]) diff --git a/configs/_base_/datasets/interhand2d.py b/configs/_base_/datasets/interhand2d.py new file mode 100644 index 0000000..0134f07 --- /dev/null +++ b/configs/_base_/datasets/interhand2d.py @@ -0,0 +1,142 @@ +dataset_info = dict( + dataset_name='interhand2d', + paper_info=dict( + author='Moon, Gyeongsik and Yu, Shoou-I and Wen, He and ' + 'Shiratori, Takaaki and Lee, Kyoung Mu', + title='InterHand2.6M: A dataset and baseline for 3D ' + 'interacting hand pose estimation from a single RGB image', + container='arXiv', + year='2020', + homepage='https://mks0601.github.io/InterHand2.6M/', + ), + keypoint_info={ + 0: + dict(name='thumb4', id=0, color=[255, 128, 0], type='', swap=''), + 1: + dict(name='thumb3', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb1', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict( + name='forefinger4', id=4, color=[255, 153, 255], type='', swap=''), + 5: + dict( + name='forefinger3', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger1', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='middle_finger4', + id=8, + color=[102, 178, 255], + type='', + swap=''), + 9: + dict( + name='middle_finger3', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger1', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='ring_finger4', id=12, color=[255, 51, 51], type='', swap=''), + 13: + dict( + name='ring_finger3', id=13, 
color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger1', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict(name='pinky_finger4', id=16, color=[0, 255, 0], type='', swap=''), + 17: + dict(name='pinky_finger3', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger1', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='wrist', id=20, color=[255, 255, 255], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] 
* 21, + sigmas=[]) diff --git a/configs/_base_/datasets/interhand3d.py b/configs/_base_/datasets/interhand3d.py new file mode 100644 index 0000000..e2bd812 --- /dev/null +++ b/configs/_base_/datasets/interhand3d.py @@ -0,0 +1,487 @@ +dataset_info = dict( + dataset_name='interhand3d', + paper_info=dict( + author='Moon, Gyeongsik and Yu, Shoou-I and Wen, He and ' + 'Shiratori, Takaaki and Lee, Kyoung Mu', + title='InterHand2.6M: A dataset and baseline for 3D ' + 'interacting hand pose estimation from a single RGB image', + container='arXiv', + year='2020', + homepage='https://mks0601.github.io/InterHand2.6M/', + ), + keypoint_info={ + 0: + dict( + name='right_thumb4', + id=0, + color=[255, 128, 0], + type='', + swap='left_thumb4'), + 1: + dict( + name='right_thumb3', + id=1, + color=[255, 128, 0], + type='', + swap='left_thumb3'), + 2: + dict( + name='right_thumb2', + id=2, + color=[255, 128, 0], + type='', + swap='left_thumb2'), + 3: + dict( + name='right_thumb1', + id=3, + color=[255, 128, 0], + type='', + swap='left_thumb1'), + 4: + dict( + name='right_forefinger4', + id=4, + color=[255, 153, 255], + type='', + swap='left_forefinger4'), + 5: + dict( + name='right_forefinger3', + id=5, + color=[255, 153, 255], + type='', + swap='left_forefinger3'), + 6: + dict( + name='right_forefinger2', + id=6, + color=[255, 153, 255], + type='', + swap='left_forefinger2'), + 7: + dict( + name='right_forefinger1', + id=7, + color=[255, 153, 255], + type='', + swap='left_forefinger1'), + 8: + dict( + name='right_middle_finger4', + id=8, + color=[102, 178, 255], + type='', + swap='left_middle_finger4'), + 9: + dict( + name='right_middle_finger3', + id=9, + color=[102, 178, 255], + type='', + swap='left_middle_finger3'), + 10: + dict( + name='right_middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap='left_middle_finger2'), + 11: + dict( + name='right_middle_finger1', + id=11, + color=[102, 178, 255], + type='', + swap='left_middle_finger1'), + 12: + dict( + name='right_ring_finger4', + id=12, + color=[255, 51, 51], + type='', + swap='left_ring_finger4'), + 13: + dict( + name='right_ring_finger3', + id=13, + color=[255, 51, 51], + type='', + swap='left_ring_finger3'), + 14: + dict( + name='right_ring_finger2', + id=14, + color=[255, 51, 51], + type='', + swap='left_ring_finger2'), + 15: + dict( + name='right_ring_finger1', + id=15, + color=[255, 51, 51], + type='', + swap='left_ring_finger1'), + 16: + dict( + name='right_pinky_finger4', + id=16, + color=[0, 255, 0], + type='', + swap='left_pinky_finger4'), + 17: + dict( + name='right_pinky_finger3', + id=17, + color=[0, 255, 0], + type='', + swap='left_pinky_finger3'), + 18: + dict( + name='right_pinky_finger2', + id=18, + color=[0, 255, 0], + type='', + swap='left_pinky_finger2'), + 19: + dict( + name='right_pinky_finger1', + id=19, + color=[0, 255, 0], + type='', + swap='left_pinky_finger1'), + 20: + dict( + name='right_wrist', + id=20, + color=[255, 255, 255], + type='', + swap='left_wrist'), + 21: + dict( + name='left_thumb4', + id=21, + color=[255, 128, 0], + type='', + swap='right_thumb4'), + 22: + dict( + name='left_thumb3', + id=22, + color=[255, 128, 0], + type='', + swap='right_thumb3'), + 23: + dict( + name='left_thumb2', + id=23, + color=[255, 128, 0], + type='', + swap='right_thumb2'), + 24: + dict( + name='left_thumb1', + id=24, + color=[255, 128, 0], + type='', + swap='right_thumb1'), + 25: + dict( + name='left_forefinger4', + id=25, + color=[255, 153, 255], + type='', + swap='right_forefinger4'), + 26: + dict( + 
name='left_forefinger3', + id=26, + color=[255, 153, 255], + type='', + swap='right_forefinger3'), + 27: + dict( + name='left_forefinger2', + id=27, + color=[255, 153, 255], + type='', + swap='right_forefinger2'), + 28: + dict( + name='left_forefinger1', + id=28, + color=[255, 153, 255], + type='', + swap='right_forefinger1'), + 29: + dict( + name='left_middle_finger4', + id=29, + color=[102, 178, 255], + type='', + swap='right_middle_finger4'), + 30: + dict( + name='left_middle_finger3', + id=30, + color=[102, 178, 255], + type='', + swap='right_middle_finger3'), + 31: + dict( + name='left_middle_finger2', + id=31, + color=[102, 178, 255], + type='', + swap='right_middle_finger2'), + 32: + dict( + name='left_middle_finger1', + id=32, + color=[102, 178, 255], + type='', + swap='right_middle_finger1'), + 33: + dict( + name='left_ring_finger4', + id=33, + color=[255, 51, 51], + type='', + swap='right_ring_finger4'), + 34: + dict( + name='left_ring_finger3', + id=34, + color=[255, 51, 51], + type='', + swap='right_ring_finger3'), + 35: + dict( + name='left_ring_finger2', + id=35, + color=[255, 51, 51], + type='', + swap='right_ring_finger2'), + 36: + dict( + name='left_ring_finger1', + id=36, + color=[255, 51, 51], + type='', + swap='right_ring_finger1'), + 37: + dict( + name='left_pinky_finger4', + id=37, + color=[0, 255, 0], + type='', + swap='right_pinky_finger4'), + 38: + dict( + name='left_pinky_finger3', + id=38, + color=[0, 255, 0], + type='', + swap='right_pinky_finger3'), + 39: + dict( + name='left_pinky_finger2', + id=39, + color=[0, 255, 0], + type='', + swap='right_pinky_finger2'), + 40: + dict( + name='left_pinky_finger1', + id=40, + color=[0, 255, 0], + type='', + swap='right_pinky_finger1'), + 41: + dict( + name='left_wrist', + id=41, + color=[255, 255, 255], + type='', + swap='right_wrist'), + }, + skeleton_info={ + 0: + dict(link=('right_wrist', 'right_thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_thumb1', 'right_thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_thumb2', 'right_thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_thumb3', 'right_thumb4'), id=3, color=[255, 128, 0]), + 4: + dict( + link=('right_wrist', 'right_forefinger1'), + id=4, + color=[255, 153, 255]), + 5: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=5, + color=[255, 153, 255]), + 6: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=6, + color=[255, 153, 255]), + 7: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=7, + color=[255, 153, 255]), + 8: + dict( + link=('right_wrist', 'right_middle_finger1'), + id=8, + color=[102, 178, 255]), + 9: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict( + link=('right_wrist', 'right_ring_finger1'), + id=12, + color=[255, 51, 51]), + 13: + dict( + link=('right_ring_finger1', 'right_ring_finger2'), + id=13, + color=[255, 51, 51]), + 14: + dict( + link=('right_ring_finger2', 'right_ring_finger3'), + id=14, + color=[255, 51, 51]), + 15: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=15, + color=[255, 51, 51]), + 16: + dict( + link=('right_wrist', 'right_pinky_finger1'), + id=16, + color=[0, 255, 0]), + 17: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=17, + color=[0, 255, 
0]), + 18: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=18, + color=[0, 255, 0]), + 19: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=19, + color=[0, 255, 0]), + 20: + dict(link=('left_wrist', 'left_thumb1'), id=20, color=[255, 128, 0]), + 21: + dict(link=('left_thumb1', 'left_thumb2'), id=21, color=[255, 128, 0]), + 22: + dict(link=('left_thumb2', 'left_thumb3'), id=22, color=[255, 128, 0]), + 23: + dict(link=('left_thumb3', 'left_thumb4'), id=23, color=[255, 128, 0]), + 24: + dict( + link=('left_wrist', 'left_forefinger1'), + id=24, + color=[255, 153, 255]), + 25: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=25, + color=[255, 153, 255]), + 26: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=26, + color=[255, 153, 255]), + 27: + dict( + link=('left_forefinger3', 'left_forefinger4'), + id=27, + color=[255, 153, 255]), + 28: + dict( + link=('left_wrist', 'left_middle_finger1'), + id=28, + color=[102, 178, 255]), + 29: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=29, + color=[102, 178, 255]), + 30: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=30, + color=[102, 178, 255]), + 31: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=31, + color=[102, 178, 255]), + 32: + dict( + link=('left_wrist', 'left_ring_finger1'), + id=32, + color=[255, 51, 51]), + 33: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=33, + color=[255, 51, 51]), + 34: + dict( + link=('left_ring_finger2', 'left_ring_finger3'), + id=34, + color=[255, 51, 51]), + 35: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=35, + color=[255, 51, 51]), + 36: + dict( + link=('left_wrist', 'left_pinky_finger1'), + id=36, + color=[0, 255, 0]), + 37: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=37, + color=[0, 255, 0]), + 38: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=38, + color=[0, 255, 0]), + 39: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=39, + color=[0, 255, 0]), + }, + joint_weights=[1.] * 42, + sigmas=[]) diff --git a/configs/_base_/datasets/jhmdb.py b/configs/_base_/datasets/jhmdb.py new file mode 100644 index 0000000..1b37488 --- /dev/null +++ b/configs/_base_/datasets/jhmdb.py @@ -0,0 +1,129 @@ +dataset_info = dict( + dataset_name='jhmdb', + paper_info=dict( + author='H. Jhuang and J. Gall and S. Zuffi and ' + 'C. Schmid and M. J. Black', + title='Towards understanding action recognition', + container='International Conf. 
on Computer Vision (ICCV)', + year='2013', + homepage='http://jhmdb.is.tue.mpg.de/dataset', + ), + keypoint_info={ + 0: + dict(name='neck', id=0, color=[255, 128, 0], type='upper', swap=''), + 1: + dict(name='belly', id=1, color=[255, 128, 0], type='upper', swap=''), + 2: + dict(name='head', id=2, color=[255, 128, 0], type='upper', swap=''), + 3: + dict( + name='right_shoulder', + id=3, + color=[0, 255, 0], + type='upper', + swap='left_shoulder'), + 4: + dict( + name='left_shoulder', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 5: + dict( + name='right_hip', + id=5, + color=[0, 255, 0], + type='lower', + swap='left_hip'), + 6: + dict( + name='left_hip', + id=6, + color=[51, 153, 255], + type='lower', + swap='right_hip'), + 7: + dict( + name='right_elbow', + id=7, + color=[51, 153, 255], + type='upper', + swap='left_elbow'), + 8: + dict( + name='left_elbow', + id=8, + color=[51, 153, 255], + type='upper', + swap='right_elbow'), + 9: + dict( + name='right_knee', + id=9, + color=[51, 153, 255], + type='lower', + swap='left_knee'), + 10: + dict( + name='left_knee', + id=10, + color=[255, 128, 0], + type='lower', + swap='right_knee'), + 11: + dict( + name='right_wrist', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 12: + dict( + name='left_wrist', + id=12, + color=[255, 128, 0], + type='upper', + swap='right_wrist'), + 13: + dict( + name='right_ankle', + id=13, + color=[0, 255, 0], + type='lower', + swap='left_ankle'), + 14: + dict( + name='left_ankle', + id=14, + color=[0, 255, 0], + type='lower', + swap='right_ankle') + }, + skeleton_info={ + 0: dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: dict(link=('right_hip', 'belly'), id=2, color=[255, 128, 0]), + 3: dict(link=('belly', 'left_hip'), id=3, color=[0, 255, 0]), + 4: dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]), + 5: dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]), + 6: dict(link=('belly', 'neck'), id=6, color=[51, 153, 255]), + 7: dict(link=('neck', 'head'), id=7, color=[51, 153, 255]), + 8: dict(link=('neck', 'right_shoulder'), id=8, color=[255, 128, 0]), + 9: dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('right_elbow', 'right_wrist'), id=10, color=[255, 128, 0]), + 11: dict(link=('neck', 'left_shoulder'), id=11, color=[0, 255, 0]), + 12: + dict(link=('left_shoulder', 'left_elbow'), id=12, color=[0, 255, 0]), + 13: dict(link=('left_elbow', 'left_wrist'), id=13, color=[0, 255, 0]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.2, 1.2, 1.5, 1.5, 1.5, 1.5 + ], + # Adapted from COCO dataset. 
+ sigmas=[ + 0.025, 0.107, 0.025, 0.079, 0.079, 0.107, 0.107, 0.072, 0.072, 0.087, + 0.087, 0.062, 0.062, 0.089, 0.089 + ]) diff --git a/configs/_base_/datasets/locust.py b/configs/_base_/datasets/locust.py new file mode 100644 index 0000000..db3fa15 --- /dev/null +++ b/configs/_base_/datasets/locust.py @@ -0,0 +1,263 @@ +dataset_info = dict( + dataset_name='locust', + paper_info=dict( + author='Graving, Jacob M and Chae, Daniel and Naik, Hemal and ' + 'Li, Liang and Koger, Benjamin and Costelloe, Blair R and ' + 'Couzin, Iain D', + title='DeepPoseKit, a software toolkit for fast and robust ' + 'animal pose estimation using deep learning', + container='Elife', + year='2019', + homepage='https://github.com/jgraving/DeepPoseKit-Data', + ), + keypoint_info={ + 0: + dict(name='head', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='neck', id=1, color=[255, 255, 255], type='', swap=''), + 2: + dict(name='thorax', id=2, color=[255, 255, 255], type='', swap=''), + 3: + dict(name='abdomen1', id=3, color=[255, 255, 255], type='', swap=''), + 4: + dict(name='abdomen2', id=4, color=[255, 255, 255], type='', swap=''), + 5: + dict( + name='anttipL', + id=5, + color=[255, 255, 255], + type='', + swap='anttipR'), + 6: + dict( + name='antbaseL', + id=6, + color=[255, 255, 255], + type='', + swap='antbaseR'), + 7: + dict(name='eyeL', id=7, color=[255, 255, 255], type='', swap='eyeR'), + 8: + dict( + name='forelegL1', + id=8, + color=[255, 255, 255], + type='', + swap='forelegR1'), + 9: + dict( + name='forelegL2', + id=9, + color=[255, 255, 255], + type='', + swap='forelegR2'), + 10: + dict( + name='forelegL3', + id=10, + color=[255, 255, 255], + type='', + swap='forelegR3'), + 11: + dict( + name='forelegL4', + id=11, + color=[255, 255, 255], + type='', + swap='forelegR4'), + 12: + dict( + name='midlegL1', + id=12, + color=[255, 255, 255], + type='', + swap='midlegR1'), + 13: + dict( + name='midlegL2', + id=13, + color=[255, 255, 255], + type='', + swap='midlegR2'), + 14: + dict( + name='midlegL3', + id=14, + color=[255, 255, 255], + type='', + swap='midlegR3'), + 15: + dict( + name='midlegL4', + id=15, + color=[255, 255, 255], + type='', + swap='midlegR4'), + 16: + dict( + name='hindlegL1', + id=16, + color=[255, 255, 255], + type='', + swap='hindlegR1'), + 17: + dict( + name='hindlegL2', + id=17, + color=[255, 255, 255], + type='', + swap='hindlegR2'), + 18: + dict( + name='hindlegL3', + id=18, + color=[255, 255, 255], + type='', + swap='hindlegR3'), + 19: + dict( + name='hindlegL4', + id=19, + color=[255, 255, 255], + type='', + swap='hindlegR4'), + 20: + dict( + name='anttipR', + id=20, + color=[255, 255, 255], + type='', + swap='anttipL'), + 21: + dict( + name='antbaseR', + id=21, + color=[255, 255, 255], + type='', + swap='antbaseL'), + 22: + dict(name='eyeR', id=22, color=[255, 255, 255], type='', swap='eyeL'), + 23: + dict( + name='forelegR1', + id=23, + color=[255, 255, 255], + type='', + swap='forelegL1'), + 24: + dict( + name='forelegR2', + id=24, + color=[255, 255, 255], + type='', + swap='forelegL2'), + 25: + dict( + name='forelegR3', + id=25, + color=[255, 255, 255], + type='', + swap='forelegL3'), + 26: + dict( + name='forelegR4', + id=26, + color=[255, 255, 255], + type='', + swap='forelegL4'), + 27: + dict( + name='midlegR1', + id=27, + color=[255, 255, 255], + type='', + swap='midlegL1'), + 28: + dict( + name='midlegR2', + id=28, + color=[255, 255, 255], + type='', + swap='midlegL2'), + 29: + dict( + name='midlegR3', + id=29, + color=[255, 255, 255], + type='', + 
swap='midlegL3'), + 30: + dict( + name='midlegR4', + id=30, + color=[255, 255, 255], + type='', + swap='midlegL4'), + 31: + dict( + name='hindlegR1', + id=31, + color=[255, 255, 255], + type='', + swap='hindlegL1'), + 32: + dict( + name='hindlegR2', + id=32, + color=[255, 255, 255], + type='', + swap='hindlegL2'), + 33: + dict( + name='hindlegR3', + id=33, + color=[255, 255, 255], + type='', + swap='hindlegL3'), + 34: + dict( + name='hindlegR4', + id=34, + color=[255, 255, 255], + type='', + swap='hindlegL4') + }, + skeleton_info={ + 0: dict(link=('neck', 'head'), id=0, color=[255, 255, 255]), + 1: dict(link=('thorax', 'neck'), id=1, color=[255, 255, 255]), + 2: dict(link=('abdomen1', 'thorax'), id=2, color=[255, 255, 255]), + 3: dict(link=('abdomen2', 'abdomen1'), id=3, color=[255, 255, 255]), + 4: dict(link=('antbaseL', 'anttipL'), id=4, color=[255, 255, 255]), + 5: dict(link=('eyeL', 'antbaseL'), id=5, color=[255, 255, 255]), + 6: dict(link=('forelegL2', 'forelegL1'), id=6, color=[255, 255, 255]), + 7: dict(link=('forelegL3', 'forelegL2'), id=7, color=[255, 255, 255]), + 8: dict(link=('forelegL4', 'forelegL3'), id=8, color=[255, 255, 255]), + 9: dict(link=('midlegL2', 'midlegL1'), id=9, color=[255, 255, 255]), + 10: dict(link=('midlegL3', 'midlegL2'), id=10, color=[255, 255, 255]), + 11: dict(link=('midlegL4', 'midlegL3'), id=11, color=[255, 255, 255]), + 12: + dict(link=('hindlegL2', 'hindlegL1'), id=12, color=[255, 255, 255]), + 13: + dict(link=('hindlegL3', 'hindlegL2'), id=13, color=[255, 255, 255]), + 14: + dict(link=('hindlegL4', 'hindlegL3'), id=14, color=[255, 255, 255]), + 15: dict(link=('antbaseR', 'anttipR'), id=15, color=[255, 255, 255]), + 16: dict(link=('eyeR', 'antbaseR'), id=16, color=[255, 255, 255]), + 17: + dict(link=('forelegR2', 'forelegR1'), id=17, color=[255, 255, 255]), + 18: + dict(link=('forelegR3', 'forelegR2'), id=18, color=[255, 255, 255]), + 19: + dict(link=('forelegR4', 'forelegR3'), id=19, color=[255, 255, 255]), + 20: dict(link=('midlegR2', 'midlegR1'), id=20, color=[255, 255, 255]), + 21: dict(link=('midlegR3', 'midlegR2'), id=21, color=[255, 255, 255]), + 22: dict(link=('midlegR4', 'midlegR3'), id=22, color=[255, 255, 255]), + 23: + dict(link=('hindlegR2', 'hindlegR1'), id=23, color=[255, 255, 255]), + 24: + dict(link=('hindlegR3', 'hindlegR2'), id=24, color=[255, 255, 255]), + 25: + dict(link=('hindlegR4', 'hindlegR3'), id=25, color=[255, 255, 255]) + }, + joint_weights=[1.] 
* 35, + sigmas=[]) diff --git a/configs/_base_/datasets/macaque.py b/configs/_base_/datasets/macaque.py new file mode 100644 index 0000000..ea8dac2 --- /dev/null +++ b/configs/_base_/datasets/macaque.py @@ -0,0 +1,183 @@ +dataset_info = dict( + dataset_name='macaque', + paper_info=dict( + author='Labuguen, Rollyn and Matsumoto, Jumpei and ' + 'Negrete, Salvador and Nishimaru, Hiroshi and ' + 'Nishijo, Hisao and Takada, Masahiko and ' + 'Go, Yasuhiro and Inoue, Ken-ichi and Shibata, Tomohiro', + title='MacaquePose: A novel "in the wild" macaque monkey pose dataset ' + 'for markerless motion capture', + container='bioRxiv', + year='2020', + homepage='http://www.pri.kyoto-u.ac.jp/datasets/' + 'macaquepose/index.html', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, 
color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/configs/_base_/datasets/mhp.py b/configs/_base_/datasets/mhp.py new file mode 100644 index 0000000..e16e37c --- /dev/null +++ b/configs/_base_/datasets/mhp.py @@ -0,0 +1,156 @@ +dataset_info = dict( + dataset_name='mhp', + paper_info=dict( + author='Zhao, Jian and Li, Jianshu and Cheng, Yu and ' + 'Sim, Terence and Yan, Shuicheng and Feng, Jiashi', + title='Understanding humans in crowded scenes: ' + 'Deep nested adversarial learning and a ' + 'new benchmark for multi-human parsing', + container='Proceedings of the 26th ACM ' + 'international conference on Multimedia', + year='2018', + homepage='https://lv-mhp.github.io/dataset', + ), + keypoint_info={ + 0: + dict( + name='right_ankle', + id=0, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 1: + dict( + name='right_knee', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 2: + dict( + name='right_hip', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 3: + dict( + name='left_hip', + id=3, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 4: + dict( + name='left_knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 5: + dict( + name='left_ankle', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 6: + dict(name='pelvis', id=6, color=[51, 153, 255], type='lower', swap=''), + 7: + dict(name='thorax', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict( + name='upper_neck', + id=8, + color=[51, 153, 255], + type='upper', + swap=''), + 9: + dict( + name='head_top', id=9, color=[51, 153, 255], type='upper', + swap=''), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='right_elbow', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 12: + dict( + name='right_shoulder', + id=12, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 13: + dict( + name='left_shoulder', + id=13, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 14: + dict( + name='left_elbow', + id=14, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 15: + dict( + name='left_wrist', + id=15, + color=[0, 255, 0], + type='upper', + swap='right_wrist') + }, + skeleton_info={ + 0: + dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_hip', 'pelvis'), id=2, color=[255, 128, 0]), + 3: + dict(link=('pelvis', 'left_hip'), id=3, color=[0, 255, 0]), + 4: + dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]), + 5: + dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]), + 6: + dict(link=('pelvis', 'thorax'), id=6, color=[51, 153, 255]), + 7: + dict(link=('thorax', 'upper_neck'), id=7, color=[51, 153, 255]), + 8: + dict(link=('upper_neck', 'head_top'), id=8, 
color=[51, 153, 255]), + 9: + dict(link=('upper_neck', 'right_shoulder'), id=9, color=[255, 128, 0]), + 10: + dict( + link=('right_shoulder', 'right_elbow'), id=10, color=[255, 128, + 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('upper_neck', 'left_shoulder'), id=12, color=[0, 255, 0]), + 13: + dict(link=('left_shoulder', 'left_elbow'), id=13, color=[0, 255, 0]), + 14: + dict(link=('left_elbow', 'left_wrist'), id=14, color=[0, 255, 0]) + }, + joint_weights=[ + 1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5 + ], + # Adapted from COCO dataset. + sigmas=[ + 0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026, + 0.062, 0.072, 0.179, 0.179, 0.072, 0.062 + ]) diff --git a/configs/_base_/datasets/mpi_inf_3dhp.py b/configs/_base_/datasets/mpi_inf_3dhp.py new file mode 100644 index 0000000..ffd0a70 --- /dev/null +++ b/configs/_base_/datasets/mpi_inf_3dhp.py @@ -0,0 +1,132 @@ +dataset_info = dict( + dataset_name='mpi_inf_3dhp', + paper_info=dict( + author='ehta, Dushyant and Rhodin, Helge and Casas, Dan and ' + 'Fua, Pascal and Sotnychenko, Oleksandr and Xu, Weipeng and ' + 'Theobalt, Christian', + title='Monocular 3D Human Pose Estimation In The Wild Using Improved ' + 'CNN Supervision', + container='2017 international conference on 3D vision (3DV)', + year='2017', + homepage='http://gvv.mpi-inf.mpg.de/3dhp-dataset', + ), + keypoint_info={ + 0: + dict( + name='head_top', id=0, color=[51, 153, 255], type='upper', + swap=''), + 1: + dict(name='neck', id=1, color=[51, 153, 255], type='upper', swap=''), + 2: + dict( + name='right_shoulder', + id=2, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 3: + dict( + name='right_elbow', + id=3, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 4: + dict( + name='right_wrist', + id=4, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='left_elbow', + id=6, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 7: + dict( + name='left_wrist', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 8: + dict( + name='right_hip', + id=8, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 9: + dict( + name='right_knee', + id=9, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 10: + dict( + name='right_ankle', + id=10, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='left_knee', + id=12, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 13: + dict( + name='left_ankle', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 14: + dict(name='root', id=14, color=[51, 153, 255], type='lower', swap=''), + 15: + dict(name='spine', id=15, color=[51, 153, 255], type='upper', swap=''), + 16: + dict(name='head', id=16, color=[51, 153, 255], type='upper', swap='') + }, + skeleton_info={ + 0: dict(link=('neck', 'right_shoulder'), id=0, color=[255, 128, 0]), + 1: dict( + link=('right_shoulder', 'right_elbow'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_elbow', 'right_wrist'), id=2, color=[255, 128, 0]), + 3: dict(link=('neck', 'left_shoulder'), id=3, color=[0, 255, 0]), + 4: dict(link=('left_shoulder', 'left_elbow'), id=4, color=[0, 255, 0]), + 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 
255, 0]), + 6: dict(link=('root', 'right_hip'), id=6, color=[255, 128, 0]), + 7: dict(link=('right_hip', 'right_knee'), id=7, color=[255, 128, 0]), + 8: dict(link=('right_knee', 'right_ankle'), id=8, color=[255, 128, 0]), + 9: dict(link=('root', 'left_hip'), id=9, color=[0, 255, 0]), + 10: dict(link=('left_hip', 'left_knee'), id=10, color=[0, 255, 0]), + 11: dict(link=('left_knee', 'left_ankle'), id=11, color=[0, 255, 0]), + 12: dict(link=('head_top', 'head'), id=12, color=[51, 153, 255]), + 13: dict(link=('head', 'neck'), id=13, color=[51, 153, 255]), + 14: dict(link=('neck', 'spine'), id=14, color=[51, 153, 255]), + 15: dict(link=('spine', 'root'), id=15, color=[51, 153, 255]) + }, + joint_weights=[1.] * 17, + sigmas=[]) diff --git a/configs/_base_/datasets/mpii.py b/configs/_base_/datasets/mpii.py new file mode 100644 index 0000000..6c2a491 --- /dev/null +++ b/configs/_base_/datasets/mpii.py @@ -0,0 +1,155 @@ +dataset_info = dict( + dataset_name='mpii', + paper_info=dict( + author='Mykhaylo Andriluka and Leonid Pishchulin and ' + 'Peter Gehler and Schiele, Bernt', + title='2D Human Pose Estimation: New Benchmark and ' + 'State of the Art Analysis', + container='IEEE Conference on Computer Vision and ' + 'Pattern Recognition (CVPR)', + year='2014', + homepage='http://human-pose.mpi-inf.mpg.de/', + ), + keypoint_info={ + 0: + dict( + name='right_ankle', + id=0, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 1: + dict( + name='right_knee', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 2: + dict( + name='right_hip', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 3: + dict( + name='left_hip', + id=3, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 4: + dict( + name='left_knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 5: + dict( + name='left_ankle', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 6: + dict(name='pelvis', id=6, color=[51, 153, 255], type='lower', swap=''), + 7: + dict(name='thorax', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict( + name='upper_neck', + id=8, + color=[51, 153, 255], + type='upper', + swap=''), + 9: + dict( + name='head_top', id=9, color=[51, 153, 255], type='upper', + swap=''), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='right_elbow', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 12: + dict( + name='right_shoulder', + id=12, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 13: + dict( + name='left_shoulder', + id=13, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 14: + dict( + name='left_elbow', + id=14, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 15: + dict( + name='left_wrist', + id=15, + color=[0, 255, 0], + type='upper', + swap='right_wrist') + }, + skeleton_info={ + 0: + dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_hip', 'pelvis'), id=2, color=[255, 128, 0]), + 3: + dict(link=('pelvis', 'left_hip'), id=3, color=[0, 255, 0]), + 4: + dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]), + 5: + dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]), + 6: + dict(link=('pelvis', 'thorax'), id=6, color=[51, 153, 255]), + 7: + dict(link=('thorax', 'upper_neck'), id=7, color=[51, 153, 255]), + 8: + dict(link=('upper_neck', 
'head_top'), id=8, color=[51, 153, 255]), + 9: + dict(link=('upper_neck', 'right_shoulder'), id=9, color=[255, 128, 0]), + 10: + dict( + link=('right_shoulder', 'right_elbow'), id=10, color=[255, 128, + 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('upper_neck', 'left_shoulder'), id=12, color=[0, 255, 0]), + 13: + dict(link=('left_shoulder', 'left_elbow'), id=13, color=[0, 255, 0]), + 14: + dict(link=('left_elbow', 'left_wrist'), id=14, color=[0, 255, 0]) + }, + joint_weights=[ + 1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5 + ], + # Adapted from COCO dataset. + sigmas=[ + 0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026, + 0.062, 0.072, 0.179, 0.179, 0.072, 0.062 + ]) diff --git a/configs/_base_/datasets/mpii_info.py b/configs/_base_/datasets/mpii_info.py new file mode 100644 index 0000000..8090992 --- /dev/null +++ b/configs/_base_/datasets/mpii_info.py @@ -0,0 +1,155 @@ +mpii_info = dict( + dataset_name='mpii', + paper_info=dict( + author='Mykhaylo Andriluka and Leonid Pishchulin and ' + 'Peter Gehler and Schiele, Bernt', + title='2D Human Pose Estimation: New Benchmark and ' + 'State of the Art Analysis', + container='IEEE Conference on Computer Vision and ' + 'Pattern Recognition (CVPR)', + year='2014', + homepage='http://human-pose.mpi-inf.mpg.de/', + ), + keypoint_info={ + 0: + dict( + name='right_ankle', + id=0, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 1: + dict( + name='right_knee', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 2: + dict( + name='right_hip', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 3: + dict( + name='left_hip', + id=3, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 4: + dict( + name='left_knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 5: + dict( + name='left_ankle', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 6: + dict(name='pelvis', id=6, color=[51, 153, 255], type='lower', swap=''), + 7: + dict(name='thorax', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict( + name='upper_neck', + id=8, + color=[51, 153, 255], + type='upper', + swap=''), + 9: + dict( + name='head_top', id=9, color=[51, 153, 255], type='upper', + swap=''), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='right_elbow', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 12: + dict( + name='right_shoulder', + id=12, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 13: + dict( + name='left_shoulder', + id=13, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 14: + dict( + name='left_elbow', + id=14, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 15: + dict( + name='left_wrist', + id=15, + color=[0, 255, 0], + type='upper', + swap='right_wrist') + }, + skeleton_info={ + 0: + dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_hip', 'pelvis'), id=2, color=[255, 128, 0]), + 3: + dict(link=('pelvis', 'left_hip'), id=3, color=[0, 255, 0]), + 4: + dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]), + 5: + dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]), + 6: + dict(link=('pelvis', 'thorax'), id=6, color=[51, 153, 255]), + 7: + dict(link=('thorax', 'upper_neck'), id=7, 
color=[51, 153, 255]), + 8: + dict(link=('upper_neck', 'head_top'), id=8, color=[51, 153, 255]), + 9: + dict(link=('upper_neck', 'right_shoulder'), id=9, color=[255, 128, 0]), + 10: + dict( + link=('right_shoulder', 'right_elbow'), id=10, color=[255, 128, + 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('upper_neck', 'left_shoulder'), id=12, color=[0, 255, 0]), + 13: + dict(link=('left_shoulder', 'left_elbow'), id=13, color=[0, 255, 0]), + 14: + dict(link=('left_elbow', 'left_wrist'), id=14, color=[0, 255, 0]) + }, + joint_weights=[ + 1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5 + ], + # Adapted from COCO dataset. + sigmas=[ + 0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026, + 0.062, 0.072, 0.179, 0.179, 0.072, 0.062 + ]) diff --git a/configs/_base_/datasets/mpii_trb.py b/configs/_base_/datasets/mpii_trb.py new file mode 100644 index 0000000..73940d4 --- /dev/null +++ b/configs/_base_/datasets/mpii_trb.py @@ -0,0 +1,380 @@ +dataset_info = dict( + dataset_name='mpii_trb', + paper_info=dict( + author='Duan, Haodong and Lin, Kwan-Yee and Jin, Sheng and ' + 'Liu, Wentao and Qian, Chen and Ouyang, Wanli', + title='TRB: A Novel Triplet Representation for ' + 'Understanding 2D Human Body', + container='Proceedings of the IEEE International ' + 'Conference on Computer Vision', + year='2019', + homepage='https://github.com/kennymckormick/' + 'Triplet-Representation-of-human-Body', + ), + keypoint_info={ + 0: + dict( + name='left_shoulder', + id=0, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 1: + dict( + name='right_shoulder', + id=1, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 2: + dict( + name='left_elbow', + id=2, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 3: + dict( + name='right_elbow', + id=3, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 4: + dict( + name='left_wrist', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 5: + dict( + name='right_wrist', + id=5, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 6: + dict( + name='left_hip', + id=6, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 7: + dict( + name='right_hip', + id=7, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 8: + dict( + name='left_knee', + id=8, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 9: + dict( + name='right_knee', + id=9, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 10: + dict( + name='left_ankle', + id=10, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 11: + dict( + name='right_ankle', + id=11, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 12: + dict(name='head', id=12, color=[51, 153, 255], type='upper', swap=''), + 13: + dict(name='neck', id=13, color=[51, 153, 255], type='upper', swap=''), + 14: + dict( + name='right_neck', + id=14, + color=[255, 255, 255], + type='upper', + swap='left_neck'), + 15: + dict( + name='left_neck', + id=15, + color=[255, 255, 255], + type='upper', + swap='right_neck'), + 16: + dict( + name='medial_right_shoulder', + id=16, + color=[255, 255, 255], + type='upper', + swap='medial_left_shoulder'), + 17: + dict( + name='lateral_right_shoulder', + id=17, + color=[255, 255, 255], + type='upper', + swap='lateral_left_shoulder'), + 18: + dict( + name='medial_right_bow', + id=18, + color=[255, 255, 255], + type='upper', + swap='medial_left_bow'), + 19: + dict( + name='lateral_right_bow', + 
id=19, + color=[255, 255, 255], + type='upper', + swap='lateral_left_bow'), + 20: + dict( + name='medial_right_wrist', + id=20, + color=[255, 255, 255], + type='upper', + swap='medial_left_wrist'), + 21: + dict( + name='lateral_right_wrist', + id=21, + color=[255, 255, 255], + type='upper', + swap='lateral_left_wrist'), + 22: + dict( + name='medial_left_shoulder', + id=22, + color=[255, 255, 255], + type='upper', + swap='medial_right_shoulder'), + 23: + dict( + name='lateral_left_shoulder', + id=23, + color=[255, 255, 255], + type='upper', + swap='lateral_right_shoulder'), + 24: + dict( + name='medial_left_bow', + id=24, + color=[255, 255, 255], + type='upper', + swap='medial_right_bow'), + 25: + dict( + name='lateral_left_bow', + id=25, + color=[255, 255, 255], + type='upper', + swap='lateral_right_bow'), + 26: + dict( + name='medial_left_wrist', + id=26, + color=[255, 255, 255], + type='upper', + swap='medial_right_wrist'), + 27: + dict( + name='lateral_left_wrist', + id=27, + color=[255, 255, 255], + type='upper', + swap='lateral_right_wrist'), + 28: + dict( + name='medial_right_hip', + id=28, + color=[255, 255, 255], + type='lower', + swap='medial_left_hip'), + 29: + dict( + name='lateral_right_hip', + id=29, + color=[255, 255, 255], + type='lower', + swap='lateral_left_hip'), + 30: + dict( + name='medial_right_knee', + id=30, + color=[255, 255, 255], + type='lower', + swap='medial_left_knee'), + 31: + dict( + name='lateral_right_knee', + id=31, + color=[255, 255, 255], + type='lower', + swap='lateral_left_knee'), + 32: + dict( + name='medial_right_ankle', + id=32, + color=[255, 255, 255], + type='lower', + swap='medial_left_ankle'), + 33: + dict( + name='lateral_right_ankle', + id=33, + color=[255, 255, 255], + type='lower', + swap='lateral_left_ankle'), + 34: + dict( + name='medial_left_hip', + id=34, + color=[255, 255, 255], + type='lower', + swap='medial_right_hip'), + 35: + dict( + name='lateral_left_hip', + id=35, + color=[255, 255, 255], + type='lower', + swap='lateral_right_hip'), + 36: + dict( + name='medial_left_knee', + id=36, + color=[255, 255, 255], + type='lower', + swap='medial_right_knee'), + 37: + dict( + name='lateral_left_knee', + id=37, + color=[255, 255, 255], + type='lower', + swap='lateral_right_knee'), + 38: + dict( + name='medial_left_ankle', + id=38, + color=[255, 255, 255], + type='lower', + swap='medial_right_ankle'), + 39: + dict( + name='lateral_left_ankle', + id=39, + color=[255, 255, 255], + type='lower', + swap='lateral_right_ankle'), + }, + skeleton_info={ + 0: + dict(link=('head', 'neck'), id=0, color=[51, 153, 255]), + 1: + dict(link=('neck', 'left_shoulder'), id=1, color=[51, 153, 255]), + 2: + dict(link=('neck', 'right_shoulder'), id=2, color=[51, 153, 255]), + 3: + dict(link=('left_shoulder', 'left_elbow'), id=3, color=[0, 255, 0]), + 4: + dict( + link=('right_shoulder', 'right_elbow'), id=4, color=[255, 128, 0]), + 5: + dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]), + 6: + dict(link=('right_elbow', 'right_wrist'), id=6, color=[255, 128, 0]), + 7: + dict(link=('left_shoulder', 'left_hip'), id=7, color=[51, 153, 255]), + 8: + dict(link=('right_shoulder', 'right_hip'), id=8, color=[51, 153, 255]), + 9: + dict(link=('left_hip', 'right_hip'), id=9, color=[51, 153, 255]), + 10: + dict(link=('left_hip', 'left_knee'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_hip', 'right_knee'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_knee', 'left_ankle'), id=12, color=[0, 255, 0]), + 13: + dict(link=('right_knee', 
'right_ankle'), id=13, color=[255, 128, 0]), + 14: + dict(link=('right_neck', 'left_neck'), id=14, color=[255, 255, 255]), + 15: + dict( + link=('medial_right_shoulder', 'lateral_right_shoulder'), + id=15, + color=[255, 255, 255]), + 16: + dict( + link=('medial_right_bow', 'lateral_right_bow'), + id=16, + color=[255, 255, 255]), + 17: + dict( + link=('medial_right_wrist', 'lateral_right_wrist'), + id=17, + color=[255, 255, 255]), + 18: + dict( + link=('medial_left_shoulder', 'lateral_left_shoulder'), + id=18, + color=[255, 255, 255]), + 19: + dict( + link=('medial_left_bow', 'lateral_left_bow'), + id=19, + color=[255, 255, 255]), + 20: + dict( + link=('medial_left_wrist', 'lateral_left_wrist'), + id=20, + color=[255, 255, 255]), + 21: + dict( + link=('medial_right_hip', 'lateral_right_hip'), + id=21, + color=[255, 255, 255]), + 22: + dict( + link=('medial_right_knee', 'lateral_right_knee'), + id=22, + color=[255, 255, 255]), + 23: + dict( + link=('medial_right_ankle', 'lateral_right_ankle'), + id=23, + color=[255, 255, 255]), + 24: + dict( + link=('medial_left_hip', 'lateral_left_hip'), + id=24, + color=[255, 255, 255]), + 25: + dict( + link=('medial_left_knee', 'lateral_left_knee'), + id=25, + color=[255, 255, 255]), + 26: + dict( + link=('medial_left_ankle', 'lateral_left_ankle'), + id=26, + color=[255, 255, 255]) + }, + joint_weights=[1.] * 40, + sigmas=[]) diff --git a/configs/_base_/datasets/ochuman.py b/configs/_base_/datasets/ochuman.py new file mode 100644 index 0000000..2ef2083 --- /dev/null +++ b/configs/_base_/datasets/ochuman.py @@ -0,0 +1,181 @@ +dataset_info = dict( + dataset_name='ochuman', + paper_info=dict( + author='Zhang, Song-Hai and Li, Ruilong and Dong, Xin and ' + 'Rosin, Paul and Cai, Zixi and Han, Xi and ' + 'Yang, Dingcheng and Huang, Haozhi and Hu, Shi-Min', + title='Pose2seg: Detection free human instance segmentation', + container='Proceedings of the IEEE conference on computer ' + 'vision and pattern recognition', + year='2019', + homepage='https://github.com/liruilong940607/OCHumanApi', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 
128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/configs/_base_/datasets/onehand10k.py b/configs/_base_/datasets/onehand10k.py new file mode 100644 index 0000000..016770f --- /dev/null +++ b/configs/_base_/datasets/onehand10k.py @@ -0,0 +1,142 @@ +dataset_info = dict( + dataset_name='onehand10k', + paper_info=dict( + author='Wang, Yangang and Peng, Cong and Liu, Yebin', + title='Mask-pose cascaded cnn for 2d hand pose estimation ' + 'from single color image', + container='IEEE Transactions on Circuits and Systems ' + 'for Video Technology', + year='2018', + homepage='https://www.yangangwang.com/papers/WANG-MCC-2018-10.html', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + color=[102, 178, 
255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] 
* 21, + sigmas=[]) diff --git a/configs/_base_/datasets/panoptic_body3d.py b/configs/_base_/datasets/panoptic_body3d.py new file mode 100644 index 0000000..e3b19ac --- /dev/null +++ b/configs/_base_/datasets/panoptic_body3d.py @@ -0,0 +1,160 @@ +dataset_info = dict( + dataset_name='panoptic_pose_3d', + paper_info=dict( + author='Joo, Hanbyul and Simon, Tomas and Li, Xulong' + 'and Liu, Hao and Tan, Lei and Gui, Lin and Banerjee, Sean' + 'and Godisart, Timothy and Nabbe, Bart and Matthews, Iain' + 'and Kanade, Takeo and Nobuhara, Shohei and Sheikh, Yaser', + title='Panoptic Studio: A Massively Multiview System ' + 'for Interaction Motion Capture', + container='IEEE Transactions on Pattern Analysis' + ' and Machine Intelligence', + year='2017', + homepage='http://domedb.perception.cs.cmu.edu', + ), + keypoint_info={ + 0: + dict(name='neck', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict(name='nose', id=1, color=[51, 153, 255], type='upper', swap=''), + 2: + dict(name='mid_hip', id=2, color=[0, 255, 0], type='lower', swap=''), + 3: + dict( + name='left_shoulder', + id=3, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 4: + dict( + name='left_elbow', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 5: + dict( + name='left_wrist', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 6: + dict( + name='left_hip', + id=6, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 7: + dict( + name='left_knee', + id=7, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 8: + dict( + name='left_ankle', + id=8, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 9: + dict( + name='right_shoulder', + id=9, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 10: + dict( + name='right_elbow', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 11: + dict( + name='right_wrist', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='right_knee', + id=13, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 14: + dict( + name='right_ankle', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 15: + dict( + name='left_eye', + id=15, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 16: + dict( + name='left_ear', + id=16, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 17: + dict( + name='right_eye', + id=17, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 18: + dict( + name='right_ear', + id=18, + color=[51, 153, 255], + type='upper', + swap='left_ear') + }, + skeleton_info={ + 0: dict(link=('nose', 'neck'), id=0, color=[51, 153, 255]), + 1: dict(link=('neck', 'left_shoulder'), id=1, color=[0, 255, 0]), + 2: dict(link=('neck', 'right_shoulder'), id=2, color=[255, 128, 0]), + 3: dict(link=('left_shoulder', 'left_elbow'), id=3, color=[0, 255, 0]), + 4: dict( + link=('right_shoulder', 'right_elbow'), id=4, color=[255, 128, 0]), + 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]), + 6: + dict(link=('right_elbow', 'right_wrist'), id=6, color=[255, 128, 0]), + 7: dict(link=('left_ankle', 'left_knee'), id=7, color=[0, 255, 0]), + 8: dict(link=('left_knee', 'left_hip'), id=8, color=[0, 255, 0]), + 9: dict(link=('right_ankle', 'right_knee'), id=9, color=[255, 128, 0]), + 10: dict(link=('right_knee', 'right_hip'), id=10, color=[255, 128, 0]), + 11: dict(link=('mid_hip', 
'left_hip'), id=11, color=[0, 255, 0]), + 12: dict(link=('mid_hip', 'right_hip'), id=12, color=[255, 128, 0]), + 13: dict(link=('mid_hip', 'neck'), id=13, color=[51, 153, 255]), + }, + joint_weights=[ + 1.0, 1.0, 1.0, 1.0, 1.2, 1.5, 1.0, 1.2, 1.5, 1.0, 1.2, 1.5, 1.0, 1.2, + 1.5, 1.0, 1.0, 1.0, 1.0 + ], + sigmas=[ + 0.026, 0.026, 0.107, 0.079, 0.072, 0.062, 0.107, 0.087, 0.089, 0.079, + 0.072, 0.062, 0.107, 0.087, 0.089, 0.025, 0.035, 0.025, 0.035 + ]) diff --git a/configs/_base_/datasets/panoptic_hand2d.py b/configs/_base_/datasets/panoptic_hand2d.py new file mode 100644 index 0000000..7a65731 --- /dev/null +++ b/configs/_base_/datasets/panoptic_hand2d.py @@ -0,0 +1,143 @@ +dataset_info = dict( + dataset_name='panoptic_hand2d', + paper_info=dict( + author='Simon, Tomas and Joo, Hanbyul and ' + 'Matthews, Iain and Sheikh, Yaser', + title='Hand keypoint detection in single images using ' + 'multiview bootstrapping', + container='Proceedings of the IEEE conference on ' + 'Computer Vision and Pattern Recognition', + year='2017', + homepage='http://domedb.perception.cs.cmu.edu/handdb.html', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + 
link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] * 21, + sigmas=[]) diff --git a/configs/_base_/datasets/posetrack18.py b/configs/_base_/datasets/posetrack18.py new file mode 100644 index 0000000..5aefd1c --- /dev/null +++ b/configs/_base_/datasets/posetrack18.py @@ -0,0 +1,176 @@ +dataset_info = dict( + dataset_name='posetrack18', + paper_info=dict( + author='Andriluka, Mykhaylo and Iqbal, Umar and ' + 'Insafutdinov, Eldar and Pishchulin, Leonid and ' + 'Milan, Anton and Gall, Juergen and Schiele, Bernt', + title='Posetrack: A benchmark for human pose estimation and tracking', + container='Proceedings of the IEEE Conference on ' + 'Computer Vision and Pattern Recognition', + year='2018', + homepage='https://posetrack.net/users/download.php', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='head_bottom', + id=1, + color=[51, 153, 255], + type='upper', + swap=''), + 2: + dict( + name='head_top', id=2, color=[51, 153, 255], type='upper', + swap=''), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + 
dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('nose', 'head_bottom'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'head_top'), id=13, color=[51, 153, 255]), + 14: + dict( + link=('head_bottom', 'left_shoulder'), id=14, color=[51, 153, + 255]), + 15: + dict( + link=('head_bottom', 'right_shoulder'), + id=15, + color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/configs/_base_/datasets/rhd2d.py b/configs/_base_/datasets/rhd2d.py new file mode 100644 index 0000000..f48e637 --- /dev/null +++ b/configs/_base_/datasets/rhd2d.py @@ -0,0 +1,141 @@ +dataset_info = dict( + dataset_name='rhd2d', + paper_info=dict( + author='Christian Zimmermann and Thomas Brox', + title='Learning to Estimate 3D Hand Pose from Single RGB Images', + container='arXiv', + year='2017', + homepage='https://lmb.informatik.uni-freiburg.de/resources/' + 'datasets/RenderedHandposeDataset.en.html', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + 
dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] * 21, + sigmas=[]) diff --git a/configs/_base_/datasets/wflw.py b/configs/_base_/datasets/wflw.py new file mode 100644 index 0000000..bed6f56 --- /dev/null +++ b/configs/_base_/datasets/wflw.py @@ -0,0 +1,582 @@ +dataset_info = dict( + dataset_name='wflw', + paper_info=dict( + author='Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, ' + 'Quan and Cai, Yici and Zhou, Qiang', + title='Look at boundary: A boundary-aware face alignment algorithm', + container='Proceedings of the IEEE conference on computer ' + 'vision and pattern recognition', + year='2018', + homepage='https://wywu.github.io/projects/LAB/WFLW.html', + ), + keypoint_info={ + 0: + dict( + name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-32'), + 1: + dict( + name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-31'), + 2: + dict( + name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-30'), + 3: + dict( + name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-29'), + 4: + dict( + name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-28'), + 5: + dict( + name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-27'), + 6: + dict( + name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-26'), + 7: + dict( + name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-25'), + 8: + dict( + name='kpt-8', id=8, color=[255, 255, 255], type='', swap='kpt-24'), + 9: + dict( + name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-23'), + 10: + dict( + name='kpt-10', + id=10, + color=[255, 255, 255], + type='', + swap='kpt-22'), + 11: + dict( + name='kpt-11', + id=11, + color=[255, 255, 255], + type='', + swap='kpt-21'), + 12: + dict( + name='kpt-12', + id=12, + color=[255, 255, 255], + type='', + swap='kpt-20'), + 13: + dict( + name='kpt-13', + id=13, + color=[255, 255, 255], + type='', + swap='kpt-19'), 
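+        # contour points 0-32: kpt-i and kpt-(32-i) are horizontal-flip pairs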
+ 14: + dict( + name='kpt-14', + id=14, + color=[255, 255, 255], + type='', + swap='kpt-18'), + 15: + dict( + name='kpt-15', + id=15, + color=[255, 255, 255], + type='', + swap='kpt-17'), + 16: + dict(name='kpt-16', id=16, color=[255, 255, 255], type='', swap=''), + 17: + dict( + name='kpt-17', + id=17, + color=[255, 255, 255], + type='', + swap='kpt-15'), + 18: + dict( + name='kpt-18', + id=18, + color=[255, 255, 255], + type='', + swap='kpt-14'), + 19: + dict( + name='kpt-19', + id=19, + color=[255, 255, 255], + type='', + swap='kpt-13'), + 20: + dict( + name='kpt-20', + id=20, + color=[255, 255, 255], + type='', + swap='kpt-12'), + 21: + dict( + name='kpt-21', + id=21, + color=[255, 255, 255], + type='', + swap='kpt-11'), + 22: + dict( + name='kpt-22', + id=22, + color=[255, 255, 255], + type='', + swap='kpt-10'), + 23: + dict( + name='kpt-23', id=23, color=[255, 255, 255], type='', + swap='kpt-9'), + 24: + dict( + name='kpt-24', id=24, color=[255, 255, 255], type='', + swap='kpt-8'), + 25: + dict( + name='kpt-25', id=25, color=[255, 255, 255], type='', + swap='kpt-7'), + 26: + dict( + name='kpt-26', id=26, color=[255, 255, 255], type='', + swap='kpt-6'), + 27: + dict( + name='kpt-27', id=27, color=[255, 255, 255], type='', + swap='kpt-5'), + 28: + dict( + name='kpt-28', id=28, color=[255, 255, 255], type='', + swap='kpt-4'), + 29: + dict( + name='kpt-29', id=29, color=[255, 255, 255], type='', + swap='kpt-3'), + 30: + dict( + name='kpt-30', id=30, color=[255, 255, 255], type='', + swap='kpt-2'), + 31: + dict( + name='kpt-31', id=31, color=[255, 255, 255], type='', + swap='kpt-1'), + 32: + dict( + name='kpt-32', id=32, color=[255, 255, 255], type='', + swap='kpt-0'), + 33: + dict( + name='kpt-33', + id=33, + color=[255, 255, 255], + type='', + swap='kpt-46'), + 34: + dict( + name='kpt-34', + id=34, + color=[255, 255, 255], + type='', + swap='kpt-45'), + 35: + dict( + name='kpt-35', + id=35, + color=[255, 255, 255], + type='', + swap='kpt-44'), + 36: + dict( + name='kpt-36', + id=36, + color=[255, 255, 255], + type='', + swap='kpt-43'), + 37: + dict( + name='kpt-37', + id=37, + color=[255, 255, 255], + type='', + swap='kpt-42'), + 38: + dict( + name='kpt-38', + id=38, + color=[255, 255, 255], + type='', + swap='kpt-50'), + 39: + dict( + name='kpt-39', + id=39, + color=[255, 255, 255], + type='', + swap='kpt-49'), + 40: + dict( + name='kpt-40', + id=40, + color=[255, 255, 255], + type='', + swap='kpt-48'), + 41: + dict( + name='kpt-41', + id=41, + color=[255, 255, 255], + type='', + swap='kpt-47'), + 42: + dict( + name='kpt-42', + id=42, + color=[255, 255, 255], + type='', + swap='kpt-37'), + 43: + dict( + name='kpt-43', + id=43, + color=[255, 255, 255], + type='', + swap='kpt-36'), + 44: + dict( + name='kpt-44', + id=44, + color=[255, 255, 255], + type='', + swap='kpt-35'), + 45: + dict( + name='kpt-45', + id=45, + color=[255, 255, 255], + type='', + swap='kpt-34'), + 46: + dict( + name='kpt-46', + id=46, + color=[255, 255, 255], + type='', + swap='kpt-33'), + 47: + dict( + name='kpt-47', + id=47, + color=[255, 255, 255], + type='', + swap='kpt-41'), + 48: + dict( + name='kpt-48', + id=48, + color=[255, 255, 255], + type='', + swap='kpt-40'), + 49: + dict( + name='kpt-49', + id=49, + color=[255, 255, 255], + type='', + swap='kpt-39'), + 50: + dict( + name='kpt-50', + id=50, + color=[255, 255, 255], + type='', + swap='kpt-38'), + 51: + dict(name='kpt-51', id=51, color=[255, 255, 255], type='', swap=''), + 52: + dict(name='kpt-52', id=52, color=[255, 255, 255], type='', swap=''), + 53: + 
dict(name='kpt-53', id=53, color=[255, 255, 255], type='', swap=''), + 54: + dict(name='kpt-54', id=54, color=[255, 255, 255], type='', swap=''), + 55: + dict( + name='kpt-55', + id=55, + color=[255, 255, 255], + type='', + swap='kpt-59'), + 56: + dict( + name='kpt-56', + id=56, + color=[255, 255, 255], + type='', + swap='kpt-58'), + 57: + dict(name='kpt-57', id=57, color=[255, 255, 255], type='', swap=''), + 58: + dict( + name='kpt-58', + id=58, + color=[255, 255, 255], + type='', + swap='kpt-56'), + 59: + dict( + name='kpt-59', + id=59, + color=[255, 255, 255], + type='', + swap='kpt-55'), + 60: + dict( + name='kpt-60', + id=60, + color=[255, 255, 255], + type='', + swap='kpt-72'), + 61: + dict( + name='kpt-61', + id=61, + color=[255, 255, 255], + type='', + swap='kpt-71'), + 62: + dict( + name='kpt-62', + id=62, + color=[255, 255, 255], + type='', + swap='kpt-70'), + 63: + dict( + name='kpt-63', + id=63, + color=[255, 255, 255], + type='', + swap='kpt-69'), + 64: + dict( + name='kpt-64', + id=64, + color=[255, 255, 255], + type='', + swap='kpt-68'), + 65: + dict( + name='kpt-65', + id=65, + color=[255, 255, 255], + type='', + swap='kpt-75'), + 66: + dict( + name='kpt-66', + id=66, + color=[255, 255, 255], + type='', + swap='kpt-74'), + 67: + dict( + name='kpt-67', + id=67, + color=[255, 255, 255], + type='', + swap='kpt-73'), + 68: + dict( + name='kpt-68', + id=68, + color=[255, 255, 255], + type='', + swap='kpt-64'), + 69: + dict( + name='kpt-69', + id=69, + color=[255, 255, 255], + type='', + swap='kpt-63'), + 70: + dict( + name='kpt-70', + id=70, + color=[255, 255, 255], + type='', + swap='kpt-62'), + 71: + dict( + name='kpt-71', + id=71, + color=[255, 255, 255], + type='', + swap='kpt-61'), + 72: + dict( + name='kpt-72', + id=72, + color=[255, 255, 255], + type='', + swap='kpt-60'), + 73: + dict( + name='kpt-73', + id=73, + color=[255, 255, 255], + type='', + swap='kpt-67'), + 74: + dict( + name='kpt-74', + id=74, + color=[255, 255, 255], + type='', + swap='kpt-66'), + 75: + dict( + name='kpt-75', + id=75, + color=[255, 255, 255], + type='', + swap='kpt-65'), + 76: + dict( + name='kpt-76', + id=76, + color=[255, 255, 255], + type='', + swap='kpt-82'), + 77: + dict( + name='kpt-77', + id=77, + color=[255, 255, 255], + type='', + swap='kpt-81'), + 78: + dict( + name='kpt-78', + id=78, + color=[255, 255, 255], + type='', + swap='kpt-80'), + 79: + dict(name='kpt-79', id=79, color=[255, 255, 255], type='', swap=''), + 80: + dict( + name='kpt-80', + id=80, + color=[255, 255, 255], + type='', + swap='kpt-78'), + 81: + dict( + name='kpt-81', + id=81, + color=[255, 255, 255], + type='', + swap='kpt-77'), + 82: + dict( + name='kpt-82', + id=82, + color=[255, 255, 255], + type='', + swap='kpt-76'), + 83: + dict( + name='kpt-83', + id=83, + color=[255, 255, 255], + type='', + swap='kpt-87'), + 84: + dict( + name='kpt-84', + id=84, + color=[255, 255, 255], + type='', + swap='kpt-86'), + 85: + dict(name='kpt-85', id=85, color=[255, 255, 255], type='', swap=''), + 86: + dict( + name='kpt-86', + id=86, + color=[255, 255, 255], + type='', + swap='kpt-84'), + 87: + dict( + name='kpt-87', + id=87, + color=[255, 255, 255], + type='', + swap='kpt-83'), + 88: + dict( + name='kpt-88', + id=88, + color=[255, 255, 255], + type='', + swap='kpt-92'), + 89: + dict( + name='kpt-89', + id=89, + color=[255, 255, 255], + type='', + swap='kpt-91'), + 90: + dict(name='kpt-90', id=90, color=[255, 255, 255], type='', swap=''), + 91: + dict( + name='kpt-91', + id=91, + color=[255, 255, 255], + type='', + 
swap='kpt-89'), + 92: + dict( + name='kpt-92', + id=92, + color=[255, 255, 255], + type='', + swap='kpt-88'), + 93: + dict( + name='kpt-93', + id=93, + color=[255, 255, 255], + type='', + swap='kpt-95'), + 94: + dict(name='kpt-94', id=94, color=[255, 255, 255], type='', swap=''), + 95: + dict( + name='kpt-95', + id=95, + color=[255, 255, 255], + type='', + swap='kpt-93'), + 96: + dict( + name='kpt-96', + id=96, + color=[255, 255, 255], + type='', + swap='kpt-97'), + 97: + dict( + name='kpt-97', + id=97, + color=[255, 255, 255], + type='', + swap='kpt-96') + }, + skeleton_info={}, + joint_weights=[1.] * 98, + sigmas=[]) diff --git a/configs/_base_/datasets/zebra.py b/configs/_base_/datasets/zebra.py new file mode 100644 index 0000000..eac71f7 --- /dev/null +++ b/configs/_base_/datasets/zebra.py @@ -0,0 +1,64 @@ +dataset_info = dict( + dataset_name='zebra', + paper_info=dict( + author='Graving, Jacob M and Chae, Daniel and Naik, Hemal and ' + 'Li, Liang and Koger, Benjamin and Costelloe, Blair R and ' + 'Couzin, Iain D', + title='DeepPoseKit, a software toolkit for fast and robust ' + 'animal pose estimation using deep learning', + container='Elife', + year='2019', + homepage='https://github.com/jgraving/DeepPoseKit-Data', + ), + keypoint_info={ + 0: + dict(name='snout', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='head', id=1, color=[255, 255, 255], type='', swap=''), + 2: + dict(name='neck', id=2, color=[255, 255, 255], type='', swap=''), + 3: + dict( + name='forelegL1', + id=3, + color=[255, 255, 255], + type='', + swap='forelegR1'), + 4: + dict( + name='forelegR1', + id=4, + color=[255, 255, 255], + type='', + swap='forelegL1'), + 5: + dict( + name='hindlegL1', + id=5, + color=[255, 255, 255], + type='', + swap='hindlegR1'), + 6: + dict( + name='hindlegR1', + id=6, + color=[255, 255, 255], + type='', + swap='hindlegL1'), + 7: + dict(name='tailbase', id=7, color=[255, 255, 255], type='', swap=''), + 8: + dict(name='tailtip', id=8, color=[255, 255, 255], type='', swap='') + }, + skeleton_info={ + 0: dict(link=('head', 'snout'), id=0, color=[255, 255, 255]), + 1: dict(link=('neck', 'head'), id=1, color=[255, 255, 255]), + 2: dict(link=('forelegL1', 'neck'), id=2, color=[255, 255, 255]), + 3: dict(link=('forelegR1', 'neck'), id=3, color=[255, 255, 255]), + 4: dict(link=('hindlegL1', 'tailbase'), id=4, color=[255, 255, 255]), + 5: dict(link=('hindlegR1', 'tailbase'), id=5, color=[255, 255, 255]), + 6: dict(link=('tailbase', 'neck'), id=6, color=[255, 255, 255]), + 7: dict(link=('tailtip', 'tailbase'), id=7, color=[255, 255, 255]) + }, + joint_weights=[1.] 
* 9, + sigmas=[]) diff --git a/configs/_base_/default_runtime.py b/configs/_base_/default_runtime.py new file mode 100644 index 0000000..d78da5a --- /dev/null +++ b/configs/_base_/default_runtime.py @@ -0,0 +1,19 @@ +checkpoint_config = dict(interval=10) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] + +# disable opencv multithreading to avoid system being overloaded +opencv_num_threads = 0 +# set multi-process start method as `fork` to speed up the training +mp_start_method = 'fork' diff --git a/configs/_base_/filters/gausian_filter.py b/configs/_base_/filters/gausian_filter.py new file mode 100644 index 0000000..e69de29 diff --git a/configs/detection/yolo_classes.py b/configs/detection/yolo_classes.py new file mode 100644 index 0000000..2339a11 --- /dev/null +++ b/configs/detection/yolo_classes.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +YOLO_COCO_80_CLASSES = [ +"person", +"bicycle", +"car", +"motorbike", +"aeroplane", +"bus", +"train", +"truck", +"boat", +"traffic light", +"fire hydrant", +"stop sign", +"parking meter", +"bench", +"bird", +"cat", +"dog", +"horse", +"sheep", +"cow", +"elephant", +"bear", +"zebra", +"giraffe", +"backpack", +"umbrella", +"handbag", +"tie", +"suitcase", +"frisbee", +"skis", +"snowboard", +"sports ball", +"kite", +"baseball bat", +"baseball glove", +"skateboard", +"surfboard", +"tennis racket", +"bottle", +"wine glass", +"cup", +"fork", +"knife", +"spoon", +"bowl", +"banana", +"apple", +"sandwich", +"orange", +"broccoli", +"carrot", +"hot dog", +"pizza", +"donut", +"cake", +"chair", +"sofa", +"pottedplant", +"bed", +"diningtable", +"toilet", +"tvmonitor", +"laptop", +"mouse", +"remote", +"keyboard", +"cell phone", +"microwave", +"oven", +"toaster", +"sink", +"refrigerator", +"book", +"clock", +"vase", +"scissors", +"teddy bear", +"hair drier", +"toothbrush"] \ No newline at end of file diff --git a/configs/detection/yolov3_d53_320_273e_coco.py b/configs/detection/yolov3_d53_320_273e_coco.py new file mode 100644 index 0000000..d7e9cca --- /dev/null +++ b/configs/detection/yolov3_d53_320_273e_coco.py @@ -0,0 +1,140 @@ +# model settings +model = dict( + type='YOLOV3', + pretrained='open-mmlab://darknet53', + backbone=dict(type='Darknet', depth=53, out_indices=(3, 4, 5)), + neck=dict( + type='YOLOV3Neck', + num_scales=3, + in_channels=[1024, 512, 256], + out_channels=[512, 256, 128]), + bbox_head=dict( + type='YOLOV3Head', + num_classes=80, + in_channels=[512, 256, 128], + out_channels=[1024, 512, 256], + anchor_generator=dict( + type='YOLOAnchorGenerator', + base_sizes=[[(116, 90), (156, 198), (373, 326)], + [(30, 61), (62, 45), (59, 119)], + [(10, 13), (16, 30), (33, 23)]], + strides=[32, 16, 8]), + bbox_coder=dict(type='YOLOBBoxCoder'), + featmap_strides=[32, 16, 8], + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0, + reduction='sum'), + loss_conf=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0, + reduction='sum'), + loss_xy=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=2.0, + reduction='sum'), + loss_wh=dict(type='MSELoss', loss_weight=2.0, reduction='sum')), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='GridAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0)), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + 
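+        # score_thr/conf_thr below filter low-confidence boxes before NMS (iou_threshold=0.45); at most 100 detections are kept per image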
score_thr=0.05, + conf_thr=0.005, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=100)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco' +img_norm_cfg = dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='PhotoMetricDistortion'), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 2)), + dict( + type='MinIoURandomCrop', + min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=(320, 320), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(320, 320), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=f'{data_root}/annotations/instances_train2017.json', + img_prefix=f'{data_root}/train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=f'{data_root}/annotations/instances_val2017.json', + img_prefix=f'{data_root}/val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=f'{data_root}/annotations/instances_val2017.json', + img_prefix=f'{data_root}/val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=2000, # same as burn-in in darknet + warmup_ratio=0.1, + step=[218, 246]) +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=273) +evaluation = dict(interval=1, metric=['bbox']) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +custom_hooks = [dict(type='NumClassCheckHook')] + +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/pose/ViTPose_base_coco_256x192.py b/configs/pose/ViTPose_base_coco_256x192.py new file mode 100644 index 0000000..f61b314 --- /dev/null +++ b/configs/pose/ViTPose_base_coco_256x192.py @@ -0,0 +1,170 @@ +_base_ = [ + '../_base_/default_runtime.py', + '../_base_/datasets/coco.py' +] +evaluation = dict(interval=10, metric='mAP', save_best='AP') + +optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1, + constructor='LayerDecayOptimizerConstructor', + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.75, + custom_keys={ + 'bias': dict(decay_multi=0.), + 'pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) 
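+                        # decay_mult=0. exempts these parameter groups from weight decay ('decay_multi' above is presumably meant to be 'decay_mult')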
+ } + ) + ) + +optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +target_type = 'GaussianHeatmap' +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained=None, + backbone=dict( + type='ViT', + img_size=(256, 192), + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + ratio=1, + use_checkpoint=False, + mlp_ratio=4, + qkv_bias=True, + drop_path_rate=0.3, + ), + keypoint_head=dict( + type='TopdownHeatmapSimpleHead', + in_channels=768, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process='default', + shift_heatmap=False, + target_type=target_type, + modulate_kernel=11, + use_udp=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=False, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='TopDownGenerateTarget', + sigma=2, + encoding='UDP', + target_type=target_type), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=4, + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + 
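+    # the test entry below reuses the val2017 annotations and the shared data_cfg / bbox_file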
test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) + diff --git a/configs/pose/ViTPose_base_simple_coco_256x192.py b/configs/pose/ViTPose_base_simple_coco_256x192.py new file mode 100644 index 0000000..59985e4 --- /dev/null +++ b/configs/pose/ViTPose_base_simple_coco_256x192.py @@ -0,0 +1,171 @@ +_base_ = [ + '../_base_/default_runtime.py', + '../_base_/datasets/coco.py' +] +evaluation = dict(interval=10, metric='mAP', save_best='AP') + +optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1, + constructor='LayerDecayOptimizerConstructor', + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.75, + custom_keys={ + 'bias': dict(decay_multi=0.), + 'pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + } + ) + ) + +optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +target_type = 'GaussianHeatmap' +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained=None, + backbone=dict( + type='ViT', + img_size=(256, 192), + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + ratio=1, + use_checkpoint=False, + mlp_ratio=4, + qkv_bias=True, + drop_path_rate=0.3, + ), + keypoint_head=dict( + type='TopdownHeatmapSimpleHead', + in_channels=768, + num_deconv_layers=0, + num_deconv_filters=[], + num_deconv_kernels=[], + upsample=4, + extra=dict(final_conv_kernel=3, ), + out_channels=channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process='default', + shift_heatmap=False, + target_type=target_type, + modulate_kernel=11, + use_udp=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=False, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='TopDownGenerateTarget', + sigma=2, + encoding='UDP', + target_type=target_type), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + 
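+    # the affine step below uses UDP, matching the 'UDP' target encoding in train_pipeline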
dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=4, + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) + diff --git a/configs/pose/ViTPose_small_coco_256x192.py b/configs/pose/ViTPose_small_coco_256x192.py new file mode 100644 index 0000000..80683c6 --- /dev/null +++ b/configs/pose/ViTPose_small_coco_256x192.py @@ -0,0 +1,170 @@ +_base_ = [ + '../_base_/default_runtime.py', + '../_base_/datasets/coco.py' +] +evaluation = dict(interval=10, metric='mAP', save_best='AP') + +optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1, + constructor='LayerDecayOptimizerConstructor', + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.8, + custom_keys={ + 'bias': dict(decay_multi=0.), + 'pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) 
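+                        # LayerDecayOptimizerConstructor also applies per-layer lr decay (layer_decay_rate=0.8, num_layers=12)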
+ } + ) + ) + +optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +target_type = 'GaussianHeatmap' +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained=None, + backbone=dict( + type='ViT', + img_size=(256, 192), + patch_size=16, + embed_dim=384, + depth=12, + num_heads=12, + ratio=1, + use_checkpoint=False, + mlp_ratio=4, + qkv_bias=True, + drop_path_rate=0.1, + ), + keypoint_head=dict( + type='TopdownHeatmapSimpleHead', + in_channels=384, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process='default', + shift_heatmap=False, + target_type=target_type, + modulate_kernel=11, + use_udp=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=False, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='TopDownGenerateTarget', + sigma=2, + encoding='UDP', + target_type=target_type), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=4, + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + 
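+    # as above, testing runs on val2017; test_pipeline is identical to val_pipeline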
test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) + diff --git a/configs/pose3d/MB_ft_h36m.yaml b/configs/pose3d/MB_ft_h36m.yaml new file mode 100644 index 0000000..b52f5a2 --- /dev/null +++ b/configs/pose3d/MB_ft_h36m.yaml @@ -0,0 +1,50 @@ +# General +train_2d: False +no_eval: False +finetune: True +partial_train: null + +# Traning +epochs: 60 +checkpoint_frequency: 30 +batch_size: 32 +dropout: 0.0 +learning_rate: 0.0002 +weight_decay: 0.01 +lr_decay: 0.99 + +# Model +maxlen: 243 +dim_feat: 512 +mlp_ratio: 2 +depth: 5 +dim_rep: 512 +num_heads: 8 +att_fuse: True + +# Data +data_root: data/motion3d/MB3D_f243s81/ +subset_list: [H36M-SH] +dt_file: h36m_sh_conf_cam_source_final.pkl +clip_len: 243 +data_stride: 81 +rootrel: True +sample_stride: 1 +num_joints: 17 +no_conf: False +gt_2d: False + +# Loss +lambda_3d_velocity: 20.0 +lambda_scale: 0.5 +lambda_lv: 0.0 +lambda_lg: 0.0 +lambda_a: 0.0 +lambda_av: 0.0 + +# Augmentation +synthetic: False +flip: True +mask_ratio: 0. +mask_T_ratio: 0. +noise: False diff --git a/gafa_utils.py b/gafa_utils.py new file mode 100644 index 0000000..53345f7 --- /dev/null +++ b/gafa_utils.py @@ -0,0 +1,448 @@ + + +import numpy as np +import cv2 +import torch +import torch.nn.functional as F +from torchvision.transforms.functional import resize +import numpy as np +import torchvision.transforms.functional as TF +from torchvision import transforms +from albumentations.core.transforms_interface import DualTransform, to_tuple +import albumentations as A + +import torchvision.transforms as T +from torchvision.transforms import Compose as ComposeTransform + +import matplotlib.pyplot as plt + +from PIL import Image + +from utils import * + +MIN_CONF_THRESH = 0.3 +MIN_IDXS_COUNT = 50 + + +class SingleAttrTransform: + """ + Superclass for data transformation + """ + + def __init__(self, input_key, output_key): + self.input_keys = self._validate_key_arg(input_key) + self.output_keys = self._validate_key_arg(output_key) + if len(self.input_keys) != len(self.output_keys): + raise Exception( + f"len(input_keys) != len(output_keys): {len(self.input_keys)} != {len(self.output_keys)}" + ) + + def __call__(self, item): + """ + item: dictionary containing each variable in a dataset + """ + self.before_transform(item) + for in_key, out_key in zip(self.input_keys, self.output_keys): + input_seq = item[in_key] + item[out_key] = self.transform(input_seq) + return item + + def transform(self, input_seq): + raise NotImplementedError + + def before_transform(self, item): + return + + def _validate_key_arg(self, key_or_keys): + if isinstance(key_or_keys, str): + return [key_or_keys] + else: + return key_or_keys + + +class ImageTransform: + def __init__(self, img_key, transform): + self.img_key = img_key + self.transform = transform + + def __call__(self, item): + item[self.img_key] = self.transform(item[self.img_key]) + return item + +###################################### +############ Bounding Box ############## +##################################### +class ExpandBB(SingleAttrTransform): + """ + Expand or shurink the bounding box by multiplying specified arguments + """ + + def __init__(self, t, b, l, r, input_key="bb", output_key=None): + output_key = output_key or input_key + super().__init__(input_key, output_key) + self.t = t + self.b = b + self.l = l + self.r = r + + def transform(self, bb): + old_w, 
old_h = bb["w"], bb["h"] + old_u, old_v = bb["u"], bb["v"] + + lpad = int(old_w * self.l) + rpad = int(old_w * self.r) + tpad = int(old_h * self.t) + bpad = int(old_h * self.b) + + return { + "w": old_w + lpad + rpad, + "h": old_h + tpad + bpad, + "u": old_u - lpad, + "v": old_v - tpad, + } + +class SquareFromWidth(SingleAttrTransform): + """ + Expand or shurink the bounding box by multiplying specified arguments + """ + + def __init__(self, t, b, l, r, input_key="bb", output_key=None): + output_key = output_key or input_key + super().__init__(input_key, output_key) + self.t = t + self.b = b + self.l = l + self.r = r + + def transform(self, bb): + old_w, old_h = bb["w"], bb["h"] + old_u, old_v = bb["u"], bb["v"] + + lpad = 0 #int(old_w * self.l) + rpad = 0 #int(old_w * self.r) + tpad = 0 #int(old_h * self.t) + bpad = 0 #int(old_h * self.b) + + return { + "w": old_w + lpad + rpad, + "h": old_h + tpad + bpad, + "u": old_u - lpad, + "v": old_v - tpad, + } + + +class ExpandBBRect(SingleAttrTransform): + """ + Make bonding box rectangle. + """ + + def __init__(self, input_key="bb", output_key=None): + output_key = output_key or input_key + super().__init__(input_key, output_key) + + def transform(self, bb): + old_w, old_h = bb["w"], bb["h"] + old_u, old_v = bb["u"], bb["v"] + + if old_w <= old_h: + diff = old_h - old_w + lpad = diff // 2 + + return {"w": old_h, "h": old_h, "u": old_u - lpad, "v": old_v} + + if old_h < old_w: + diff = old_w - old_h + tpad = diff // 2 + + return {"w": old_w, "h": old_w, "u": old_u, "v": old_v - tpad} + + +class ReshapeBBRect(SingleAttrTransform): + """ + Crop or Expand the BB tp specified ratio + """ + + def __init__(self, img_ratio, input_key="bb", output_key=None): + output_key = output_key or input_key + super().__init__(input_key, output_key) + + assert len(img_ratio) == 2 + self.height = img_ratio[0] + self.width = img_ratio[1] + + def transform(self, bb): + old_w, old_h = bb["w"], bb["h"] + old_u, old_v = bb["u"], bb["v"] + + old_ratio = old_h / old_w + new_ratio = self.height / self.width + + # 縦が長すぎる場合 + if old_ratio > new_ratio: + diff = old_h - old_w * (self.height / self.width) + lpad = diff // 2 + + return {"w": old_w, "h": old_h - diff, "u": old_u, "v": old_v + lpad} + + # 横が長すぎる場合 + else: + diff = old_w - old_h * (self.width / self.height) + lpad = diff // 2 + + return {"w": old_w - diff, "h": old_h, "u": old_u + lpad, "v": old_v} + + +class CropBB: + def __init__(self, img_key="image", bb_key="bb", out_key="image"): + self.img_key = img_key + self.bb_key = bb_key + self.out_key = out_key + + def __call__(self, item): + # self._check_keys(item) + bb = item[self.bb_key] + item[self.out_key] = TF.crop( + item[self.img_key], top=int(bb["v"]), left=int(bb["u"]), height=int(bb["h"]), width=int(bb["w"]) + ) + return item + + +class KeypointsToBB: + def __init__(self, kp_indices): + if hasattr(kp_indices, "__iter__"): + kp_indices = list(kp_indices) + self.kp_indices = kp_indices + + def __call__(self, item): + out = {k: v for k, v in item.items()} + kp = item["keypoints"] + + kp = kp[self.kp_indices] + kp = kp[np.all(kp != 0, axis=1), :] + u, v = np.min(kp.astype(np.int64), axis=0) + umax, vmax = np.max(kp.astype(np.int64), axis=0) + out["bb"] = {"u": u, "v": v, "w": umax - u, "h": vmax - v} + return out + + + + +# define transforms +head_transform = ComposeTransform( + [ + # KeypointsToBB((0, 1, 15, 16, 17, 18)), + KeypointsToBB((0,1,2,3,4,5,6)), #coco17 corresponding + ExpandBB(0.85, -0.2, 0.1, 0.1, "bb"), + ExpandBBRect("bb"), + ] +) + +# define 
transforms +head_transform_rest = ComposeTransform( + [ + # KeypointsToBB((0, 1, 15, 16, 17, 18)), + KeypointsToBB((0,1,2,3,4,5,6)), #coco17 corresponding + ExpandBB(0.1, -0.2, 0.1, 0.1, "bb"), + ExpandBBRect("bb"), + ] +) + +# define transforms +head_transform_face = ComposeTransform( + [ + # KeypointsToBB((0, 1, 15, 16, 17, 18)), + KeypointsToBB((0,1,2,3,4)), #coco17 corresponding + ExpandBB(3.0, 2.5, 0.5, 0.5, "bb"), + # ExpandBBRect("bb"), + ] +) + + + +body_transform = ComposeTransform( + [ + KeypointsToBB(slice(None)), + ExpandBB(0.15, 0.05, 0.2, 0.2, "bb"), + ExpandBBRect("bb"), + ReshapeBBRect((256, 192)), + CropBB(bb_key="bb"), + ImageTransform( + "image", + T.Compose( + [ + T.Resize((256, 192)), + ] + ), + ), + ] +) + +body_transform_from_bb = ComposeTransform( + [ + ExpandBB(0.15, 0.05, 0.2, 0.2, "bb"), + ExpandBBRect("bb"), + ReshapeBBRect((256, 192)), + CropBB(bb_key="bb"), + ImageTransform( + "image", + T.Compose( + [ + T.Resize((256, 192)), + ] + ), + ), + ] +) + +normalize_img = A.Compose([ + A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + ]) + +normalize_img_torch = T.Compose([ + T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + ]) + +@timeit +def get_valid_ids(body_json): + #  count valid detections per idx to find the valid ones + idxs_count = {} + for det in body_json: + idx = det["idx"] + kpts = np.array(det["keypoints"]).reshape((-1, 3)) + if (kpts[:, 2] > MIN_CONF_THRESH).all(): + if idx in idxs_count.keys(): + idxs_count[idx] += 1 + else: + idxs_count[idx] = 1 + + valid_idxs = [] + for idx, count in idxs_count.items(): + if count > MIN_IDXS_COUNT: + valid_idxs.append(idx) + + return (valid_idxs) + +@timeit +def get_valid_frames_by_keys(valid_idxs, body_results): + out = {} + for idx in valid_idxs: + out[idx] = [] + + for det in body_results: + if det["idx"] in valid_idxs: + kpts = np.array(det["keypoints"]).reshape((-1, 3)) + if (kpts[:, 2] > MIN_CONF_THRESH).all(): + + # add the timestamp to the frame detection + date_str = det["image_id"].split(".")[0].split("_ts_")[-1] + date_format = '%Y_%m_%d_%H_%M_%S_%f' + timestamp = datetime.strptime(date_str, date_format) + det["timestamp"] = timestamp + + # check previous timestamp + if len(out[det["idx"]]) > 0: + last_ts = out[det["idx"]][-1]["timestamp"] + diff_ts = (timestamp - last_ts).total_seconds() + else: + diff_ts = 0 + + assert(diff_ts >= 0) + + # if diff_ts < 0.3: + # # add the frame detection to the output dic by idx + # out[det["idx"]].append(det) + # else: + # print(det["idx"], "Discard det because ts diff too high ({} > 0.2 s)".format(diff_ts), tag = "warning", tag_color = "yellow", color = "white") + + out[det["idx"]].append(det) + + return out + + + +@timeit +def get_inputs(f_i, valid_frames, n_frames): + + if f_i < n_frames: + # not enough past frames + return None, None, None, None, None, None + else: + imgs = torch.zeros((1, n_frames, 3, 256, 192)) + head_masks = torch.zeros((1, n_frames, 1, 256, 192)) + body_dvs = torch.zeros((1, n_frames, 2)) + + norm_body_center = np.zeros((n_frames, 2)) + + + sequences_ids = [f_i + off for off in range(-n_frames + 1, 1)] + image_ids = [] + print(sequences_ids) + for k, i in enumerate(sequences_ids): + seq_frame_i = valid_frames[i] + # load images + image_ids.append(seq_frame_i["image_id"]) + image_path = os.path.join(images_root, seq_frame_i["image_id"]) + img_org = Image.open(image_path) + kpts = np.array(seq_frame_i["keypoints"]).reshape((-1,3)) + assert((kpts[:,2] > MIN_CONF_THRESH).all()) + + item = { + "image": img_org, + 
"keypoints": kpts[:, :2], + } + + # get head bb in pixels + head_trans = head_transform(item) + head_bb = head_trans['bb'] + head_bb = np.array([head_bb['u'], head_bb['v'], head_bb['w'], head_bb['h']]).astype(np.float32) + + # get body bb in pixels + body_trans = body_transform(item) + body_bb = body_trans['bb'] + body_bb = np.array([body_bb['u'], body_bb['v'], body_bb['w'], body_bb['h']]) + body_image = np.array(body_trans['image']) + + # change head bb to relative to body bb + head_bb_abs = head_bb.copy() + + head_bb[0] -= body_bb[0] + head_bb[1] -= body_bb[1] + + head_bb[0] = head_bb[0] / body_bb[2] + head_bb[1] = head_bb[1] / body_bb[3] + head_bb[2] = head_bb[2] / body_bb[2] + head_bb[3] = head_bb[3] / body_bb[3] + + # store body center + norm_body_center[k,:] = (body_bb[[0, 1]] + body_bb[[2, 3]] / 2) / body_bb[[2,3]] + + # normalize image + img = normalize_img(image = body_image)['image'] + img = torch.from_numpy(img.transpose(2, 0, 1)) + + assert(img.shape[0] == 3) + assert(img.shape[1] == 256) + assert(img.shape[2] == 192) + + # create mask of head bounding box + head_mask = torch.zeros(1, img.shape[1], img.shape[2]) + head_bb_int = head_bb.copy() + head_bb_int[[0, 2]] *= img.shape[2] + head_bb_int[[1, 3]] *= img.shape[1] + head_bb_int[2] += head_bb_int[0] + head_bb_int[3] += head_bb_int[1] + head_bb_int = head_bb_int.astype(np.int64) + head_bb_int[head_bb_int < 0] = 0 + + print(head_bb, color = "red") + print(head_bb_int, color = "red") + head_mask[:, head_bb_int[1]:head_bb_int[3], head_bb_int[0]:head_bb_int[2]] = 1 + + # assign + head_masks[0, k, :, :, :] = head_mask + imgs[0, k, :, :, :] = img + + # compute dv + body_dvs[0, :, :] = torch.from_numpy(norm_body_center - np.roll(norm_body_center, shift=1, axis=0)) + + return imgs, head_masks, body_dvs, head_bb_abs, image_ids, body_bb \ No newline at end of file diff --git a/launch.sh b/launch.sh new file mode 100644 index 0000000..89d1e65 --- /dev/null +++ b/launch.sh @@ -0,0 +1,29 @@ +#!/bin/bash +IsRunning=`docker ps -f name=rgbd_detect | grep -c "rgbd_detect"`; +if [ $IsRunning -eq "0" ]; then + xhost +local:docker + docker run --rm \ + --gpus all \ + -e DISPLAY=$DISPLAY \ + -e XAUTHORITY=$XAUTHORITY \ + -e XDG_RUNTIME_DIR=$XDG_RUNTIME_DIR \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + -e 'QT_X11_NO_MITSHM=1' \ + -v /tmp/.X11-unix:/tmp/.X11-unix:rw \ + -v /tmp/docker_share:/tmp/docker_share \ + -v `pwd`:/workspace/rgbd_pose_and_depth \ + --ipc host \ + --device /dev/dri \ + --device /dev/snd \ + --device /dev/input \ + --device /dev/bus/usb \ + --privileged \ + --ulimit rtprio=99 \ + --net host \ + --name rgbd_detect \ + --entrypoint /bin/bash \ + -ti inria_docker:rgbd_detect +else + echo "Docker image is already running. Opening new terminal..."; + docker exec -ti rgbd_detect /bin/bash +fi \ No newline at end of file diff --git a/lib/data/augmentation.py b/lib/data/augmentation.py new file mode 100644 index 0000000..0818d64 --- /dev/null +++ b/lib/data/augmentation.py @@ -0,0 +1,99 @@ +import numpy as np +import os +import random +import torch +import copy +import torch.nn as nn +from lib.utils.tools import read_pkl +from lib.utils.utils_data import flip_data, crop_scale_3d + +class Augmenter2D(object): + """ + Make 2D augmentations on the fly. PyTorch batch-processing GPU version. 
+ """ + def __init__(self, args): + self.d2c_params = read_pkl(args.d2c_params_path) + self.noise = torch.load(args.noise_path) + self.mask_ratio = args.mask_ratio + self.mask_T_ratio = args.mask_T_ratio + self.num_Kframes = 27 + self.noise_std = 0.002 + + def dis2conf(self, dis, a, b, m, s): + f = a/(dis+a)+b*dis + shift = torch.randn(*dis.shape)*s + m + # if torch.cuda.is_available(): + shift = shift.to(dis.device) + return f + shift + + def add_noise(self, motion_2d): + a, b, m, s = self.d2c_params["a"], self.d2c_params["b"], self.d2c_params["m"], self.d2c_params["s"] + if "uniform_range" in self.noise.keys(): + uniform_range = self.noise["uniform_range"] + else: + uniform_range = 0.06 + motion_2d = motion_2d[:,:,:,:2] + batch_size = motion_2d.shape[0] + num_frames = motion_2d.shape[1] + num_joints = motion_2d.shape[2] + mean = self.noise['mean'].float() + std = self.noise['std'].float() + weight = self.noise['weight'][:,None].float() + sel = torch.rand((batch_size, self.num_Kframes, num_joints, 1)) + gaussian_sample = (torch.randn(batch_size, self.num_Kframes, num_joints, 2) * std + mean) + uniform_sample = (torch.rand((batch_size, self.num_Kframes, num_joints, 2))-0.5) * uniform_range + noise_mean = 0 + delta_noise = torch.randn(num_frames, num_joints, 2) * self.noise_std + noise_mean + # if torch.cuda.is_available(): + mean = mean.to(motion_2d.device) + std = std.to(motion_2d.device) + weight = weight.to(motion_2d.device) + gaussian_sample = gaussian_sample.to(motion_2d.device) + uniform_sample = uniform_sample.to(motion_2d.device) + sel = sel.to(motion_2d.device) + delta_noise = delta_noise.to(motion_2d.device) + + delta = gaussian_sample*(sel=weight) + delta_expand = torch.nn.functional.interpolate(delta.unsqueeze(1), [num_frames, num_joints, 2], mode='trilinear', align_corners=True)[:,0] + delta_final = delta_expand + delta_noise + motion_2d = motion_2d + delta_final + dx = delta_final[:,:,:,0] + dy = delta_final[:,:,:,1] + dis2 = dx*dx+dy*dy + dis = torch.sqrt(dis2) + conf = self.dis2conf(dis, a, b, m, s).clip(0,1).reshape([batch_size, num_frames, num_joints, -1]) + return torch.cat((motion_2d, conf), dim=3) + + def add_mask(self, x): + ''' motion_2d: (N,T,17,3) + ''' + N,T,J,C = x.shape + mask = torch.rand(N,T,J,1, dtype=x.dtype, device=x.device) > self.mask_ratio + mask_T = torch.rand(1,T,1,1, dtype=x.dtype, device=x.device) > self.mask_T_ratio + x = x * mask * mask_T + return x + + def augment2D(self, motion_2d, mask=False, noise=False): + if noise: + motion_2d = self.add_noise(motion_2d) + if mask: + motion_2d = self.add_mask(motion_2d) + return motion_2d + +class Augmenter3D(object): + """ + Make 3D augmentations when dataloaders get items. NumPy single motion version. 
+ """ + def __init__(self, args): + self.flip = args.flip + if hasattr(args, "scale_range_pretrain"): + self.scale_range_pretrain = args.scale_range_pretrain + else: + self.scale_range_pretrain = None + + def augment3D(self, motion_3d): + if self.scale_range_pretrain: + motion_3d = crop_scale_3d(motion_3d, self.scale_range_pretrain) + if self.flip and random.random()>0.5: + motion_3d = flip_data(motion_3d) + return motion_3d \ No newline at end of file diff --git a/lib/data/datareader_h36m.py b/lib/data/datareader_h36m.py new file mode 100644 index 0000000..b0f20b6 --- /dev/null +++ b/lib/data/datareader_h36m.py @@ -0,0 +1,136 @@ +# Adapted from Optimizing Network Structure for 3D Human Pose Estimation (ICCV 2019) (https://github.com/CHUNYUWANG/lcn-pose/blob/master/tools/data.py) + +import numpy as np +import os, sys +import random +import copy +from lib.utils.tools import read_pkl +from lib.utils.utils_data import split_clips +random.seed(0) + +class DataReaderH36M(object): + def __init__(self, n_frames, sample_stride, data_stride_train, data_stride_test, read_confidence=True, dt_root = 'data/motion3d', dt_file = 'h36m_cpn_cam_source.pkl'): + self.gt_trainset = None + self.gt_testset = None + self.split_id_train = None + self.split_id_test = None + self.test_hw = None + self.dt_dataset = read_pkl('%s/%s' % (dt_root, dt_file)) + self.n_frames = n_frames + self.sample_stride = sample_stride + self.data_stride_train = data_stride_train + self.data_stride_test = data_stride_test + self.read_confidence = read_confidence + + def read_2d(self): + trainset = self.dt_dataset['train']['joint_2d'][::self.sample_stride, :, :2].astype(np.float32) # [N, 17, 2] + testset = self.dt_dataset['test']['joint_2d'][::self.sample_stride, :, :2].astype(np.float32) # [N, 17, 2] + # map to [-1, 1] + for idx, camera_name in enumerate(self.dt_dataset['train']['camera_name']): + if camera_name == '54138969' or camera_name == '60457274': + res_w, res_h = 1000, 1002 + elif camera_name == '55011271' or camera_name == '58860488': + res_w, res_h = 1000, 1000 + else: + assert 0, '%d data item has an invalid camera name' % idx + trainset[idx, :, :] = trainset[idx, :, :] / res_w * 2 - [1, res_h / res_w] + for idx, camera_name in enumerate(self.dt_dataset['test']['camera_name']): + if camera_name == '54138969' or camera_name == '60457274': + res_w, res_h = 1000, 1002 + elif camera_name == '55011271' or camera_name == '58860488': + res_w, res_h = 1000, 1000 + else: + assert 0, '%d data item has an invalid camera name' % idx + testset[idx, :, :] = testset[idx, :, :] / res_w * 2 - [1, res_h / res_w] + if self.read_confidence: + if 'confidence' in self.dt_dataset['train'].keys(): + train_confidence = self.dt_dataset['train']['confidence'][::self.sample_stride].astype(np.float32) + test_confidence = self.dt_dataset['test']['confidence'][::self.sample_stride].astype(np.float32) + if len(train_confidence.shape)==2: # (1559752, 17) + train_confidence = train_confidence[:,:,None] + test_confidence = test_confidence[:,:,None] + else: + # No conf provided, fill with 1. 
+ train_confidence = np.ones(trainset.shape)[:,:,0:1] + test_confidence = np.ones(testset.shape)[:,:,0:1] + trainset = np.concatenate((trainset, train_confidence), axis=2) # [N, 17, 3] + testset = np.concatenate((testset, test_confidence), axis=2) # [N, 17, 3] + return trainset, testset + + def read_3d(self): + train_labels = self.dt_dataset['train']['joint3d_image'][::self.sample_stride, :, :3].astype(np.float32) # [N, 17, 3] + test_labels = self.dt_dataset['test']['joint3d_image'][::self.sample_stride, :, :3].astype(np.float32) # [N, 17, 3] + # map to [-1, 1] + for idx, camera_name in enumerate(self.dt_dataset['train']['camera_name']): + if camera_name == '54138969' or camera_name == '60457274': + res_w, res_h = 1000, 1002 + elif camera_name == '55011271' or camera_name == '58860488': + res_w, res_h = 1000, 1000 + else: + assert 0, '%d data item has an invalid camera name' % idx + train_labels[idx, :, :2] = train_labels[idx, :, :2] / res_w * 2 - [1, res_h / res_w] + train_labels[idx, :, 2:] = train_labels[idx, :, 2:] / res_w * 2 + + for idx, camera_name in enumerate(self.dt_dataset['test']['camera_name']): + if camera_name == '54138969' or camera_name == '60457274': + res_w, res_h = 1000, 1002 + elif camera_name == '55011271' or camera_name == '58860488': + res_w, res_h = 1000, 1000 + else: + assert 0, '%d data item has an invalid camera name' % idx + test_labels[idx, :, :2] = test_labels[idx, :, :2] / res_w * 2 - [1, res_h / res_w] + test_labels[idx, :, 2:] = test_labels[idx, :, 2:] / res_w * 2 + + return train_labels, test_labels + def read_hw(self): + if self.test_hw is not None: + return self.test_hw + test_hw = np.zeros((len(self.dt_dataset['test']['camera_name']), 2)) + for idx, camera_name in enumerate(self.dt_dataset['test']['camera_name']): + if camera_name == '54138969' or camera_name == '60457274': + res_w, res_h = 1000, 1002 + elif camera_name == '55011271' or camera_name == '58860488': + res_w, res_h = 1000, 1000 + else: + assert 0, '%d data item has an invalid camera name' % idx + test_hw[idx] = res_w, res_h + self.test_hw = test_hw + return test_hw + + def get_split_id(self): + if self.split_id_train is not None and self.split_id_test is not None: + return self.split_id_train, self.split_id_test + vid_list_train = self.dt_dataset['train']['source'][::self.sample_stride] # (1559752,) + vid_list_test = self.dt_dataset['test']['source'][::self.sample_stride] # (566920,) + self.split_id_train = split_clips(vid_list_train, self.n_frames, data_stride=self.data_stride_train) + self.split_id_test = split_clips(vid_list_test, self.n_frames, data_stride=self.data_stride_test) + return self.split_id_train, self.split_id_test + + def get_hw(self): +# Only Testset HW is needed for denormalization + test_hw = self.read_hw() # train_data (1559752, 2) test_data (566920, 2) + split_id_train, split_id_test = self.get_split_id() + test_hw = test_hw[split_id_test][:,0,:] # (N, 2) + return test_hw + + def get_sliced_data(self): + train_data, test_data = self.read_2d() # train_data (1559752, 17, 3) test_data (566920, 17, 3) + train_labels, test_labels = self.read_3d() # train_labels (1559752, 17, 3) test_labels (566920, 17, 3) + split_id_train, split_id_test = self.get_split_id() + train_data, test_data = train_data[split_id_train], test_data[split_id_test] # (N, 27, 17, 3) + train_labels, test_labels = train_labels[split_id_train], test_labels[split_id_test] # (N, 27, 17, 3) + # ipdb.set_trace() + return train_data, test_data, train_labels, test_labels + + def denormalize(self, test_data): +# 
data: (N, n_frames, 51) or data: (N, n_frames, 17, 3) + n_clips = test_data.shape[0] + test_hw = self.get_hw() + data = test_data.reshape([n_clips, -1, 17, 3]) + assert len(data) == len(test_hw) + # denormalize (x,y,z) coordiantes for results + for idx, item in enumerate(data): + res_w, res_h = test_hw[idx] + data[idx, :, :, :2] = (data[idx, :, :, :2] + np.array([1, res_h / res_w])) * res_w / 2 + data[idx, :, :, 2:] = data[idx, :, :, 2:] * res_w / 2 + return data # [n_clips, -1, 17, 3] diff --git a/lib/data/datareader_mesh.py b/lib/data/datareader_mesh.py new file mode 100644 index 0000000..7cb1e87 --- /dev/null +++ b/lib/data/datareader_mesh.py @@ -0,0 +1,59 @@ +import numpy as np +import os, sys +import copy +from lib.utils.tools import read_pkl +from lib.utils.utils_data import split_clips + +class DataReaderMesh(object): + def __init__(self, n_frames, sample_stride, data_stride_train, data_stride_test, read_confidence=True, dt_root = 'data/mesh', dt_file = 'pw3d_det.pkl', res=[1920, 1920]): + self.split_id_train = None + self.split_id_test = None + self.dt_dataset = read_pkl('%s/%s' % (dt_root, dt_file)) + self.n_frames = n_frames + self.sample_stride = sample_stride + self.data_stride_train = data_stride_train + self.data_stride_test = data_stride_test + self.read_confidence = read_confidence + self.res = res + + def read_2d(self): + if self.res is not None: + res_w, res_h = self.res + offset = [1, res_h / res_w] + else: + res = np.array(self.dt_dataset['train']['img_hw'])[::self.sample_stride].astype(np.float32) + res_w, res_h = res.max(1)[:, None, None], res.max(1)[:, None, None] + offset = 1 + trainset = self.dt_dataset['train']['joint_2d'][::self.sample_stride, :, :2].astype(np.float32) # [N, 17, 2] + testset = self.dt_dataset['test']['joint_2d'][::self.sample_stride, :, :2].astype(np.float32) # [N, 17, 2] + # res_w, res_h = self.res + trainset = trainset / res_w * 2 - offset + testset = testset / res_w * 2 - offset + if self.read_confidence: + train_confidence = self.dt_dataset['train']['confidence'][::self.sample_stride].astype(np.float32) + test_confidence = self.dt_dataset['test']['confidence'][::self.sample_stride].astype(np.float32) + if len(train_confidence.shape)==2: + train_confidence = train_confidence[:,:,None] + test_confidence = test_confidence[:,:,None] + trainset = np.concatenate((trainset, train_confidence), axis=2) # [N, 17, 3] + testset = np.concatenate((testset, test_confidence), axis=2) # [N, 17, 3] + return trainset, testset + + def get_split_id(self): + if self.split_id_train is not None and self.split_id_test is not None: + return self.split_id_train, self.split_id_test + vid_list_train = self.dt_dataset['train']['source'][::self.sample_stride] + vid_list_test = self.dt_dataset['test']['source'][::self.sample_stride] + self.split_id_train = split_clips(vid_list_train, self.n_frames, self.data_stride_train) + self.split_id_test = split_clips(vid_list_test, self.n_frames, self.data_stride_test) + return self.split_id_train, self.split_id_test + + def get_sliced_data(self): + train_data, test_data = self.read_2d() + train_labels, test_labels = self.read_3d() + split_id_train, split_id_test = self.get_split_id() + train_data, test_data = train_data[split_id_train], test_data[split_id_test] # (N, 27, 17, 3) + train_labels, test_labels = train_labels[split_id_train], test_labels[split_id_test] # (N, 27, 17, 3) + return train_data, test_data, train_labels, test_labels + + \ No newline at end of file diff --git a/lib/data/dataset_action.py 
b/lib/data/dataset_action.py new file mode 100644 index 0000000..87bc5de --- /dev/null +++ b/lib/data/dataset_action.py @@ -0,0 +1,206 @@ +import torch +import numpy as np +import os +import random +import copy +from torch.utils.data import Dataset, DataLoader +from lib.utils.utils_data import crop_scale, resample +from lib.utils.tools import read_pkl + +def get_action_names(file_path = "data/action/ntu_actions.txt"): + f = open(file_path, "r") + s = f.read() + actions = s.split('\n') + action_names = [] + for a in actions: + action_names.append(a.split('.')[1][1:]) + return action_names + +def make_cam(x, img_shape): + ''' + Input: x (M x T x V x C) + img_shape (height, width) + ''' + h, w = img_shape + if w >= h: + x_cam = x / w * 2 - 1 + else: + x_cam = x / h * 2 - 1 + return x_cam + +def coco2h36m(x): + ''' + Input: x (M x T x V x C) + + COCO: {0-nose 1-Leye 2-Reye 3-Lear 4Rear 5-Lsho 6-Rsho 7-Lelb 8-Relb 9-Lwri 10-Rwri 11-Lhip 12-Rhip 13-Lkne 14-Rkne 15-Lank 16-Rank} + + H36M: + 0: 'root', + 1: 'rhip', + 2: 'rkne', + 3: 'rank', + 4: 'lhip', + 5: 'lkne', + 6: 'lank', + 7: 'belly', + 8: 'neck', + 9: 'nose', + 10: 'head', + 11: 'lsho', + 12: 'lelb', + 13: 'lwri', + 14: 'rsho', + 15: 'relb', + 16: 'rwri' + ''' + y = np.zeros(x.shape) + y[:,:,0,:] = (x[:,:,11,:] + x[:,:,12,:]) * 0.5 + y[:,:,1,:] = x[:,:,12,:] + y[:,:,2,:] = x[:,:,14,:] + y[:,:,3,:] = x[:,:,16,:] + y[:,:,4,:] = x[:,:,11,:] + y[:,:,5,:] = x[:,:,13,:] + y[:,:,6,:] = x[:,:,15,:] + y[:,:,8,:] = (x[:,:,5,:] + x[:,:,6,:]) * 0.5 + y[:,:,7,:] = (y[:,:,0,:] + y[:,:,8,:]) * 0.5 + y[:,:,9,:] = x[:,:,0,:] + y[:,:,10,:] = (x[:,:,1,:] + x[:,:,2,:]) * 0.5 + y[:,:,11,:] = x[:,:,5,:] + y[:,:,12,:] = x[:,:,7,:] + y[:,:,13,:] = x[:,:,9,:] + y[:,:,14,:] = x[:,:,6,:] + y[:,:,15,:] = x[:,:,8,:] + y[:,:,16,:] = x[:,:,10,:] + return y + +def random_move(data_numpy, + angle_range=[-10., 10.], + scale_range=[0.9, 1.1], + transform_range=[-0.1, 0.1], + move_time_candidate=[1]): + data_numpy = np.transpose(data_numpy, (3,1,2,0)) # M,T,V,C-> C,T,V,M + C, T, V, M = data_numpy.shape + move_time = random.choice(move_time_candidate) + node = np.arange(0, T, T * 1.0 / move_time).round().astype(int) + node = np.append(node, T) + num_node = len(node) + A = np.random.uniform(angle_range[0], angle_range[1], num_node) + S = np.random.uniform(scale_range[0], scale_range[1], num_node) + T_x = np.random.uniform(transform_range[0], transform_range[1], num_node) + T_y = np.random.uniform(transform_range[0], transform_range[1], num_node) + a = np.zeros(T) + s = np.zeros(T) + t_x = np.zeros(T) + t_y = np.zeros(T) + # linspace + for i in range(num_node - 1): + a[node[i]:node[i + 1]] = np.linspace( + A[i], A[i + 1], node[i + 1] - node[i]) * np.pi / 180 + s[node[i]:node[i + 1]] = np.linspace(S[i], S[i + 1], node[i + 1] - node[i]) + t_x[node[i]:node[i + 1]] = np.linspace(T_x[i], T_x[i + 1], node[i + 1] - node[i]) + t_y[node[i]:node[i + 1]] = np.linspace(T_y[i], T_y[i + 1], node[i + 1] - node[i]) + theta = np.array([[np.cos(a) * s, -np.sin(a) * s], + [np.sin(a) * s, np.cos(a) * s]]) + # perform transformation + for i_frame in range(T): + xy = data_numpy[0:2, i_frame, :, :] + new_xy = np.dot(theta[:, :, i_frame], xy.reshape(2, -1)) + new_xy[0] += t_x[i_frame] + new_xy[1] += t_y[i_frame] + data_numpy[0:2, i_frame, :, :] = new_xy.reshape(2, V, M) + data_numpy = np.transpose(data_numpy, (3,1,2,0)) # C,T,V,M -> M,T,V,C + return data_numpy + +def human_tracking(x): + M, T = x.shape[:2] + if M==1: + return x + else: + diff0 = np.sum(np.linalg.norm(x[0,1:] - x[0,:-1], axis=-1), 
axis=-1) # (T-1, V, C) -> (T-1) + diff1 = np.sum(np.linalg.norm(x[0,1:] - x[1,:-1], axis=-1), axis=-1) + x_new = np.zeros(x.shape) + sel = np.cumsum(diff0 > diff1) % 2 + sel = sel[:,None,None] + x_new[0][0] = x[0][0] + x_new[1][0] = x[1][0] + x_new[0,1:] = x[1,1:] * sel + x[0,1:] * (1-sel) + x_new[1,1:] = x[0,1:] * sel + x[1,1:] * (1-sel) + return x_new + +class ActionDataset(Dataset): + def __init__(self, data_path, data_split, n_frames=243, random_move=True, scale_range=[1,1], check_split=True): # data_split: train/test etc. + np.random.seed(0) + dataset = read_pkl(data_path) + if check_split: + assert data_split in dataset['split'].keys() + self.split = dataset['split'][data_split] + annotations = dataset['annotations'] + self.random_move = random_move + self.is_train = "train" in data_split or (check_split==False) + if "oneshot" in data_split: + self.is_train = False + self.scale_range = scale_range + motions = [] + labels = [] + for sample in annotations: + if check_split and (not sample['frame_dir'] in self.split): + continue + resample_id = resample(ori_len=sample['total_frames'], target_len=n_frames, randomness=self.is_train) + motion_cam = make_cam(x=sample['keypoint'], img_shape=sample['img_shape']) + motion_cam = human_tracking(motion_cam) + motion_cam = coco2h36m(motion_cam) + motion_conf = sample['keypoint_score'][..., None] + motion = np.concatenate((motion_cam[:,resample_id], motion_conf[:,resample_id]), axis=-1) + if motion.shape[0]==1: # Single person, make a fake zero person + fake = np.zeros(motion.shape) + motion = np.concatenate((motion, fake), axis=0) + motions.append(motion.astype(np.float32)) + labels.append(sample['label']) + self.motions = np.array(motions) + self.labels = np.array(labels) + + def __len__(self): + 'Denotes the total number of samples' + return len(self.motions) + + def __getitem__(self, index): + raise NotImplementedError + +class NTURGBD(ActionDataset): + def __init__(self, data_path, data_split, n_frames=243, random_move=True, scale_range=[1,1]): + super(NTURGBD, self).__init__(data_path, data_split, n_frames, random_move, scale_range) + + def __getitem__(self, idx): + 'Generates one sample of data' + motion, label = self.motions[idx], self.labels[idx] # (M,T,J,C) + if self.random_move: + motion = random_move(motion) + if self.scale_range: + result = crop_scale(motion, scale_range=self.scale_range) + else: + result = motion + return result.astype(np.float32), label + +class NTURGBD1Shot(ActionDataset): + def __init__(self, data_path, data_split, n_frames=243, random_move=True, scale_range=[1,1], check_split=False): + super(NTURGBD1Shot, self).__init__(data_path, data_split, n_frames, random_move, scale_range, check_split) + oneshot_classes = [0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108, 114] + new_classes = set(range(120)) - set(oneshot_classes) + old2new = {} + for i, cid in enumerate(new_classes): + old2new[cid] = i + filtered = [not (x in oneshot_classes) for x in self.labels] + self.motions = self.motions[filtered] + filtered_labels = self.labels[filtered] + self.labels = [old2new[x] for x in filtered_labels] + + def __getitem__(self, idx): + 'Generates one sample of data' + motion, label = self.motions[idx], self.labels[idx] # (M,T,J,C) + if self.random_move: + motion = random_move(motion) + if self.scale_range: + result = crop_scale(motion, scale_range=self.scale_range) + else: + result = motion + return result.astype(np.float32), label \ No newline at end of file diff --git a/lib/data/dataset_mesh.py 
b/lib/data/dataset_mesh.py new file mode 100644 index 0000000..c496a3a --- /dev/null +++ b/lib/data/dataset_mesh.py @@ -0,0 +1,97 @@ +import torch +import numpy as np +import glob +import os +import io +import random +import pickle +from torch.utils.data import Dataset, DataLoader +from lib.data.augmentation import Augmenter3D +from lib.utils.tools import read_pkl +from lib.utils.utils_data import flip_data, crop_scale +from lib.utils.utils_mesh import flip_thetas +from lib.utils.utils_smpl import SMPL +from torch.utils.data import Dataset, DataLoader +from lib.data.datareader_h36m import DataReaderH36M +from lib.data.datareader_mesh import DataReaderMesh +from lib.data.dataset_action import random_move + +class SMPLDataset(Dataset): + def __init__(self, args, data_split, dataset): # data_split: train/test; dataset: h36m, coco, pw3d + random.seed(0) + np.random.seed(0) + self.clip_len = args.clip_len + self.data_split = data_split + if dataset=="h36m": + datareader = DataReaderH36M(n_frames=self.clip_len, sample_stride=args.sample_stride, data_stride_train=args.data_stride, data_stride_test=self.clip_len, dt_root=args.data_root, dt_file=args.dt_file_h36m) + elif dataset=="coco": + datareader = DataReaderMesh(n_frames=1, sample_stride=args.sample_stride, data_stride_train=1, data_stride_test=1, dt_root=args.data_root, dt_file=args.dt_file_coco, res=[640, 640]) + elif dataset=="pw3d": + datareader = DataReaderMesh(n_frames=self.clip_len, sample_stride=args.sample_stride, data_stride_train=args.data_stride, data_stride_test=self.clip_len, dt_root=args.data_root, dt_file=args.dt_file_pw3d, res=[1920, 1920]) + else: + raise Exception("Mesh dataset undefined.") + + split_id_train, split_id_test = datareader.get_split_id() # Index of clips + train_data, test_data = datareader.read_2d() + train_data, test_data = train_data[split_id_train], test_data[split_id_test] # Input: (N, T, 17, 3) + self.motion_2d = {'train': train_data, 'test': test_data}[data_split] + + dt = datareader.dt_dataset + smpl_pose_train = dt['train']['smpl_pose'][split_id_train] # (N, T, 72) + smpl_shape_train = dt['train']['smpl_shape'][split_id_train] # (N, T, 10) + smpl_pose_test = dt['test']['smpl_pose'][split_id_test] # (N, T, 72) + smpl_shape_test = dt['test']['smpl_shape'][split_id_test] # (N, T, 10) + + self.motion_smpl_3d = {'train': {'pose': smpl_pose_train, 'shape': smpl_shape_train}, 'test': {'pose': smpl_pose_test, 'shape': smpl_shape_test}}[data_split] + self.smpl = SMPL( + args.data_root, + batch_size=1, + ) + + def __len__(self): + 'Denotes the total number of samples' + return len(self.motion_2d) + + def __getitem__(self, index): + raise NotImplementedError + +class MotionSMPL(SMPLDataset): + def __init__(self, args, data_split, dataset): + super(MotionSMPL, self).__init__(args, data_split, dataset) + self.flip = args.flip + + def __getitem__(self, index): + 'Generates one sample of data' + # Select sample + motion_2d = self.motion_2d[index] # motion_2d: (T,17,3) + motion_2d[:,:,2] = np.clip(motion_2d[:,:,2], 0, 1) + motion_smpl_pose = self.motion_smpl_3d['pose'][index].reshape(-1, 24, 3) # motion_smpl_3d: (T, 24, 3) + motion_smpl_shape = self.motion_smpl_3d['shape'][index] # motion_smpl_3d: (T,10) + + if self.data_split=="train": + if self.flip and random.random() > 0.5: # Training augmentation - random flipping + motion_2d = flip_data(motion_2d) + motion_smpl_pose = flip_thetas(motion_smpl_pose) + + + motion_smpl_pose = torch.from_numpy(motion_smpl_pose).reshape(-1, 72).float() + motion_smpl_shape = 
torch.from_numpy(motion_smpl_shape).reshape(-1, 10).float() + motion_smpl = self.smpl( + betas=motion_smpl_shape, + body_pose=motion_smpl_pose[:, 3:], + global_orient=motion_smpl_pose[:, :3], + pose2rot=True + ) + motion_verts = motion_smpl.vertices.detach()*1000.0 + J_regressor = self.smpl.J_regressor_h36m + J_regressor_batch = J_regressor[None, :].expand(motion_verts.shape[0], -1, -1).to(motion_verts.device) + motion_3d_reg = torch.matmul(J_regressor_batch, motion_verts) # motion_3d: (T,17,3) + motion_verts = motion_verts - motion_3d_reg[:, :1, :] + motion_3d_reg = motion_3d_reg - motion_3d_reg[:, :1, :] # motion_3d: (T,17,3) + motion_theta = torch.cat((motion_smpl_pose, motion_smpl_shape), -1) + motion_smpl_3d = { + 'theta': motion_theta, # smpl pose and shape + 'kp_3d': motion_3d_reg, # 3D keypoints + 'verts': motion_verts, # 3D mesh vertices + } + return motion_2d, motion_smpl_3d \ No newline at end of file diff --git a/lib/data/dataset_motion_2d.py b/lib/data/dataset_motion_2d.py new file mode 100644 index 0000000..b136f33 --- /dev/null +++ b/lib/data/dataset_motion_2d.py @@ -0,0 +1,148 @@ +import sys +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import Dataset, DataLoader +import numpy as np +import os +import random +import copy +import json +from collections import defaultdict +from lib.utils.utils_data import crop_scale, flip_data, resample, split_clips + +def posetrack2h36m(x): + ''' + Input: x (T x V x C) + + PoseTrack keypoints = [ 'nose', + 'head_bottom', + 'head_top', + 'left_ear', + 'right_ear', + 'left_shoulder', + 'right_shoulder', + 'left_elbow', + 'right_elbow', + 'left_wrist', + 'right_wrist', + 'left_hip', + 'right_hip', + 'left_knee', + 'right_knee', + 'left_ankle', + 'right_ankle'] + H36M: + 0: 'root', + 1: 'rhip', + 2: 'rkne', + 3: 'rank', + 4: 'lhip', + 5: 'lkne', + 6: 'lank', + 7: 'belly', + 8: 'neck', + 9: 'nose', + 10: 'head', + 11: 'lsho', + 12: 'lelb', + 13: 'lwri', + 14: 'rsho', + 15: 'relb', + 16: 'rwri' + ''' + y = np.zeros(x.shape) + y[:,0,:] = (x[:,11,:] + x[:,12,:]) * 0.5 + y[:,1,:] = x[:,12,:] + y[:,2,:] = x[:,14,:] + y[:,3,:] = x[:,16,:] + y[:,4,:] = x[:,11,:] + y[:,5,:] = x[:,13,:] + y[:,6,:] = x[:,15,:] + y[:,8,:] = x[:,1,:] + y[:,7,:] = (y[:,0,:] + y[:,8,:]) * 0.5 + y[:,9,:] = x[:,0,:] + y[:,10,:] = x[:,2,:] + y[:,11,:] = x[:,5,:] + y[:,12,:] = x[:,7,:] + y[:,13,:] = x[:,9,:] + y[:,14,:] = x[:,6,:] + y[:,15,:] = x[:,8,:] + y[:,16,:] = x[:,10,:] + y[:,0,2] = np.minimum(x[:,11,2], x[:,12,2]) + y[:,7,2] = np.minimum(y[:,0,2], y[:,8,2]) + return y + + +class PoseTrackDataset2D(Dataset): + def __init__(self, flip=True, scale_range=[0.25, 1]): + super(PoseTrackDataset2D, self).__init__() + self.flip = flip + data_root = "data/motion2d/posetrack18_annotations/train/" + file_list = sorted(os.listdir(data_root)) + all_motions = [] + all_motions_filtered = [] + self.scale_range = scale_range + for filename in file_list: + with open(os.path.join(data_root, filename), 'r') as file: + json_dict = json.load(file) + annots = json_dict['annotations'] + imgs = json_dict['images'] + motions = defaultdict(list) + for annot in annots: + tid = annot['track_id'] + pose2d = np.array(annot['keypoints']).reshape(-1,3) + motions[tid].append(pose2d) + all_motions += list(motions.values()) + for motion in all_motions: + if len(motion)<30: + continue + motion = np.array(motion[:30]) + if np.sum(motion[:,:,2]) <= 306: # Valid joint num threshold + continue + motion = crop_scale(motion, self.scale_range) + motion = 
posetrack2h36m(motion) + motion[motion[:,:,2]==0] = 0 + if np.sum(motion[:,0,2]) < 30: + continue # Root all visible (needed for framewise rootrel) + all_motions_filtered.append(motion) + all_motions_filtered = np.array(all_motions_filtered) + self.motions_2d = all_motions_filtered + + def __len__(self): + 'Denotes the total number of samples' + return len(self.motions_2d) + + def __getitem__(self, index): + 'Generates one sample of data' + motion_2d = torch.FloatTensor(self.motions_2d[index]) + if self.flip and random.random()>0.5: + motion_2d = flip_data(motion_2d) + return motion_2d, motion_2d + +class InstaVDataset2D(Dataset): + def __init__(self, n_frames=81, data_stride=27, flip=True, valid_threshold=0.0, scale_range=[0.25, 1]): + super(InstaVDataset2D, self).__init__() + self.flip = flip + self.scale_range = scale_range + motion_all = np.load('data/motion2d/InstaVariety/motion_all.npy') + id_all = np.load('data/motion2d/InstaVariety/id_all.npy') + split_id = split_clips(id_all, n_frames, data_stride) + motions_2d = motion_all[split_id] # [N, T, 17, 3] + valid_idx = (motions_2d[:,0,0,2] > valid_threshold) + self.motions_2d = motions_2d[valid_idx] + + def __len__(self): + 'Denotes the total number of samples' + return len(self.motions_2d) + + def __getitem__(self, index): + 'Generates one sample of data' + motion_2d = self.motions_2d[index] + motion_2d = crop_scale(motion_2d, self.scale_range) + motion_2d[motion_2d[:,:,2]==0] = 0 + if self.flip and random.random()>0.5: + motion_2d = flip_data(motion_2d) + motion_2d = torch.FloatTensor(motion_2d) + return motion_2d, motion_2d + \ No newline at end of file diff --git a/lib/data/dataset_motion_3d.py b/lib/data/dataset_motion_3d.py new file mode 100644 index 0000000..a2de10d --- /dev/null +++ b/lib/data/dataset_motion_3d.py @@ -0,0 +1,68 @@ +import torch +import numpy as np +import glob +import os +import io +import random +import pickle +from torch.utils.data import Dataset, DataLoader +from lib.data.augmentation import Augmenter3D +from lib.utils.tools import read_pkl +from lib.utils.utils_data import flip_data + +class MotionDataset(Dataset): + def __init__(self, args, subset_list, data_split): # data_split: train/test + np.random.seed(0) + self.data_root = args.data_root + self.subset_list = subset_list + self.data_split = data_split + file_list_all = [] + for subset in self.subset_list: + data_path = os.path.join(self.data_root, subset, self.data_split) + motion_list = sorted(os.listdir(data_path)) + for i in motion_list: + file_list_all.append(os.path.join(data_path, i)) + self.file_list = file_list_all + + def __len__(self): + 'Denotes the total number of samples' + return len(self.file_list) + + def __getitem__(self, index): + raise NotImplementedError + +class MotionDataset3D(MotionDataset): + def __init__(self, args, subset_list, data_split): + super(MotionDataset3D, self).__init__(args, subset_list, data_split) + self.flip = args.flip + self.synthetic = args.synthetic + self.aug = Augmenter3D(args) + self.gt_2d = args.gt_2d + + def __getitem__(self, index): + 'Generates one sample of data' + # Select sample + file_path = self.file_list[index] + motion_file = read_pkl(file_path) + motion_3d = motion_file["data_label"] + if self.data_split=="train": + if self.synthetic or self.gt_2d: + motion_3d = self.aug.augment3D(motion_3d) + motion_2d = np.zeros(motion_3d.shape, dtype=np.float32) + motion_2d[:,:,:2] = motion_3d[:,:,:2] + motion_2d[:,:,2] = 1 # No 2D detection, use GT xy and c=1. 
+ elif motion_file["data_input"] is not None: # Have 2D detection + motion_2d = motion_file["data_input"] + if self.flip and random.random() > 0.5: # Training augmentation - random flipping + motion_2d = flip_data(motion_2d) + motion_3d = flip_data(motion_3d) + else: + raise ValueError('Training illegal.') + elif self.data_split=="test": + motion_2d = motion_file["data_input"] + if self.gt_2d: + motion_2d[:,:,:2] = motion_3d[:,:,:2] + motion_2d[:,:,2] = 1 + else: + raise ValueError('Data split unknown.') + return torch.FloatTensor(motion_2d), torch.FloatTensor(motion_3d) \ No newline at end of file diff --git a/lib/data/dataset_wild.py b/lib/data/dataset_wild.py new file mode 100644 index 0000000..1176b4e --- /dev/null +++ b/lib/data/dataset_wild.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- +# @Author: Raphael +# @Date: 2024-10-09 11:02:29 +# @Last Modified by: Raphael +# @Last Modified time: 2024-10-14 15:26:52 +import torch +import numpy as np +import ipdb +import glob +import os +import io +import math +import random +import json +import pickle +import math +from torch.utils.data import Dataset, DataLoader +from lib.utils.utils_data import crop_scale + +def halpe2h36m(x): + ''' + Input: x (T x V x C) + //Halpe 26 body keypoints + {0, "Nose"}, + {1, "LEye"}, + {2, "REye"}, + {3, "LEar"}, + {4, "REar"}, + {5, "LShoulder"}, + {6, "RShoulder"}, + {7, "LElbow"}, + {8, "RElbow"}, + {9, "LWrist"}, + {10, "RWrist"}, + {11, "LHip"}, + {12, "RHip"}, + {13, "LKnee"}, + {14, "Rknee"}, + {15, "LAnkle"}, + {16, "RAnkle"}, + {17, "Head"}, + {18, "Neck"}, + {19, "Hip"}, + {20, "LBigToe"}, + {21, "RBigToe"}, + {22, "LSmallToe"}, + {23, "RSmallToe"}, + {24, "LHeel"}, + {25, "RHeel"}, + ''' + T, V, C = x.shape + y = np.zeros([T,17,C]) + y[:,0,:] = x[:,19,:] + y[:,1,:] = x[:,12,:] + y[:,2,:] = x[:,14,:] + y[:,3,:] = x[:,16,:] + y[:,4,:] = x[:,11,:] + y[:,5,:] = x[:,13,:] + y[:,6,:] = x[:,15,:] + y[:,7,:] = (x[:,18,:] + x[:,19,:]) * 0.5 + y[:,8,:] = x[:,18,:] + y[:,9,:] = x[:,0,:] + y[:,10,:] = x[:,17,:] + y[:,11,:] = x[:,5,:] + y[:,12,:] = x[:,7,:] + y[:,13,:] = x[:,9,:] + y[:,14,:] = x[:,6,:] + y[:,15,:] = x[:,8,:] + y[:,16,:] = x[:,10,:] + return y + + +def coco2h36m(x): + ''' + Input: x (M x T x V x C) + + COCO: {0-nose 1-Leye 2-Reye 3-Lear 4Rear 5-Lsho 6-Rsho 7-Lelb 8-Relb 9-Lwri 10-Rwri 11-Lhip 12-Rhip 13-Lkne 14-Rkne 15-Lank 16-Rank} + + H36M: + 0: 'root', + 1: 'rhip', + 2: 'rkne', + 3: 'rank', + 4: 'lhip', + 5: 'lkne', + 6: 'lank', + 7: 'belly', + 8: 'neck', + 9: 'nose', + 10: 'head', + 11: 'lsho', + 12: 'lelb', + 13: 'lwri', + 14: 'rsho', + 15: 'relb', + 16: 'rwri' + ''' + y = np.zeros(x.shape) + y[:,0,:] = (x[:,11,:] + x[:,12,:]) * 0.5 + y[:,1,:] = x[:,12,:] + y[:,2,:] = x[:,14,:] + y[:,3,:] = x[:,16,:] + y[:,4,:] = x[:,11,:] + y[:,5,:] = x[:,13,:] + y[:,6,:] = x[:,15,:] + y[:,8,:] = (x[:,5,:] + x[:,6,:]) * 0.5 + y[:,7,:] = (y[:,0,:] + y[:,8,:]) * 0.5 + y[:,9,:] = x[:,0,:] + y[:,10,:] = (x[:,1,:] + x[:,2,:]) * 0.5 + y[:,11,:] = x[:,5,:] + y[:,12,:] = x[:,7,:] + y[:,13,:] = x[:,9,:] + y[:,14,:] = x[:,6,:] + y[:,15,:] = x[:,8,:] + y[:,16,:] = x[:,10,:] + return y + + +def read_input(json_path, vid_size, scale_range, focus): + with open(json_path, "r") as read_file: + results = json.load(read_file) + kpts_all = [] + image_ids = [] + kpts_3d_all = [] + for item in results: + if focus!=None and item['idx']!=focus: + continue + kpts = np.array(item['keypoints']).reshape([-1,3]) + kpts_all.append(kpts) + image_ids.append(item["image_id"]) + if "keypoints_3d" in item.keys(): + kpts_3d = 
np.array(item['keypoints_3d']).reshape([-1,3]) + kpts_3d_all.append(kpts_3d) + + kpts_all = np.array(kpts_all) + kpts_3d_all = np.array(kpts_3d_all) + + print(kpts_all.shape) + + if kpts_all.shape[1] == 26: + kpts_all = halpe2h36m(kpts_all) + if len(kpts_3d_all) > 0: + assert(kpts_3d_all.shape[1] == 26) + kpts_3d_all = halpe2h36m(kpts_3d_all) + + elif kpts_all.shape[1] == 17: + print("WARNING : Using COCO17 input !") + kpts_all = coco2h36m(kpts_all) + if len(kpts_3d_all) > 0: + assert(kpts_3d_all.shape[1] == 17) + kpts_3d_all = coco2h36m(kpts_3d_all) + else: + print("Error, expecting kpts_all of shape [..., 17 or 26, ...]") + exit(0) + + + if vid_size: + w, h = vid_size + scale = min(w,h) / 2.0 + kpts_all[:,:,:2] = kpts_all[:,:,:2] - np.array([w, h]) / 2.0 + kpts_all[:,:,:2] = kpts_all[:,:,:2] / scale + motion = kpts_all + + if scale_range: + motion = crop_scale(kpts_all, scale_range) + + motion_3d = kpts_3d_all.astype(np.float32) + + return motion.astype(np.float32), image_ids, motion_3d + +class WildDetDataset(Dataset): + def __init__(self, json_path, clip_len=243, vid_size=None, scale_range=None, focus=None): + self.json_path = json_path + self.clip_len = clip_len + self.vid_all, self.image_ids, self.motion_3d = read_input(json_path, vid_size, scale_range, focus) + + def __len__(self): + 'Denotes the total number of samples' + return math.ceil(len(self.vid_all) / self.clip_len) + + def __getitem__(self, index): + 'Generates one sample of data' + st = index*self.clip_len + end = min((index+1)*self.clip_len, len(self.vid_all)) + return self.vid_all[st:end] \ No newline at end of file diff --git a/lib/model/DSTformer.py b/lib/model/DSTformer.py new file mode 100644 index 0000000..2af2388 --- /dev/null +++ b/lib/model/DSTformer.py @@ -0,0 +1,362 @@ +import torch +import torch.nn as nn +import math +import warnings +import random +import numpy as np +from collections import OrderedDict +from functools import partial +from itertools import repeat +from lib.model.drop import DropPath + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + # type: (Tensor, float, float, float, float) -> Tensor + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. 
The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +class MLP(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., st_mode='vanilla'): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim ** -0.5 + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.mode = st_mode + if self.mode == 'parallel': + self.ts_attn = nn.Linear(dim*2, dim*2) + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + else: + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.proj_drop = nn.Dropout(proj_drop) + + self.attn_count_s = None + self.attn_count_t = None + + def forward(self, x, seqlen=1): + B, N, C = x.shape + + if self.mode == 'series': + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + x = self.forward_spatial(q, k, v) + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + x = self.forward_temporal(q, k, v, seqlen=seqlen) + elif self.mode == 'parallel': + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + x_t = self.forward_temporal(q, k, v, seqlen=seqlen) + x_s = self.forward_spatial(q, k, v) + + alpha = torch.cat([x_s, x_t], dim=-1) + alpha = alpha.mean(dim=1, keepdim=True) + alpha = self.ts_attn(alpha).reshape(B, 1, C, 2) + alpha = alpha.softmax(dim=-1) + x = x_t * alpha[:,:,:,1] + x_s * alpha[:,:,:,0] + elif self.mode == 'coupling': + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + x = self.forward_coupling(q, k, v, seqlen=seqlen) + elif self.mode == 'vanilla': + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + x = self.forward_spatial(q, k, v) + elif self.mode == 
'temporal': + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + x = self.forward_temporal(q, k, v, seqlen=seqlen) + elif self.mode == 'spatial': + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + x = self.forward_spatial(q, k, v) + else: + raise NotImplementedError(self.mode) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def reshape_T(self, x, seqlen=1, inverse=False): + if not inverse: + N, C = x.shape[-2:] + x = x.reshape(-1, seqlen, self.num_heads, N, C).transpose(1,2) + x = x.reshape(-1, self.num_heads, seqlen*N, C) #(B, H, TN, c) + else: + TN, C = x.shape[-2:] + x = x.reshape(-1, self.num_heads, seqlen, TN // seqlen, C).transpose(1,2) + x = x.reshape(-1, self.num_heads, TN // seqlen, C) #(BT, H, N, C) + return x + + def forward_coupling(self, q, k, v, seqlen=8): + BT, _, N, C = q.shape + q = self.reshape_T(q, seqlen) + k = self.reshape_T(k, seqlen) + v = self.reshape_T(v, seqlen) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = attn @ v + x = self.reshape_T(x, seqlen, inverse=True) + x = x.transpose(1,2).reshape(BT, N, C*self.num_heads) + return x + + def forward_spatial(self, q, k, v): + B, _, N, C = q.shape + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = attn @ v + x = x.transpose(1,2).reshape(B, N, C*self.num_heads) + return x + + def forward_temporal(self, q, k, v, seqlen=8): + B, _, N, C = q.shape + qt = q.reshape(-1, seqlen, self.num_heads, N, C).permute(0, 2, 3, 1, 4) #(B, H, N, T, C) + kt = k.reshape(-1, seqlen, self.num_heads, N, C).permute(0, 2, 3, 1, 4) #(B, H, N, T, C) + vt = v.reshape(-1, seqlen, self.num_heads, N, C).permute(0, 2, 3, 1, 4) #(B, H, N, T, C) + + attn = (qt @ kt.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = attn @ vt #(B, H, N, T, C) + x = x.permute(0, 3, 2, 1, 4).reshape(B, N, C*self.num_heads) + return x + + def count_attn(self, attn): + attn = attn.detach().cpu().numpy() + attn = attn.mean(axis=1) + attn_t = attn[:, :, 1].mean(axis=1) + attn_s = attn[:, :, 0].mean(axis=1) + if self.attn_count_s is None: + self.attn_count_s = attn_s + self.attn_count_t = attn_t + else: + self.attn_count_s = np.concatenate([self.attn_count_s, attn_s], axis=0) + self.attn_count_t = np.concatenate([self.attn_count_t, attn_t], axis=0) + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., mlp_out_ratio=1., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, st_mode='stage_st', att_fuse=False): + super().__init__() + # assert 'stage' in st_mode + self.st_mode = st_mode + self.norm1_s = norm_layer(dim) + self.norm1_t = norm_layer(dim) + self.attn_s = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, st_mode="spatial") + self.attn_t = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, st_mode="temporal") + + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2_s = norm_layer(dim) + self.norm2_t = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + mlp_out_dim = int(dim * mlp_out_ratio) + self.mlp_s = MLP(in_features=dim, hidden_features=mlp_hidden_dim, out_features=mlp_out_dim, act_layer=act_layer, drop=drop) + self.mlp_t = MLP(in_features=dim, hidden_features=mlp_hidden_dim, out_features=mlp_out_dim, act_layer=act_layer, drop=drop) + self.att_fuse = att_fuse + if self.att_fuse: + self.ts_attn = nn.Linear(dim*2, dim*2) + def forward(self, x, seqlen=1): + if self.st_mode=='stage_st': + x = x + self.drop_path(self.attn_s(self.norm1_s(x), seqlen)) + x = x + self.drop_path(self.mlp_s(self.norm2_s(x))) + x = x + self.drop_path(self.attn_t(self.norm1_t(x), seqlen)) + x = x + self.drop_path(self.mlp_t(self.norm2_t(x))) + elif self.st_mode=='stage_ts': + x = x + self.drop_path(self.attn_t(self.norm1_t(x), seqlen)) + x = x + self.drop_path(self.mlp_t(self.norm2_t(x))) + x = x + self.drop_path(self.attn_s(self.norm1_s(x), seqlen)) + x = x + self.drop_path(self.mlp_s(self.norm2_s(x))) + elif self.st_mode=='stage_para': + x_t = x + self.drop_path(self.attn_t(self.norm1_t(x), seqlen)) + x_t = x_t + self.drop_path(self.mlp_t(self.norm2_t(x_t))) + x_s = x + self.drop_path(self.attn_s(self.norm1_s(x), seqlen)) + x_s = x_s + self.drop_path(self.mlp_s(self.norm2_s(x_s))) + if self.att_fuse: + # x_s, x_t: [BF, J, dim] + alpha = torch.cat([x_s, x_t], dim=-1) + BF, J = alpha.shape[:2] + # alpha = alpha.mean(dim=1, keepdim=True) + alpha = self.ts_attn(alpha).reshape(BF, J, -1, 2) + alpha = alpha.softmax(dim=-1) + x = x_t * alpha[:,:,:,1] + x_s * alpha[:,:,:,0] + else: + x = (x_s + x_t)*0.5 + else: + raise NotImplementedError(self.st_mode) + return x + +class DSTformer(nn.Module): + def __init__(self, dim_in=3, dim_out=3, dim_feat=256, dim_rep=512, + depth=5, num_heads=8, mlp_ratio=4, + num_joints=17, maxlen=243, + qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm, att_fuse=True): + super().__init__() + self.dim_out = dim_out + self.dim_feat = dim_feat + self.joints_embed = nn.Linear(dim_in, dim_feat) + self.pos_drop = nn.Dropout(p=drop_rate) + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks_st = nn.ModuleList([ + Block( + dim=dim_feat, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + st_mode="stage_st") + for i in range(depth)]) + self.blocks_ts = nn.ModuleList([ + Block( + dim=dim_feat, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + st_mode="stage_ts") + for i in range(depth)]) + self.norm = norm_layer(dim_feat) + if dim_rep: + self.pre_logits = nn.Sequential(OrderedDict([ + ('fc', nn.Linear(dim_feat, dim_rep)), + ('act', nn.Tanh()) + ])) + else: + self.pre_logits = nn.Identity() + self.head = nn.Linear(dim_rep, dim_out) if dim_out > 0 else nn.Identity() + self.temp_embed = nn.Parameter(torch.zeros(1, maxlen, 1, dim_feat)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_joints, dim_feat)) + trunc_normal_(self.temp_embed, std=.02) + trunc_normal_(self.pos_embed, std=.02) + self.apply(self._init_weights) + self.att_fuse = att_fuse + if self.att_fuse: + self.ts_attn = nn.ModuleList([nn.Linear(dim_feat*2, 2) for i in range(depth)]) + for i in range(depth): + 
self.ts_attn[i].weight.data.fill_(0) + self.ts_attn[i].bias.data.fill_(0.5) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def get_classifier(self): + return self.head + + def reset_classifier(self, dim_out, global_pool=''): + self.dim_out = dim_out + self.head = nn.Linear(self.dim_feat, dim_out) if dim_out > 0 else nn.Identity() + + def forward(self, x, return_rep=False): + B, F, J, C = x.shape + x = x.reshape(-1, J, C) + BF = x.shape[0] + x = self.joints_embed(x) + x = x + self.pos_embed + _, J, C = x.shape + x = x.reshape(-1, F, J, C) + self.temp_embed[:,:F,:,:] + x = x.reshape(BF, J, C) + x = self.pos_drop(x) + alphas = [] + for idx, (blk_st, blk_ts) in enumerate(zip(self.blocks_st, self.blocks_ts)): + x_st = blk_st(x, F) + x_ts = blk_ts(x, F) + if self.att_fuse: + att = self.ts_attn[idx] + alpha = torch.cat([x_st, x_ts], dim=-1) + BF, J = alpha.shape[:2] + alpha = att(alpha) + alpha = alpha.softmax(dim=-1) + x = x_st * alpha[:,:,0:1] + x_ts * alpha[:,:,1:2] + else: + x = (x_st + x_ts)*0.5 + x = self.norm(x) + x = x.reshape(B, F, J, -1) + x = self.pre_logits(x) # [B, F, J, dim_feat] + if return_rep: + return x + x = self.head(x) + return x + + def get_representation(self, x): + return self.forward(x, return_rep=True) + \ No newline at end of file diff --git a/lib/model/drop.py b/lib/model/drop.py new file mode 100644 index 0000000..efbed35 --- /dev/null +++ b/lib/model/drop.py @@ -0,0 +1,43 @@ +""" DropBlock, DropPath +PyTorch implementations of DropBlock and DropPath (Stochastic Depth) regularization layers. +Papers: +DropBlock: A regularization method for convolutional networks (https://arxiv.org/abs/1810.12890) +Deep Networks with Stochastic Depth (https://arxiv.org/abs/1603.09382) +Code: +DropBlock impl inspired by two Tensorflow impl that I liked: + - https://github.com/tensorflow/tpu/blob/master/models/official/resnet/resnet_model.py#L74 + - https://github.com/clovaai/assembled-cnn/blob/master/nets/blocks.py +Hacked together by / Copyright 2020 Ross Wightman +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +def drop_path(x, drop_prob: float = 0., training: bool = False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) \ No newline at end of file diff --git a/lib/model/loss.py b/lib/model/loss.py new file mode 100644 index 0000000..4397ce1 --- /dev/null +++ b/lib/model/loss.py @@ -0,0 +1,204 @@ +import torch +import torch.nn as nn +import numpy as np +import torch.nn.functional as F + +# Numpy-based errors + +def mpjpe(predicted, target): + """ + Mean per-joint position error (i.e. mean Euclidean distance), + often referred to as "Protocol #1" in many papers. + """ + assert predicted.shape == target.shape + return np.mean(np.linalg.norm(predicted - target, axis=len(target.shape)-1), axis=1) + +def p_mpjpe(predicted, target): + """ + Pose error: MPJPE after rigid alignment (scale, rotation, and translation), + often referred to as "Protocol #2" in many papers. + """ + assert predicted.shape == target.shape + + muX = np.mean(target, axis=1, keepdims=True) + muY = np.mean(predicted, axis=1, keepdims=True) + + X0 = target - muX + Y0 = predicted - muY + + normX = np.sqrt(np.sum(X0**2, axis=(1, 2), keepdims=True)) + normY = np.sqrt(np.sum(Y0**2, axis=(1, 2), keepdims=True)) + + X0 /= normX + Y0 /= normY + + H = np.matmul(X0.transpose(0, 2, 1), Y0) + U, s, Vt = np.linalg.svd(H) + V = Vt.transpose(0, 2, 1) + R = np.matmul(V, U.transpose(0, 2, 1)) + + # Avoid improper rotations (reflections), i.e. rotations with det(R) = -1 + sign_detR = np.sign(np.expand_dims(np.linalg.det(R), axis=1)) + V[:, :, -1] *= sign_detR + s[:, -1] *= sign_detR.flatten() + R = np.matmul(V, U.transpose(0, 2, 1)) # Rotation + tr = np.expand_dims(np.sum(s, axis=1, keepdims=True), axis=2) + a = tr * normX / normY # Scale + t = muX - a*np.matmul(muY, R) # Translation + # Perform rigid transformation on the input + predicted_aligned = a*np.matmul(predicted, R) + t + # Return MPJPE + return np.mean(np.linalg.norm(predicted_aligned - target, axis=len(target.shape)-1), axis=1) + + +# PyTorch-based errors (for losses) + +def loss_mpjpe(predicted, target): + """ + Mean per-joint position error (i.e. mean Euclidean distance), + often referred to as "Protocol #1" in many papers. + """ + assert predicted.shape == target.shape + return torch.mean(torch.norm(predicted - target, dim=len(target.shape)-1)) + +def weighted_mpjpe(predicted, target, w): + """ + Weighted mean per-joint position error (i.e. 
mean Euclidean distance) + """ + assert predicted.shape == target.shape + assert w.shape[0] == predicted.shape[0] + return torch.mean(w * torch.norm(predicted - target, dim=len(target.shape)-1)) + +def loss_2d_weighted(predicted, target, conf): + assert predicted.shape == target.shape + predicted_2d = predicted[:,:,:,:2] + target_2d = target[:,:,:,:2] + diff = (predicted_2d - target_2d) * conf + return torch.mean(torch.norm(diff, dim=-1)) + +def n_mpjpe(predicted, target): + """ + Normalized MPJPE (scale only), adapted from: + https://github.com/hrhodin/UnsupervisedGeometryAwareRepresentationLearning/blob/master/losses/poses.py + """ + assert predicted.shape == target.shape + norm_predicted = torch.mean(torch.sum(predicted**2, dim=3, keepdim=True), dim=2, keepdim=True) + norm_target = torch.mean(torch.sum(target*predicted, dim=3, keepdim=True), dim=2, keepdim=True) + scale = norm_target / norm_predicted + return loss_mpjpe(scale * predicted, target) + +def weighted_bonelen_loss(predict_3d_length, gt_3d_length): + loss_length = 0.001 * torch.pow(predict_3d_length - gt_3d_length, 2).mean() + return loss_length + +def weighted_boneratio_loss(predict_3d_length, gt_3d_length): + loss_length = 0.1 * torch.pow((predict_3d_length - gt_3d_length)/gt_3d_length, 2).mean() + return loss_length + +def get_limb_lens(x): + ''' + Input: (N, T, 17, 3) + Output: (N, T, 16) + ''' + limbs_id = [[0,1], [1,2], [2,3], + [0,4], [4,5], [5,6], + [0,7], [7,8], [8,9], [9,10], + [8,11], [11,12], [12,13], + [8,14], [14,15], [15,16] + ] + limbs = x[:,:,limbs_id,:] + limbs = limbs[:,:,:,0,:]-limbs[:,:,:,1,:] + limb_lens = torch.norm(limbs, dim=-1) + return limb_lens + +def loss_limb_var(x): + ''' + Input: (N, T, 17, 3) + ''' + if x.shape[1]<=1: + return torch.FloatTensor(1).fill_(0.)[0].to(x.device) + limb_lens = get_limb_lens(x) + limb_lens_var = torch.var(limb_lens, dim=1) + limb_loss_var = torch.mean(limb_lens_var) + return limb_loss_var + +def loss_limb_gt(x, gt): + ''' + Input: (N, T, 17, 3), (N, T, 17, 3) + ''' + limb_lens_x = get_limb_lens(x) + limb_lens_gt = get_limb_lens(gt) # (N, T, 16) + return nn.L1Loss()(limb_lens_x, limb_lens_gt) + +def loss_velocity(predicted, target): + """ + Mean per-joint velocity error (i.e. 
mean Euclidean distance of the 1st derivative) + """ + assert predicted.shape == target.shape + if predicted.shape[1]<=1: + return torch.FloatTensor(1).fill_(0.)[0].to(predicted.device) + velocity_predicted = predicted[:,1:] - predicted[:,:-1] + velocity_target = target[:,1:] - target[:,:-1] + return torch.mean(torch.norm(velocity_predicted - velocity_target, dim=-1)) + +def loss_joint(predicted, target): + assert predicted.shape == target.shape + return nn.L1Loss()(predicted, target) + +def get_angles(x): + ''' + Input: (N, T, 17, 3) + Output: (N, T, 16) + ''' + limbs_id = [[0,1], [1,2], [2,3], + [0,4], [4,5], [5,6], + [0,7], [7,8], [8,9], [9,10], + [8,11], [11,12], [12,13], + [8,14], [14,15], [15,16] + ] + angle_id = [[ 0, 3], + [ 0, 6], + [ 3, 6], + [ 0, 1], + [ 1, 2], + [ 3, 4], + [ 4, 5], + [ 6, 7], + [ 7, 10], + [ 7, 13], + [ 8, 13], + [10, 13], + [ 7, 8], + [ 8, 9], + [10, 11], + [11, 12], + [13, 14], + [14, 15] ] + eps = 1e-7 + limbs = x[:,:,limbs_id,:] + limbs = limbs[:,:,:,0,:]-limbs[:,:,:,1,:] + angles = limbs[:,:,angle_id,:] + angle_cos = F.cosine_similarity(angles[:,:,:,0,:], angles[:,:,:,1,:], dim=-1) + return torch.acos(angle_cos.clamp(-1+eps, 1-eps)) + +def loss_angle(x, gt): + ''' + Input: (N, T, 17, 3), (N, T, 17, 3) + ''' + limb_angles_x = get_angles(x) + limb_angles_gt = get_angles(gt) + return nn.L1Loss()(limb_angles_x, limb_angles_gt) + +def loss_angle_velocity(x, gt): + """ + Mean per-angle velocity error (i.e. mean Euclidean distance of the 1st derivative) + """ + assert x.shape == gt.shape + if x.shape[1]<=1: + return torch.FloatTensor(1).fill_(0.)[0].to(x.device) + x_a = get_angles(x) + gt_a = get_angles(gt) + x_av = x_a[:,1:] - x_a[:,:-1] + gt_av = gt_a[:,1:] - gt_a[:,:-1] + return nn.L1Loss()(x_av, gt_av) + diff --git a/lib/model/loss_mesh.py b/lib/model/loss_mesh.py new file mode 100644 index 0000000..82f615f --- /dev/null +++ b/lib/model/loss_mesh.py @@ -0,0 +1,68 @@ +import torch +import torch.nn as nn +import ipdb +from lib.utils.utils_mesh import batch_rodrigues +from lib.model.loss import * + +class MeshLoss(nn.Module): + def __init__( + self, + loss_type='MSE', + device='cuda', + ): + super(MeshLoss, self).__init__() + self.device = device + self.loss_type = loss_type + if loss_type == 'MSE': + self.criterion_keypoints = nn.MSELoss(reduction='none').to(self.device) + self.criterion_regr = nn.MSELoss().to(self.device) + elif loss_type == 'L1': + self.criterion_keypoints = nn.L1Loss(reduction='none').to(self.device) + self.criterion_regr = nn.L1Loss().to(self.device) + + def forward( + self, + smpl_output, + data_gt, + ): + # to reduce time dimension + reduce = lambda x: x.reshape((x.shape[0] * x.shape[1],) + x.shape[2:]) + data_3d_theta = reduce(data_gt['theta']) + + preds = smpl_output[-1] + pred_theta = preds['theta'] + theta_size = pred_theta.shape[:2] + pred_theta = reduce(pred_theta) + preds_local = preds['kp_3d'] - preds['kp_3d'][:, :, 0:1,:] # (N, T, 17, 3) + gt_local = data_gt['kp_3d'] - data_gt['kp_3d'][:, :, 0:1,:] + real_shape, pred_shape = data_3d_theta[:, 72:], pred_theta[:, 72:] + real_pose, pred_pose = data_3d_theta[:, :72], pred_theta[:, :72] + loss_dict = {} + loss_dict['loss_3d_pos'] = loss_mpjpe(preds_local, gt_local) + loss_dict['loss_3d_scale'] = n_mpjpe(preds_local, gt_local) + loss_dict['loss_3d_velocity'] = loss_velocity(preds_local, gt_local) + loss_dict['loss_lv'] = loss_limb_var(preds_local) + loss_dict['loss_lg'] = loss_limb_gt(preds_local, gt_local) + loss_dict['loss_a'] = loss_angle(preds_local, gt_local) + 
loss_dict['loss_av'] = loss_angle_velocity(preds_local, gt_local) + + if pred_theta.shape[0] > 0: + loss_pose, loss_shape = self.smpl_losses(pred_pose, pred_shape, real_pose, real_shape) + loss_norm = torch.norm(pred_theta, dim=-1).mean() + loss_dict['loss_shape'] = loss_shape + loss_dict['loss_pose'] = loss_pose + loss_dict['loss_norm'] = loss_norm + return loss_dict + + def smpl_losses(self, pred_rotmat, pred_betas, gt_pose, gt_betas): + pred_rotmat_valid = batch_rodrigues(pred_rotmat.reshape(-1,3)).reshape(-1, 24, 3, 3) + gt_rotmat_valid = batch_rodrigues(gt_pose.reshape(-1,3)).reshape(-1, 24, 3, 3) + pred_betas_valid = pred_betas + gt_betas_valid = gt_betas + if len(pred_rotmat_valid) > 0: + loss_regr_pose = self.criterion_regr(pred_rotmat_valid, gt_rotmat_valid) + loss_regr_betas = self.criterion_regr(pred_betas_valid, gt_betas_valid) + else: + loss_regr_pose = torch.FloatTensor(1).fill_(0.).to(self.device) + loss_regr_betas = torch.FloatTensor(1).fill_(0.).to(self.device) + return loss_regr_pose, loss_regr_betas diff --git a/lib/model/loss_supcon.py b/lib/model/loss_supcon.py new file mode 100644 index 0000000..17117d4 --- /dev/null +++ b/lib/model/loss_supcon.py @@ -0,0 +1,98 @@ +""" +Author: Yonglong Tian (yonglong@mit.edu) +Date: May 07, 2020 +""" +from __future__ import print_function + +import torch +import torch.nn as nn + + +class SupConLoss(nn.Module): + """Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf. + It also supports the unsupervised contrastive loss in SimCLR""" + def __init__(self, temperature=0.07, contrast_mode='all', + base_temperature=0.07): + super(SupConLoss, self).__init__() + self.temperature = temperature + self.contrast_mode = contrast_mode + self.base_temperature = base_temperature + + def forward(self, features, labels=None, mask=None): + """Compute loss for model. If both `labels` and `mask` are None, + it degenerates to SimCLR unsupervised loss: + https://arxiv.org/pdf/2002.05709.pdf + + Args: + features: hidden vector of shape [bsz, n_views, ...]. + labels: ground truth of shape [bsz]. + mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j + has the same class as sample i. Can be asymmetric. + Returns: + A loss scalar. 
+ """ + device = (torch.device('cuda') + if features.is_cuda + else torch.device('cpu')) + + if len(features.shape) < 3: + raise ValueError('`features` needs to be [bsz, n_views, ...],' + 'at least 3 dimensions are required') + if len(features.shape) > 3: + features = features.view(features.shape[0], features.shape[1], -1) + + batch_size = features.shape[0] + if labels is not None and mask is not None: + raise ValueError('Cannot define both `labels` and `mask`') + elif labels is None and mask is None: + mask = torch.eye(batch_size, dtype=torch.float32).to(device) + elif labels is not None: + labels = labels.contiguous().view(-1, 1) + if labels.shape[0] != batch_size: + raise ValueError('Num of labels does not match num of features') + mask = torch.eq(labels, labels.T).float().to(device) + else: + mask = mask.float().to(device) + + contrast_count = features.shape[1] + contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0) + if self.contrast_mode == 'one': + anchor_feature = features[:, 0] + anchor_count = 1 + elif self.contrast_mode == 'all': + anchor_feature = contrast_feature + anchor_count = contrast_count + else: + raise ValueError('Unknown mode: {}'.format(self.contrast_mode)) + + # compute logits + anchor_dot_contrast = torch.div( + torch.matmul(anchor_feature, contrast_feature.T), + self.temperature) + # for numerical stability + logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True) + logits = anchor_dot_contrast - logits_max.detach() + + # tile mask + mask = mask.repeat(anchor_count, contrast_count) + # mask-out self-contrast cases + logits_mask = torch.scatter( + torch.ones_like(mask), + 1, + torch.arange(batch_size * anchor_count).view(-1, 1).to(device), + 0 + ) + mask = mask * logits_mask + + # compute log_prob + exp_logits = torch.exp(logits) * logits_mask + log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True)) + + # compute mean of log-likelihood over positive + mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1) + + # loss + loss = - (self.temperature / self.base_temperature) * mean_log_prob_pos + loss = loss.view(anchor_count, batch_size).mean() + + return loss diff --git a/lib/model/model_action.py b/lib/model/model_action.py new file mode 100644 index 0000000..785ec26 --- /dev/null +++ b/lib/model/model_action.py @@ -0,0 +1,71 @@ +import sys +import torch +import torch.nn as nn +import torch.nn.functional as F + +class ActionHeadClassification(nn.Module): + def __init__(self, dropout_ratio=0., dim_rep=512, num_classes=60, num_joints=17, hidden_dim=2048): + super(ActionHeadClassification, self).__init__() + self.dropout = nn.Dropout(p=dropout_ratio) + self.bn = nn.BatchNorm1d(hidden_dim, momentum=0.1) + self.relu = nn.ReLU(inplace=True) + self.fc1 = nn.Linear(dim_rep*num_joints, hidden_dim) + self.fc2 = nn.Linear(hidden_dim, num_classes) + + def forward(self, feat): + ''' + Input: (N, M, T, J, C) + ''' + N, M, T, J, C = feat.shape + feat = self.dropout(feat) + feat = feat.permute(0, 1, 3, 4, 2) # (N, M, T, J, C) -> (N, M, J, C, T) + feat = feat.mean(dim=-1) + feat = feat.reshape(N, M, -1) # (N, M, J*C) + feat = feat.mean(dim=1) + feat = self.fc1(feat) + feat = self.bn(feat) + feat = self.relu(feat) + feat = self.fc2(feat) + return feat + +class ActionHeadEmbed(nn.Module): + def __init__(self, dropout_ratio=0., dim_rep=512, num_joints=17, hidden_dim=2048): + super(ActionHeadEmbed, self).__init__() + self.dropout = nn.Dropout(p=dropout_ratio) + self.fc1 = nn.Linear(dim_rep*num_joints, hidden_dim) + def forward(self, feat): + ''' + 
Input: (N, M, T, J, C) + ''' + N, M, T, J, C = feat.shape + feat = self.dropout(feat) + feat = feat.permute(0, 1, 3, 4, 2) # (N, M, T, J, C) -> (N, M, J, C, T) + feat = feat.mean(dim=-1) + feat = feat.reshape(N, M, -1) # (N, M, J*C) + feat = feat.mean(dim=1) + feat = self.fc1(feat) + feat = F.normalize(feat, dim=-1) + return feat + +class ActionNet(nn.Module): + def __init__(self, backbone, dim_rep=512, num_classes=60, dropout_ratio=0., version='class', hidden_dim=2048, num_joints=17): + super(ActionNet, self).__init__() + self.backbone = backbone + self.feat_J = num_joints + if version=='class': + self.head = ActionHeadClassification(dropout_ratio=dropout_ratio, dim_rep=dim_rep, num_classes=num_classes, num_joints=num_joints) + elif version=='embed': + self.head = ActionHeadEmbed(dropout_ratio=dropout_ratio, dim_rep=dim_rep, hidden_dim=hidden_dim, num_joints=num_joints) + else: + raise Exception('Version Error.') + + def forward(self, x): + ''' + Input: (N, M x T x 17 x 3) + ''' + N, M, T, J, C = x.shape + x = x.reshape(N*M, T, J, C) + feat = self.backbone.get_representation(x) + feat = feat.reshape([N, M, T, self.feat_J, -1]) # (N, M, T, J, C) + out = self.head(feat) + return out \ No newline at end of file diff --git a/lib/model/model_mesh.py b/lib/model/model_mesh.py new file mode 100644 index 0000000..dff579d --- /dev/null +++ b/lib/model/model_mesh.py @@ -0,0 +1,101 @@ +import sys +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from lib.utils.utils_smpl import SMPL +from lib.utils.utils_mesh import rotation_matrix_to_angle_axis, rot6d_to_rotmat + +class SMPLRegressor(nn.Module): + def __init__(self, args, dim_rep=512, num_joints=17, hidden_dim=2048, dropout_ratio=0.): + super(SMPLRegressor, self).__init__() + param_pose_dim = 24 * 6 + self.dropout = nn.Dropout(p=dropout_ratio) + self.fc1 = nn.Linear(num_joints*dim_rep, hidden_dim) + self.pool2 = nn.AdaptiveAvgPool2d((None, 1)) + self.fc2 = nn.Linear(num_joints*dim_rep, hidden_dim) + self.bn1 = nn.BatchNorm1d(hidden_dim, momentum=0.1) + self.bn2 = nn.BatchNorm1d(hidden_dim, momentum=0.1) + self.relu1 = nn.ReLU(inplace=True) + self.relu2 = nn.ReLU(inplace=True) + self.head_pose = nn.Linear(hidden_dim, param_pose_dim) + self.head_shape = nn.Linear(hidden_dim, 10) + nn.init.xavier_uniform_(self.head_pose.weight, gain=0.01) + nn.init.xavier_uniform_(self.head_shape.weight, gain=0.01) + self.smpl = SMPL( + args.data_root, + batch_size=64, + create_transl=False, + ) + mean_params = np.load(self.smpl.smpl_mean_params) + init_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0) + init_shape = torch.from_numpy(mean_params['shape'][:].astype('float32')).unsqueeze(0) + self.register_buffer('init_pose', init_pose) + self.register_buffer('init_shape', init_shape) + self.J_regressor = self.smpl.J_regressor_h36m + + def forward(self, feat, init_pose=None, init_shape=None): + N, T, J, C = feat.shape + NT = N * T + feat = feat.reshape(N, T, -1) + + feat_pose = feat.reshape(NT, -1) # (N*T, J*C) + + feat_pose = self.dropout(feat_pose) + feat_pose = self.fc1(feat_pose) + feat_pose = self.bn1(feat_pose) + feat_pose = self.relu1(feat_pose) # (NT, C) + + feat_shape = feat.permute(0,2,1) # (N, T, J*C) -> (N, J*C, T) + feat_shape = self.pool2(feat_shape).reshape(N, -1) # (N, J*C) + + feat_shape = self.dropout(feat_shape) + feat_shape = self.fc2(feat_shape) + feat_shape = self.bn2(feat_shape) + feat_shape = self.relu2(feat_shape) # (N, C) + + pred_pose = self.init_pose.expand(NT, -1) # (NT, C) + pred_shape = 
self.init_shape.expand(N, -1) # (N, C) + + pred_pose = self.head_pose(feat_pose) + pred_pose + pred_shape = self.head_shape(feat_shape) + pred_shape + pred_shape = pred_shape.expand(T, N, -1).permute(1, 0, 2).reshape(NT, -1) + pred_rotmat = rot6d_to_rotmat(pred_pose).view(-1, 24, 3, 3) + pred_output = self.smpl( + betas=pred_shape, + body_pose=pred_rotmat[:, 1:], + global_orient=pred_rotmat[:, 0].unsqueeze(1), + pose2rot=False + ) + pred_vertices = pred_output.vertices*1000.0 + assert self.J_regressor is not None + J_regressor_batch = self.J_regressor[None, :].expand(pred_vertices.shape[0], -1, -1).to(pred_vertices.device) + pred_joints = torch.matmul(J_regressor_batch, pred_vertices) + pose = rotation_matrix_to_angle_axis(pred_rotmat.reshape(-1, 3, 3)).reshape(-1, 72) + output = [{ + 'theta' : torch.cat([pose, pred_shape], dim=1), # (N*T, 72+10) + 'verts' : pred_vertices, # (N*T, 6890, 3) + 'kp_3d' : pred_joints, # (N*T, 17, 3) + }] + return output + +class MeshRegressor(nn.Module): + def __init__(self, args, backbone, dim_rep=512, num_joints=17, hidden_dim=2048, dropout_ratio=0.5): + super(MeshRegressor, self).__init__() + self.backbone = backbone + self.feat_J = num_joints + self.head = SMPLRegressor(args, dim_rep, num_joints, hidden_dim, dropout_ratio) + + def forward(self, x, init_pose=None, init_shape=None, n_iter=3): + ''' + Input: (N x T x 17 x 3) + ''' + N, T, J, C = x.shape + feat = self.backbone.get_representation(x) + feat = feat.reshape([N, T, self.feat_J, -1]) # (N, T, J, C) + smpl_output = self.head(feat) + for s in smpl_output: + s['theta'] = s['theta'].reshape(N, T, -1) + s['verts'] = s['verts'].reshape(N, T, -1, 3) + s['kp_3d'] = s['kp_3d'].reshape(N, T, -1, 3) + return smpl_output \ No newline at end of file diff --git a/lib/utils/learning.py b/lib/utils/learning.py new file mode 100644 index 0000000..191e669 --- /dev/null +++ b/lib/utils/learning.py @@ -0,0 +1,102 @@ +import os +import numpy as np +import torch +import torch.nn as nn +from functools import partial +from lib.model.DSTformer import DSTformer + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + +def load_pretrained_weights(model, checkpoint): + """Load pretrianed weights to model + Incompatible layers (unmatched in name or size) will be ignored + Args: + - model (nn.Module): network model, which must not be nn.DataParallel + - weight_path (str): path to pretrained weights + """ + import collections + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + model_dict = model.state_dict() + new_state_dict = collections.OrderedDict() + matched_layers, discarded_layers = [], [] + for k, v in state_dict.items(): + # If the pretrained state_dict was saved as nn.DataParallel, + # keys would contain "module.", which 
should be ignored. + if k.startswith('module.'): + k = k[7:] + if k in model_dict and model_dict[k].size() == v.size(): + new_state_dict[k] = v + matched_layers.append(k) + else: + discarded_layers.append(k) + model_dict.update(new_state_dict) + model.load_state_dict(model_dict, strict=True) + print('load_weight', len(matched_layers)) + return model + +def partial_train_layers(model, partial_list): + """Train partial layers of a given model.""" + for name, p in model.named_parameters(): + p.requires_grad = False + for trainable in partial_list: + if trainable in name: + p.requires_grad = True + break + return model + +def load_backbone(args): + if not(hasattr(args, "backbone")): + args.backbone = 'DSTformer' # Default + if args.backbone=='DSTformer': + model_backbone = DSTformer(dim_in=3, dim_out=3, dim_feat=args.dim_feat, dim_rep=args.dim_rep, + depth=args.depth, num_heads=args.num_heads, mlp_ratio=args.mlp_ratio, norm_layer=partial(nn.LayerNorm, eps=1e-6), + maxlen=args.maxlen, num_joints=args.num_joints) + elif args.backbone=='TCN': + from lib.model.model_tcn import PoseTCN + model_backbone = PoseTCN() + elif args.backbone=='poseformer': + from lib.model.model_poseformer import PoseTransformer + model_backbone = PoseTransformer(num_frame=args.maxlen, num_joints=args.num_joints, in_chans=3, embed_dim_ratio=32, depth=4, + num_heads=8, mlp_ratio=2., qkv_bias=True, qk_scale=None,drop_path_rate=0, attn_mask=None) + elif args.backbone=='mixste': + from lib.model.model_mixste import MixSTE2 + model_backbone = MixSTE2(num_frame=args.maxlen, num_joints=args.num_joints, in_chans=3, embed_dim_ratio=512, depth=8, + num_heads=8, mlp_ratio=2., qkv_bias=True, qk_scale=None,drop_path_rate=0) + elif args.backbone=='stgcn': + from lib.model.model_stgcn import Model as STGCN + model_backbone = STGCN() + else: + raise Exception("Undefined backbone type.") + return model_backbone \ No newline at end of file diff --git a/lib/utils/tools.py b/lib/utils/tools.py new file mode 100644 index 0000000..b2b780f --- /dev/null +++ b/lib/utils/tools.py @@ -0,0 +1,69 @@ +import numpy as np +import os, sys +import pickle +import yaml +from easydict import EasyDict as edict +from typing import Any, IO + +ROOT_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..') + +class TextLogger: + def __init__(self, log_path): + self.log_path = log_path + with open(self.log_path, "w") as f: + f.write("") + def log(self, log): + with open(self.log_path, "a+") as f: + f.write(log + "\n") + +class Loader(yaml.SafeLoader): + """YAML Loader with `!include` constructor.""" + + def __init__(self, stream: IO) -> None: + """Initialise Loader.""" + + try: + self._root = os.path.split(stream.name)[0] + except AttributeError: + self._root = os.path.curdir + + super().__init__(stream) + +def construct_include(loader: Loader, node: yaml.Node) -> Any: + """Include file referenced at node.""" + + filename = os.path.abspath(os.path.join(loader._root, loader.construct_scalar(node))) + extension = os.path.splitext(filename)[1].lstrip('.') + + with open(filename, 'r') as f: + if extension in ('yaml', 'yml'): + return yaml.load(f, Loader) + elif extension in ('json', ): + return json.load(f) + else: + return ''.join(f.readlines()) + +def get_config(config_path): + yaml.add_constructor('!include', construct_include, Loader) + with open(config_path, 'r') as stream: + config = yaml.load(stream, Loader=Loader) + config = edict(config) + _, config_filename = os.path.split(config_path) + config_name, _ = os.path.splitext(config_filename) + 
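Putting the helpers in learning.py and tools.py together, a typical fine-tuning run loads a YAML config (get_config also resolves !include tags relative to the including file), builds the default DSTformer backbone, restores matching pretrained weights, and freezes everything except a few named layers. The file paths and layer-name substrings below are placeholders, not artifacts shipped with this patch:

import torch
from lib.utils.tools import get_config
from lib.utils.learning import load_backbone, load_pretrained_weights, partial_train_layers

args = get_config('path/to/experiment.yaml')                            # placeholder config path
model = load_backbone(args)                                             # DSTformer unless args.backbone says otherwise

checkpoint = torch.load('path/to/pretrained.bin', map_location='cpu')   # placeholder checkpoint
load_pretrained_weights(model, checkpoint)                              # drops unmatched keys/sizes, loads the rest

# Keep only parameters whose name contains one of these (illustrative) substrings trainable.
model = partial_train_layers(model, ['head', 'pos_embed'])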
config.name = config_name + return config + +def ensure_dir(path): + """ + create path by first checking its existence, + :param paths: path + :return: + """ + if not os.path.exists(path): + os.makedirs(path) + +def read_pkl(data_url): + file = open(data_url,'rb') + content = pickle.load(file) + file.close() + return content \ No newline at end of file diff --git a/lib/utils/utils_data.py b/lib/utils/utils_data.py new file mode 100644 index 0000000..df7b61e --- /dev/null +++ b/lib/utils/utils_data.py @@ -0,0 +1,112 @@ +import os +import torch +import torch.nn.functional as F +import numpy as np +import copy + +def crop_scale(motion, scale_range=[1, 1]): + ''' + Motion: [(M), T, 17, 3]. + Normalize to [-1, 1] + ''' + result = copy.deepcopy(motion) + valid_coords = motion[motion[..., 2]!=0][:,:2] + if len(valid_coords) < 4: + return np.zeros(motion.shape) + xmin = min(valid_coords[:,0]) + xmax = max(valid_coords[:,0]) + ymin = min(valid_coords[:,1]) + ymax = max(valid_coords[:,1]) + ratio = np.random.uniform(low=scale_range[0], high=scale_range[1], size=1)[0] + scale = max(xmax-xmin, ymax-ymin) * ratio + if scale==0: + return np.zeros(motion.shape) + xs = (xmin+xmax-scale) / 2 + ys = (ymin+ymax-scale) / 2 + result[...,:2] = (motion[..., :2]- [xs,ys]) / scale + result[...,:2] = (result[..., :2] - 0.5) * 2 + result = np.clip(result, -1, 1) + return result + +def crop_scale_3d(motion, scale_range=[1, 1]): + ''' + Motion: [T, 17, 3]. (x, y, z) + Normalize to [-1, 1] + Z is relative to the first frame's root. + ''' + result = copy.deepcopy(motion) + result[:,:,2] = result[:,:,2] - result[0,0,2] + xmin = np.min(motion[...,0]) + xmax = np.max(motion[...,0]) + ymin = np.min(motion[...,1]) + ymax = np.max(motion[...,1]) + ratio = np.random.uniform(low=scale_range[0], high=scale_range[1], size=1)[0] + scale = max(xmax-xmin, ymax-ymin) / ratio + if scale==0: + return np.zeros(motion.shape) + xs = (xmin+xmax-scale) / 2 + ys = (ymin+ymax-scale) / 2 + result[...,:2] = (motion[..., :2]- [xs,ys]) / scale + result[...,2] = result[...,2] / scale + result = (result - 0.5) * 2 + return result + +def flip_data(data): + """ + horizontal flip + data: [N, F, 17, D] or [F, 17, D]. X (horizontal coordinate) is the first channel in D. 
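crop_scale above rescales a 2D sequence (with an optional confidence channel) into the [-1, 1] square the model expects, using only keypoints with non-zero confidence to determine the bounding square. A minimal sketch on synthetic pixel-space input:

import numpy as np

motion = np.zeros((4, 17, 3), dtype=np.float32)               # 4 frames, 17 joints, (x, y, conf)
motion[..., 0] = np.random.uniform(100, 500, size=(4, 17))    # x in pixels
motion[..., 1] = np.random.uniform(200, 600, size=(4, 17))    # y in pixels
motion[..., 2] = 1.0                                          # all keypoints visible

normed = crop_scale(motion)                                   # scale_range defaults to [1, 1]
print(normed[..., :2].min(), normed[..., :2].max())           # both within [-1, 1]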
+ Return + result: same + """ + left_joints = [4, 5, 6, 11, 12, 13] + right_joints = [1, 2, 3, 14, 15, 16] + flipped_data = copy.deepcopy(data) + flipped_data[..., 0] *= -1 # flip x of all joints + flipped_data[..., left_joints+right_joints, :] = flipped_data[..., right_joints+left_joints, :] + return flipped_data + +def resample(ori_len, target_len, replay=False, randomness=True): + if replay: + if ori_len > target_len: + st = np.random.randint(ori_len-target_len) + return range(st, st+target_len) # Random clipping from sequence + else: + return np.array(range(target_len)) % ori_len # Replay padding + else: + if randomness: + even = np.linspace(0, ori_len, num=target_len, endpoint=False) + if ori_len < target_len: + low = np.floor(even) + high = np.ceil(even) + sel = np.random.randint(2, size=even.shape) + result = np.sort(sel*low+(1-sel)*high) + else: + interval = even[1] - even[0] + result = np.random.random(even.shape)*interval + even + result = np.clip(result, a_min=0, a_max=ori_len-1).astype(np.uint32) + else: + result = np.linspace(0, ori_len, num=target_len, endpoint=False, dtype=int) + return result + +def split_clips(vid_list, n_frames, data_stride): + result = [] + n_clips = 0 + st = 0 + i = 0 + saved = set() + while i(w, x, y, z) + Returns: + Rotation matrix corresponding to the quaternion -- size = [batch_size, 3, 3] + """ + norm_quat = quat + norm_quat = norm_quat / norm_quat.norm(p=2, dim=1, keepdim=True) + w, x, y, z = norm_quat[:, 0], norm_quat[:, 1], norm_quat[:, + 2], norm_quat[:, + 3] + + batch_size = quat.size(0) + + w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2) + wx, wy, wz = w * x, w * y, w * z + xy, xz, yz = x * y, x * z, y * z + + rotMat = torch.stack([ + w2 + x2 - y2 - z2, 2 * xy - 2 * wz, 2 * wy + 2 * xz, 2 * wz + 2 * xy, + w2 - x2 + y2 - z2, 2 * yz - 2 * wx, 2 * xz - 2 * wy, 2 * wx + 2 * yz, + w2 - x2 - y2 + z2 + ], + dim=1).view(batch_size, 3, 3) + return rotMat + + +def rotation_matrix_to_angle_axis(rotation_matrix): + """ + This function is borrowed from https://github.com/kornia/kornia + + Convert 3x4 rotation matrix to Rodrigues vector + + Args: + rotation_matrix (Tensor): rotation matrix. + + Returns: + Tensor: Rodrigues vector transformation. + + Shape: + - Input: :math:`(N, 3, 4)` + - Output: :math:`(N, 3)` + + Example: + >>> input = torch.rand(2, 3, 4) # Nx4x4 + >>> output = tgm.rotation_matrix_to_angle_axis(input) # Nx3 + """ + if rotation_matrix.shape[1:] == (3,3): + rot_mat = rotation_matrix.reshape(-1, 3, 3) + hom = torch.tensor([0, 0, 1], dtype=torch.float32, + device=rotation_matrix.device).reshape(1, 3, 1).expand(rot_mat.shape[0], -1, -1) + rotation_matrix = torch.cat([rot_mat, hom], dim=-1) + + quaternion = rotation_matrix_to_quaternion(rotation_matrix) + aa = quaternion_to_angle_axis(quaternion) + aa[torch.isnan(aa)] = 0.0 + return aa + + +def quaternion_to_angle_axis(quaternion: torch.Tensor) -> torch.Tensor: + """ + This function is borrowed from https://github.com/kornia/kornia + + Convert quaternion vector to angle axis of rotation. + + Adapted from ceres C++ library: ceres-solver/include/ceres/rotation.h + + Args: + quaternion (torch.Tensor): tensor with quaternions. + + Return: + torch.Tensor: tensor with angle axis of rotation. 
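flip_data mirrors an H36M-style 17-joint sequence by negating x and swapping the left/right joint indices listed above, while resample maps a clip of arbitrary length onto a fixed number of frames. A quick sketch with illustrative shapes:

import numpy as np

clip = np.random.randn(300, 17, 3).astype(np.float32)           # (F, 17, D)
mirrored = flip_data(clip)                                       # x negated, left/right joints exchanged

idx = resample(ori_len=300, target_len=243, randomness=False)    # 243 evenly spaced frame indices
fixed_len = clip[idx]                                            # (243, 17, 3)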
+ + Shape: + - Input: :math:`(*, 4)` where `*` means, any number of dimensions + - Output: :math:`(*, 3)` + + Example: + >>> quaternion = torch.rand(2, 4) # Nx4 + >>> angle_axis = tgm.quaternion_to_angle_axis(quaternion) # Nx3 + """ + if not torch.is_tensor(quaternion): + raise TypeError("Input type is not a torch.Tensor. Got {}".format( + type(quaternion))) + + if not quaternion.shape[-1] == 4: + raise ValueError("Input must be a tensor of shape Nx4 or 4. Got {}" + .format(quaternion.shape)) + # unpack input and compute conversion + q1: torch.Tensor = quaternion[..., 1] + q2: torch.Tensor = quaternion[..., 2] + q3: torch.Tensor = quaternion[..., 3] + sin_squared_theta: torch.Tensor = q1 * q1 + q2 * q2 + q3 * q3 + + sin_theta: torch.Tensor = torch.sqrt(sin_squared_theta) + cos_theta: torch.Tensor = quaternion[..., 0] + two_theta: torch.Tensor = 2.0 * torch.where( + cos_theta < 0.0, + torch.atan2(-sin_theta, -cos_theta), + torch.atan2(sin_theta, cos_theta)) + + k_pos: torch.Tensor = two_theta / sin_theta + k_neg: torch.Tensor = 2.0 * torch.ones_like(sin_theta) + k: torch.Tensor = torch.where(sin_squared_theta > 0.0, k_pos, k_neg) + + angle_axis: torch.Tensor = torch.zeros_like(quaternion)[..., :3] + angle_axis[..., 0] += q1 * k + angle_axis[..., 1] += q2 * k + angle_axis[..., 2] += q3 * k + return angle_axis + + +def rotation_matrix_to_quaternion(rotation_matrix, eps=1e-6): + """ + This function is borrowed from https://github.com/kornia/kornia + + Convert 3x4 rotation matrix to 4d quaternion vector + + This algorithm is based on algorithm described in + https://github.com/KieranWynn/pyquaternion/blob/master/pyquaternion/quaternion.py#L201 + + Args: + rotation_matrix (Tensor): the rotation matrix to convert. + + Return: + Tensor: the rotation in quaternion + + Shape: + - Input: :math:`(N, 3, 4)` + - Output: :math:`(N, 4)` + + Example: + >>> input = torch.rand(4, 3, 4) # Nx3x4 + >>> output = tgm.rotation_matrix_to_quaternion(input) # Nx4 + """ + if not torch.is_tensor(rotation_matrix): + raise TypeError("Input type is not a torch.Tensor. Got {}".format( + type(rotation_matrix))) + + if len(rotation_matrix.shape) > 3: + raise ValueError( + "Input size must be a three dimensional tensor. Got {}".format( + rotation_matrix.shape)) + if not rotation_matrix.shape[-2:] == (3, 4): + raise ValueError( + "Input size must be a N x 3 x 4 tensor. 
Got {}".format( + rotation_matrix.shape)) + + rmat_t = torch.transpose(rotation_matrix, 1, 2) + + mask_d2 = rmat_t[:, 2, 2] < eps + + mask_d0_d1 = rmat_t[:, 0, 0] > rmat_t[:, 1, 1] + mask_d0_nd1 = rmat_t[:, 0, 0] < -rmat_t[:, 1, 1] + + t0 = 1 + rmat_t[:, 0, 0] - rmat_t[:, 1, 1] - rmat_t[:, 2, 2] + q0 = torch.stack([rmat_t[:, 1, 2] - rmat_t[:, 2, 1], + t0, rmat_t[:, 0, 1] + rmat_t[:, 1, 0], + rmat_t[:, 2, 0] + rmat_t[:, 0, 2]], -1) + t0_rep = t0.repeat(4, 1).t() + + t1 = 1 - rmat_t[:, 0, 0] + rmat_t[:, 1, 1] - rmat_t[:, 2, 2] + q1 = torch.stack([rmat_t[:, 2, 0] - rmat_t[:, 0, 2], + rmat_t[:, 0, 1] + rmat_t[:, 1, 0], + t1, rmat_t[:, 1, 2] + rmat_t[:, 2, 1]], -1) + t1_rep = t1.repeat(4, 1).t() + + t2 = 1 - rmat_t[:, 0, 0] - rmat_t[:, 1, 1] + rmat_t[:, 2, 2] + q2 = torch.stack([rmat_t[:, 0, 1] - rmat_t[:, 1, 0], + rmat_t[:, 2, 0] + rmat_t[:, 0, 2], + rmat_t[:, 1, 2] + rmat_t[:, 2, 1], t2], -1) + t2_rep = t2.repeat(4, 1).t() + + t3 = 1 + rmat_t[:, 0, 0] + rmat_t[:, 1, 1] + rmat_t[:, 2, 2] + q3 = torch.stack([t3, rmat_t[:, 1, 2] - rmat_t[:, 2, 1], + rmat_t[:, 2, 0] - rmat_t[:, 0, 2], + rmat_t[:, 0, 1] - rmat_t[:, 1, 0]], -1) + t3_rep = t3.repeat(4, 1).t() + + mask_c0 = mask_d2 * mask_d0_d1 + mask_c1 = mask_d2 * ~mask_d0_d1 + mask_c2 = ~mask_d2 * mask_d0_nd1 + mask_c3 = ~mask_d2 * ~mask_d0_nd1 + mask_c0 = mask_c0.view(-1, 1).type_as(q0) + mask_c1 = mask_c1.view(-1, 1).type_as(q1) + mask_c2 = mask_c2.view(-1, 1).type_as(q2) + mask_c3 = mask_c3.view(-1, 1).type_as(q3) + + q = q0 * mask_c0 + q1 * mask_c1 + q2 * mask_c2 + q3 * mask_c3 + q /= torch.sqrt(t0_rep * mask_c0 + t1_rep * mask_c1 + # noqa + t2_rep * mask_c2 + t3_rep * mask_c3) # noqa + q *= 0.5 + return q + + +def estimate_translation_np(S, joints_2d, joints_conf, focal_length=5000., img_size=224.): + """ + This function is borrowed from https://github.com/nkolot/SPIN/utils/geometry.py + + Find camera translation that brings 3D joints S closest to 2D the corresponding joints_2d. + Input: + S: (25, 3) 3D joint locations + joints: (25, 3) 2D joint locations and confidence + Returns: + (3,) camera translation vector + """ + + num_joints = S.shape[0] + # focal length + f = np.array([focal_length,focal_length]) + # optical center + center = np.array([img_size/2., img_size/2.]) + + # transformations + Z = np.reshape(np.tile(S[:,2],(2,1)).T,-1) + XY = np.reshape(S[:,0:2],-1) + O = np.tile(center,num_joints) + F = np.tile(f,num_joints) + weight2 = np.reshape(np.tile(np.sqrt(joints_conf),(2,1)).T,-1) + + # least squares + Q = np.array([F*np.tile(np.array([1,0]),num_joints), F*np.tile(np.array([0,1]),num_joints), O-np.reshape(joints_2d,-1)]).T + c = (np.reshape(joints_2d,-1)-O)*Z - F*XY + + # weighted least squares + W = np.diagflat(weight2) + Q = np.dot(W,Q) + c = np.dot(W,c) + + # square matrix + A = np.dot(Q.T,Q) + b = np.dot(Q.T,c) + + # solution + trans = np.linalg.solve(A, b) + + return trans + + +def estimate_translation(S, joints_2d, focal_length=5000., img_size=224.): + """ + This function is borrowed from https://github.com/nkolot/SPIN/utils/geometry.py + + Find camera translation that brings 3D joints S closest to 2D the corresponding joints_2d. 
+ Input: + S: (B, 49, 3) 3D joint locations + joints: (B, 49, 3) 2D joint locations and confidence + Returns: + (B, 3) camera translation vectors + """ + + device = S.device + # Use only joints 25:49 (GT joints) + S = S[:, 25:, :].cpu().numpy() + joints_2d = joints_2d[:, 25:, :].cpu().numpy() + joints_conf = joints_2d[:, :, -1] + joints_2d = joints_2d[:, :, :-1] + trans = np.zeros((S.shape[0], 3), dtype=np.float32) + # Find the translation for each example in the batch + for i in range(S.shape[0]): + S_i = S[i] + joints_i = joints_2d[i] + conf_i = joints_conf[i] + trans[i] = estimate_translation_np(S_i, joints_i, conf_i, focal_length=focal_length, img_size=img_size) + return torch.from_numpy(trans).to(device) + + +def rot6d_to_rotmat_spin(x): + """Convert 6D rotation representation to 3x3 rotation matrix. + Based on Zhou et al., "On the Continuity of Rotation Representations in Neural Networks", CVPR 2019 + Input: + (B,6) Batch of 6-D rotation representations + Output: + (B,3,3) Batch of corresponding rotation matrices + """ + x = x.view(-1,3,2) + a1 = x[:, :, 0] + a2 = x[:, :, 1] + b1 = F.normalize(a1) + b2 = F.normalize(a2 - torch.einsum('bi,bi->b', b1, a2).unsqueeze(-1) * b1) + + # inp = a2 - torch.einsum('bi,bi->b', b1, a2).unsqueeze(-1) * b1 + # denom = inp.pow(2).sum(dim=1).sqrt().unsqueeze(-1) + 1e-8 + # b2 = inp / denom + + b3 = torch.cross(b1, b2) + return torch.stack((b1, b2, b3), dim=-1) + + +def rot6d_to_rotmat(x): + x = x.view(-1,3,2) + + # Normalize the first vector + b1 = F.normalize(x[:, :, 0], dim=1, eps=1e-6) + + dot_prod = torch.sum(b1 * x[:, :, 1], dim=1, keepdim=True) + # Compute the second vector by finding the orthogonal complement to it + b2 = F.normalize(x[:, :, 1] - dot_prod * b1, dim=-1, eps=1e-6) + + # Finish building the basis by taking the cross product + b3 = torch.cross(b1, b2, dim=1) + rot_mats = torch.stack([b1, b2, b3], dim=-1) + + return rot_mats + + +def rigid_transform_3D(A, B): + n, dim = A.shape + centroid_A = np.mean(A, axis = 0) + centroid_B = np.mean(B, axis = 0) + H = np.dot(np.transpose(A - centroid_A), B - centroid_B) / n + U, s, V = np.linalg.svd(H) + R = np.dot(np.transpose(V), np.transpose(U)) + if np.linalg.det(R) < 0: + s[-1] = -s[-1] + V[2] = -V[2] + R = np.dot(np.transpose(V), np.transpose(U)) + + varP = np.var(A, axis=0).sum() + c = 1/varP * np.sum(s) + + t = -np.dot(c*R, np.transpose(centroid_A)) + np.transpose(centroid_B) + return c, R, t + + +def rigid_align(A, B): + c, R, t = rigid_transform_3D(A, B) + A2 = np.transpose(np.dot(c*R, np.transpose(A))) + t + return A2 + +def compute_error(output, target): + with torch.no_grad(): + pred_verts = output[0]['verts'].reshape(-1, 6890, 3) + target_verts = target['verts'].reshape(-1, 6890, 3) + + pred_j3ds = output[0]['kp_3d'].reshape(-1, 17, 3) + target_j3ds = target['kp_3d'].reshape(-1, 17, 3) + + # mpve + pred_verts = pred_verts - pred_j3ds[:, :1, :] + target_verts = target_verts - target_j3ds[:, :1, :] + mpves = torch.sqrt(((pred_verts - target_verts) ** 2).sum(dim=-1)).mean(dim=-1).cpu() + + # mpjpe + pred_j3ds = pred_j3ds - pred_j3ds[:, :1, :] + target_j3ds = target_j3ds - target_j3ds[:, :1, :] + mpjpes = torch.sqrt(((pred_j3ds - target_j3ds) ** 2).sum(dim=-1)).mean(dim=-1).cpu() + return mpjpes.mean(), mpves.mean() + +def compute_error_frames(output, target): + with torch.no_grad(): + pred_verts = output[0]['verts'].reshape(-1, 6890, 3) + target_verts = target['verts'].reshape(-1, 6890, 3) + + pred_j3ds = output[0]['kp_3d'].reshape(-1, 17, 3) + target_j3ds = target['kp_3d'].reshape(-1, 
17, 3) + + # mpve + pred_verts = pred_verts - pred_j3ds[:, :1, :] + target_verts = target_verts - target_j3ds[:, :1, :] + mpves = torch.sqrt(((pred_verts - target_verts) ** 2).sum(dim=-1)).mean(dim=-1).cpu() + + # mpjpe + pred_j3ds = pred_j3ds - pred_j3ds[:, :1, :] + target_j3ds = target_j3ds - target_j3ds[:, :1, :] + mpjpes = torch.sqrt(((pred_j3ds - target_j3ds) ** 2).sum(dim=-1)).mean(dim=-1).cpu() + return mpjpes, mpves + +def evaluate_mesh(results): + pred_verts = results['verts'].reshape(-1, 6890, 3) + target_verts = results['verts_gt'].reshape(-1, 6890, 3) + + pred_j3ds = results['kp_3d'].reshape(-1, 17, 3) + target_j3ds = results['kp_3d_gt'].reshape(-1, 17, 3) + num_samples = pred_j3ds.shape[0] + + # mpve + pred_verts = pred_verts - pred_j3ds[:, :1, :] + target_verts = target_verts - target_j3ds[:, :1, :] + mpve = np.mean(np.mean(np.sqrt(np.square(pred_verts - target_verts).sum(axis=2)), axis=1)) + + + # mpjpe-17 & mpjpe-14 + h36m_17_to_14 = (1, 2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 14, 15, 16) + pred_j3ds_17j = (pred_j3ds - pred_j3ds[:, :1, :]) + target_j3ds_17j = (target_j3ds - target_j3ds[:, :1, :]) + + pred_j3ds = pred_j3ds_17j[:, h36m_17_to_14, :].copy() + target_j3ds = target_j3ds_17j[:, h36m_17_to_14, :].copy() + + mpjpe = np.mean(np.sqrt(np.square(pred_j3ds - target_j3ds).sum(axis=2)), axis=1) # (N, ) + mpjpe_17j = np.mean(np.sqrt(np.square(pred_j3ds_17j - target_j3ds_17j).sum(axis=2)), axis=1) # (N, ) + + pred_j3ds_pa, pred_j3ds_pa_17j = [], [] + for n in range(num_samples): + pred_j3ds_pa.append(rigid_align(pred_j3ds[n], target_j3ds[n])) + pred_j3ds_pa_17j.append(rigid_align(pred_j3ds_17j[n], target_j3ds_17j[n])) + pred_j3ds_pa = np.array(pred_j3ds_pa) + pred_j3ds_pa_17j = np.array(pred_j3ds_pa_17j) + + pa_mpjpe = np.mean(np.sqrt(np.square(pred_j3ds_pa - target_j3ds).sum(axis=2)), axis=1) # (N, ) + pa_mpjpe_17j = np.mean(np.sqrt(np.square(pred_j3ds_pa_17j - target_j3ds_17j).sum(axis=2)), axis=1) # (N, ) + + + error_dict = { + 'mpve': mpve.mean(), + 'mpjpe': mpjpe.mean(), + 'pa_mpjpe': pa_mpjpe.mean(), + 'mpjpe_17j': mpjpe_17j.mean(), + 'pa_mpjpe_17j': pa_mpjpe_17j.mean(), + } + return error_dict + + +def rectify_pose(pose): + """ + Rectify "upside down" people in global coord + + Args: + pose (72,): Pose. + + Returns: + Rotated pose. + """ + pose = pose.copy() + R_mod = cv2.Rodrigues(np.array([np.pi, 0, 0]))[0] + R_root = cv2.Rodrigues(pose[:3])[0] + new_root = R_root.dot(R_mod) + pose[:3] = cv2.Rodrigues(new_root)[0].reshape(3) + return pose + +def flip_thetas(thetas): + """Flip thetas. + + Parameters + ---------- + thetas : numpy.ndarray + Joints in shape (F, num_thetas, 3) + theta_pairs : list + List of theta pairs. + + Returns + ------- + numpy.ndarray + Flipped thetas with shape (F, num_thetas, 3) + + """ + #Joint pairs which defines the pairs of joint to be swapped when the image is flipped horizontally. + theta_pairs = ((1, 2), (4, 5), (7, 8), (10, 11), (13, 14), (16, 17), (18, 19), (20, 21), (22, 23)) + thetas_flip = thetas.copy() + # reflect horizontally + thetas_flip[:, :, 1] = -1 * thetas_flip[:, :, 1] + thetas_flip[:, :, 2] = -1 * thetas_flip[:, :, 2] + # change left-right parts + for pair in theta_pairs: + thetas_flip[:, pair[0], :], thetas_flip[:, pair[1], :] = \ + thetas_flip[:, pair[1], :], thetas_flip[:, pair[0], :].copy() + return thetas_flip + +def flip_thetas_batch(thetas): + """Flip thetas in batch. + + Parameters + ---------- + thetas : numpy.array + Joints in shape (N, F, num_thetas*3) + theta_pairs : list + List of theta pairs. 
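evaluate_mesh above reports PA-MPJPE by Procrustes-aligning each predicted skeleton to its ground truth with rigid_align (a similarity transform: scale, rotation, and translation estimated by rigid_transform_3D). A standalone sketch on random joints:

import numpy as np

pred_j3d = np.random.randn(17, 3)
gt_j3d = np.random.randn(17, 3)

aligned = rigid_align(pred_j3d, gt_j3d)                           # best-fit s, R, t applied to the prediction
mpjpe = np.sqrt(((pred_j3d - gt_j3d) ** 2).sum(axis=-1)).mean()
pa_mpjpe = np.sqrt(((aligned - gt_j3d) ** 2).sum(axis=-1)).mean()
print('MPJPE %.3f  PA-MPJPE %.3f' % (mpjpe, pa_mpjpe))            # PA-MPJPE is typically the smaller of the two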
+ + Returns + ------- + numpy.array + Flipped thetas with shape (N, F, num_thetas*3) + + """ + #Joint pairs which defines the pairs of joint to be swapped when the image is flipped horizontally. + theta_pairs = ((1, 2), (4, 5), (7, 8), (10, 11), (13, 14), (16, 17), (18, 19), (20, 21), (22, 23)) + thetas_flip = copy.deepcopy(thetas).reshape(*thetas.shape[:2], 24, 3) + # reflect horizontally + thetas_flip[:, :, :, 1] = -1 * thetas_flip[:, :, :, 1] + thetas_flip[:, :, :, 2] = -1 * thetas_flip[:, :, :, 2] + # change left-right parts + for pair in theta_pairs: + thetas_flip[:, :, pair[0], :], thetas_flip[:, :, pair[1], :] = \ + thetas_flip[:, :, pair[1], :], thetas_flip[:, :, pair[0], :].clone() + + return thetas_flip.reshape(*thetas.shape[:2], -1) + +# def smpl_aa_to_ortho6d(smpl_aa): +# # [...,72] -> [...,144] +# rot_aa = smpl_aa.reshape([-1,24,3]) +# rotmat = axis_angle_to_matrix(rot_aa) +# rot6d = matrix_to_rotation_6d(rotmat) +# rot6d = rot6d.reshape(-1,24*6) +# return rot6d \ No newline at end of file diff --git a/lib/utils/utils_smpl.py b/lib/utils/utils_smpl.py new file mode 100644 index 0000000..2215dd8 --- /dev/null +++ b/lib/utils/utils_smpl.py @@ -0,0 +1,88 @@ +# This script is borrowed and extended from https://github.com/nkolot/SPIN/blob/master/models/hmr.py +# Adhere to their licence to use this script + +import torch +import numpy as np +import os.path as osp +from smplx import SMPL as _SMPL +from smplx.utils import ModelOutput, SMPLOutput +from smplx.lbs import vertices2joints + + +# Map joints to SMPL joints +JOINT_MAP = { + 'OP Nose': 24, 'OP Neck': 12, 'OP RShoulder': 17, + 'OP RElbow': 19, 'OP RWrist': 21, 'OP LShoulder': 16, + 'OP LElbow': 18, 'OP LWrist': 20, 'OP MidHip': 0, + 'OP RHip': 2, 'OP RKnee': 5, 'OP RAnkle': 8, + 'OP LHip': 1, 'OP LKnee': 4, 'OP LAnkle': 7, + 'OP REye': 25, 'OP LEye': 26, 'OP REar': 27, + 'OP LEar': 28, 'OP LBigToe': 29, 'OP LSmallToe': 30, + 'OP LHeel': 31, 'OP RBigToe': 32, 'OP RSmallToe': 33, 'OP RHeel': 34, + 'Right Ankle': 8, 'Right Knee': 5, 'Right Hip': 45, + 'Left Hip': 46, 'Left Knee': 4, 'Left Ankle': 7, + 'Right Wrist': 21, 'Right Elbow': 19, 'Right Shoulder': 17, + 'Left Shoulder': 16, 'Left Elbow': 18, 'Left Wrist': 20, + 'Neck (LSP)': 47, 'Top of Head (LSP)': 48, + 'Pelvis (MPII)': 49, 'Thorax (MPII)': 50, + 'Spine (H36M)': 51, 'Jaw (H36M)': 52, + 'Head (H36M)': 53, 'Nose': 24, 'Left Eye': 26, + 'Right Eye': 25, 'Left Ear': 28, 'Right Ear': 27 +} +JOINT_NAMES = [ + 'OP Nose', 'OP Neck', 'OP RShoulder', + 'OP RElbow', 'OP RWrist', 'OP LShoulder', + 'OP LElbow', 'OP LWrist', 'OP MidHip', + 'OP RHip', 'OP RKnee', 'OP RAnkle', + 'OP LHip', 'OP LKnee', 'OP LAnkle', + 'OP REye', 'OP LEye', 'OP REar', + 'OP LEar', 'OP LBigToe', 'OP LSmallToe', + 'OP LHeel', 'OP RBigToe', 'OP RSmallToe', 'OP RHeel', + 'Right Ankle', 'Right Knee', 'Right Hip', + 'Left Hip', 'Left Knee', 'Left Ankle', + 'Right Wrist', 'Right Elbow', 'Right Shoulder', + 'Left Shoulder', 'Left Elbow', 'Left Wrist', + 'Neck (LSP)', 'Top of Head (LSP)', + 'Pelvis (MPII)', 'Thorax (MPII)', + 'Spine (H36M)', 'Jaw (H36M)', + 'Head (H36M)', 'Nose', 'Left Eye', + 'Right Eye', 'Left Ear', 'Right Ear' +] + +JOINT_IDS = {JOINT_NAMES[i]: i for i in range(len(JOINT_NAMES))} +SMPL_MODEL_DIR = 'data/mesh' +H36M_TO_J17 = [6, 5, 4, 1, 2, 3, 16, 15, 14, 11, 12, 13, 8, 10, 0, 7, 9] +H36M_TO_J14 = H36M_TO_J17[:14] + + +class SMPL(_SMPL): + """ Extension of the official SMPL implementation to support more joints """ + + def __init__(self, *args, **kwargs): + super(SMPL, self).__init__(*args, 
**kwargs) + joints = [JOINT_MAP[i] for i in JOINT_NAMES] + self.smpl_mean_params = osp.join(args[0], 'smpl_mean_params.npz') + J_regressor_extra = np.load(osp.join(args[0], 'J_regressor_extra.npy')) + self.register_buffer('J_regressor_extra', torch.tensor(J_regressor_extra, dtype=torch.float32)) + J_regressor_h36m = np.load(osp.join(args[0], 'J_regressor_h36m_correct.npy')) + self.register_buffer('J_regressor_h36m', torch.tensor(J_regressor_h36m, dtype=torch.float32)) + self.joint_map = torch.tensor(joints, dtype=torch.long) + + def forward(self, *args, **kwargs): + kwargs['get_skin'] = True + smpl_output = super(SMPL, self).forward(*args, **kwargs) + extra_joints = vertices2joints(self.J_regressor_extra, smpl_output.vertices) + joints = torch.cat([smpl_output.joints, extra_joints], dim=1) + joints = joints[:, self.joint_map, :] + output = SMPLOutput(vertices=smpl_output.vertices, + global_orient=smpl_output.global_orient, + body_pose=smpl_output.body_pose, + joints=joints, + betas=smpl_output.betas, + full_pose=smpl_output.full_pose) + return output + + +def get_smpl_faces(): + smpl = SMPL(SMPL_MODEL_DIR, batch_size=1, create_transl=False) + return smpl.faces \ No newline at end of file diff --git a/lib/utils/vismo.py b/lib/utils/vismo.py new file mode 100644 index 0000000..456c3d7 --- /dev/null +++ b/lib/utils/vismo.py @@ -0,0 +1,347 @@ +import numpy as np +import os +import cv2 +import math +import copy +import imageio +import io +from tqdm import tqdm +from PIL import Image +from lib.utils.tools import ensure_dir +import matplotlib +import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D +from lib.utils.utils_smpl import * +import ipdb + +def render_and_save(motion_input, save_path, keep_imgs=False, fps=25, color="#F96706#FB8D43#FDB381", with_conf=False, draw_face=False): + ensure_dir(os.path.dirname(save_path)) + motion = copy.deepcopy(motion_input) + if motion.shape[-1]==2 or motion.shape[-1]==3: + motion = np.transpose(motion, (1,2,0)) #(T,17,D) -> (17,D,T) + if motion.shape[1]==2 or with_conf: + colors = hex2rgb(color) + if not with_conf: + J, D, T = motion.shape + motion_full = np.ones([J,3,T]) + motion_full[:,:2,:] = motion + else: + motion_full = motion + motion_full[:,:2,:] = pixel2world_vis_motion(motion_full[:,:2,:]) + motion2video(motion_full, save_path=save_path, colors=colors, fps=fps) + elif motion.shape[0]==6890: + # motion_world = pixel2world_vis_motion(motion, dim=3) + motion2video_mesh(motion, save_path=save_path, keep_imgs=keep_imgs, fps=fps, draw_face=draw_face) + else: + motion_world = pixel2world_vis_motion(motion, dim=3) + motion2video_3d(motion_world, save_path=save_path, keep_imgs=keep_imgs, fps=fps) + +def pixel2world_vis(pose): +# pose: (17,2) + return (pose + [1, 1]) * 512 / 2 + +def pixel2world_vis_motion(motion, dim=2, is_tensor=False): +# pose: (17,2,N) + N = motion.shape[-1] + if dim==2: + offset = np.ones([2,N]).astype(np.float32) + else: + offset = np.ones([3,N]).astype(np.float32) + offset[2,:] = 0 + if is_tensor: + offset = torch.tensor(offset) + return (motion + offset) * 512 / 2 + +def vis_data_batch(data_input, data_label, n_render=10, save_path='doodle/vis_train_data/'): + ''' + data_input: [N,T,17,2/3] + data_label: [N,T,17,3] + ''' + pathlib.Path(save_path).mkdir(parents=True, exist_ok=True) + for i in range(min(len(data_input), n_render)): + render_and_save(data_input[i][:,:,:2], '%s/input_%d.mp4' % (save_path, i)) + render_and_save(data_label[i], '%s/gt_%d.mp4' % (save_path, i)) + +def get_img_from_fig(fig, dpi=120): + buf = 
io.BytesIO() + fig.savefig(buf, format="png", dpi=dpi, bbox_inches="tight", pad_inches=0) + buf.seek(0) + img_arr = np.frombuffer(buf.getvalue(), dtype=np.uint8) + buf.close() + img = cv2.imdecode(img_arr, 1) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGBA) + return img + +def rgb2rgba(color): + return (color[0], color[1], color[2], 255) + +def hex2rgb(hex, number_of_colors=3): + h = hex + rgb = [] + for i in range(number_of_colors): + h = h.lstrip('#') + hex_color = h[0:6] + rgb_color = [int(hex_color[i:i+2], 16) for i in (0, 2 ,4)] + rgb.append(rgb_color) + h = h[6:] + return rgb + +def joints2image(joints_position, colors, transparency=False, H=1000, W=1000, nr_joints=49, imtype=np.uint8, grayscale=False, bg_color=(255, 255, 255)): +# joints_position: [17*2] + nr_joints = joints_position.shape[0] + + if nr_joints == 49: # full joints(49): basic(15) + eyes(2) + toes(2) + hands(30) + limbSeq = [[0, 1], [1, 2], [1, 5], [1, 8], [2, 3], [3, 4], [5, 6], [6, 7], \ + [8, 9], [8, 13], [9, 10], [10, 11], [11, 12], [13, 14], [14, 15], [15, 16], + ]#[0, 17], [0, 18]] #ignore eyes + + L = rgb2rgba(colors[0]) if transparency else colors[0] + M = rgb2rgba(colors[1]) if transparency else colors[1] + R = rgb2rgba(colors[2]) if transparency else colors[2] + + colors_joints = [M, M, L, L, L, R, R, + R, M, L, L, L, L, R, R, R, + R, R, L] + [L] * 15 + [R] * 15 + + colors_limbs = [M, L, R, M, L, L, R, + R, L, R, L, L, L, R, R, R, + R, R] + elif nr_joints == 15: # basic joints(15) + (eyes(2)) + limbSeq = [[0, 1], [1, 2], [1, 5], [1, 8], [2, 3], [3, 4], [5, 6], [6, 7], + [8, 9], [8, 12], [9, 10], [10, 11], [12, 13], [13, 14]] + # [0, 15], [0, 16] two eyes are not drawn + + L = rgb2rgba(colors[0]) if transparency else colors[0] + M = rgb2rgba(colors[1]) if transparency else colors[1] + R = rgb2rgba(colors[2]) if transparency else colors[2] + + colors_joints = [M, M, L, L, L, R, R, + R, M, L, L, L, R, R, R] + + colors_limbs = [M, L, R, M, L, L, R, + R, L, R, L, L, R, R] + elif nr_joints == 17: # H36M, 0: 'root', + # 1: 'rhip', + # 2: 'rkne', + # 3: 'rank', + # 4: 'lhip', + # 5: 'lkne', + # 6: 'lank', + # 7: 'belly', + # 8: 'neck', + # 9: 'nose', + # 10: 'head', + # 11: 'lsho', + # 12: 'lelb', + # 13: 'lwri', + # 14: 'rsho', + # 15: 'relb', + # 16: 'rwri' + limbSeq = [[0, 1], [1, 2], [2, 3], [0, 4], [4, 5], [5, 6], [0, 7], [7, 8], [8, 9], [8, 11], [8, 14], [9, 10], [11, 12], [12, 13], [14, 15], [15, 16]] + + L = rgb2rgba(colors[0]) if transparency else colors[0] + M = rgb2rgba(colors[1]) if transparency else colors[1] + R = rgb2rgba(colors[2]) if transparency else colors[2] + + colors_joints = [M, R, R, R, L, L, L, M, M, M, M, L, L, L, R, R, R] + colors_limbs = [R, R, R, L, L, L, M, M, M, L, R, M, L, L, R, R] + + else: + raise ValueError("Only support number of joints be 49 or 17 or 15") + + if transparency: + canvas = np.zeros(shape=(H, W, 4)) + else: + canvas = np.ones(shape=(H, W, 3)) * np.array(bg_color).reshape([1, 1, 3]) + hips = joints_position[0] + neck = joints_position[8] + torso_length = ((hips[1] - neck[1]) ** 2 + (hips[0] - neck[0]) ** 2) ** 0.5 + head_radius = int(torso_length/4.5) + end_effectors_radius = int(torso_length/15) + end_effectors_radius = 7 + joints_radius = 7 + for i in range(0, len(colors_joints)): + if i in (17, 18): + continue + elif i > 18: + radius = 2 + else: + radius = joints_radius + if len(joints_position[i])==3: # If there is confidence, weigh by confidence + weight = joints_position[i][2] + if weight==0: + continue + cv2.circle(canvas, 
(int(joints_position[i][0]),int(joints_position[i][1])), radius, colors_joints[i], thickness=-1) + + stickwidth = 2 + for i in range(len(limbSeq)): + limb = limbSeq[i] + cur_canvas = canvas.copy() + point1_index = limb[0] + point2_index = limb[1] + point1 = joints_position[point1_index] + point2 = joints_position[point2_index] + if len(point1)==3: # If there is confidence, weigh by confidence + limb_weight = min(point1[2], point2[2]) + if limb_weight==0: + bb = bounding_box(canvas) + canvas_cropped = canvas[:,bb[2]:bb[3], :] + continue + X = [point1[1], point2[1]] + Y = [point1[0], point2[0]] + mX = np.mean(X) + mY = np.mean(Y) + length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5 + alpha = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) + polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(alpha), 0, 360, 1) + cv2.fillConvexPoly(cur_canvas, polygon, colors_limbs[i]) + canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) + bb = bounding_box(canvas) + canvas_cropped = canvas[:,bb[2]:bb[3], :] + canvas = canvas.astype(imtype) + canvas_cropped = canvas_cropped.astype(imtype) + if grayscale: + if transparency: + canvas = cv2.cvtColor(canvas, cv2.COLOR_RGBA2GRAY) + canvas_cropped = cv2.cvtColor(canvas_cropped, cv2.COLOR_RGBA2GRAY) + else: + canvas = cv2.cvtColor(canvas, cv2.COLOR_RGB2GRAY) + canvas_cropped = cv2.cvtColor(canvas_cropped, cv2.COLOR_RGB2GRAY) + return [canvas, canvas_cropped] + + +def motion2video(motion, save_path, colors, h=512, w=512, bg_color=(255, 255, 255), transparency=False, motion_tgt=None, fps=25, save_frame=False, grayscale=False, show_progress=True, as_array=False): + nr_joints = motion.shape[0] +# as_array = save_path.endswith(".npy") + vlen = motion.shape[-1] + + out_array = np.zeros([vlen, h, w, 3]) if as_array else None + videowriter = None if as_array else imageio.get_writer(save_path, fps=fps) + + if save_frame: + frames_dir = save_path[:-4] + '-frames' + ensure_dir(frames_dir) + + iterator = range(vlen) + if show_progress: iterator = tqdm(iterator) + for i in iterator: + [img, img_cropped] = joints2image(motion[:, :, i], colors, transparency=transparency, bg_color=bg_color, H=h, W=w, nr_joints=nr_joints, grayscale=grayscale) + if motion_tgt is not None: + [img_tgt, img_tgt_cropped] = joints2image(motion_tgt[:, :, i], colors, transparency=transparency, bg_color=bg_color, H=h, W=w, nr_joints=nr_joints, grayscale=grayscale) + img_ori = img.copy() + img = cv2.addWeighted(img_tgt, 0.3, img_ori, 0.7, 0) + img_cropped = cv2.addWeighted(img_tgt, 0.3, img_ori, 0.7, 0) + bb = bounding_box(img_cropped) + img_cropped = img_cropped[:, bb[2]:bb[3], :] + if save_frame: + save_image(img_cropped, os.path.join(frames_dir, "%04d.png" % i)) + if as_array: out_array[i] = img + else: videowriter.append_data(img) + + if not as_array: + videowriter.close() + + return out_array + +def motion2video_3d(motion, save_path, fps=25, keep_imgs = False): +# motion: (17,3,N) + videowriter = imageio.get_writer(save_path, fps=fps) + vlen = motion.shape[-1] + save_name = save_path.split('.')[0] + frames = [] + joint_pairs = [[0, 1], [1, 2], [2, 3], [0, 4], [4, 5], [5, 6], [0, 7], [7, 8], [8, 9], [8, 11], [8, 14], [9, 10], [11, 12], [12, 13], [14, 15], [15, 16]] + joint_pairs_left = [[8, 11], [11, 12], [12, 13], [0, 4], [4, 5], [5, 6]] + joint_pairs_right = [[8, 14], [14, 15], [15, 16], [0, 1], [1, 2], [2, 3]] + + color_mid = "#00457E" + color_left = "#02315E" + color_right = "#2F70AF" + for f in tqdm(range(vlen)): + j3d = motion[:,:,f] + fig = 
plt.figure(0, figsize=(10, 10)) + ax = plt.axes(projection="3d") + ax.set_xlim(-512, 0) + ax.set_ylim(-256, 256) + ax.set_zlim(-512, 0) + # ax.set_xlabel('X') + # ax.set_ylabel('Y') + # ax.set_zlabel('Z') + ax.view_init(elev=12., azim=80) + plt.tick_params(left = False, right = False , labelleft = False , + labelbottom = False, bottom = False) + for i in range(len(joint_pairs)): + limb = joint_pairs[i] + xs, ys, zs = [np.array([j3d[limb[0], j], j3d[limb[1], j]]) for j in range(3)] + if joint_pairs[i] in joint_pairs_left: + ax.plot(-xs, -zs, -ys, color=color_left, lw=3, marker='o', markerfacecolor='w', markersize=3, markeredgewidth=2) # axis transformation for visualization + elif joint_pairs[i] in joint_pairs_right: + ax.plot(-xs, -zs, -ys, color=color_right, lw=3, marker='o', markerfacecolor='w', markersize=3, markeredgewidth=2) # axis transformation for visualization + else: + ax.plot(-xs, -zs, -ys, color=color_mid, lw=3, marker='o', markerfacecolor='w', markersize=3, markeredgewidth=2) # axis transformation for visualization + + frame_vis = get_img_from_fig(fig) + videowriter.append_data(frame_vis) + plt.close() + videowriter.close() + +def motion2video_mesh(motion, save_path, fps=25, keep_imgs = False, draw_face=True): + videowriter = imageio.get_writer(save_path, fps=fps) + vlen = motion.shape[-1] + draw_skele = (motion.shape[0]==17) + save_name = save_path.split('.')[0] + smpl_faces = get_smpl_faces() + frames = [] + joint_pairs = [[0, 1], [1, 2], [2, 3], [0, 4], [4, 5], [5, 6], [0, 7], [7, 8], [8, 9], [8, 11], [8, 14], [9, 10], [11, 12], [12, 13], [14, 15], [15, 16]] + + + X, Y, Z = motion[:, 0], motion[:, 1], motion[:, 2] + max_range = np.array([X.max()-X.min(), Y.max()-Y.min(), Z.max()-Z.min()]).max() / 2.0 + mid_x = (X.max()+X.min()) * 0.5 + mid_y = (Y.max()+Y.min()) * 0.5 + mid_z = (Z.max()+Z.min()) * 0.5 + + for f in tqdm(range(vlen)): + j3d = motion[:,:,f] + plt.gca().set_axis_off() + plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0) + plt.gca().xaxis.set_major_locator(plt.NullLocator()) + plt.gca().yaxis.set_major_locator(plt.NullLocator()) + fig = plt.figure(0, figsize=(8, 8)) + ax = plt.axes(projection="3d", proj_type = 'ortho') + ax.set_xlim(mid_x - max_range, mid_x + max_range) + ax.set_ylim(mid_y - max_range, mid_y + max_range) + ax.set_zlim(mid_z - max_range, mid_z + max_range) + ax.view_init(elev=-90, azim=-90) + plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0) + plt.margins(0, 0, 0) + plt.gca().xaxis.set_major_locator(plt.NullLocator()) + plt.gca().yaxis.set_major_locator(plt.NullLocator()) + plt.axis('off') + plt.xticks([]) + plt.yticks([]) + + # plt.savefig("filename.png", transparent=True, bbox_inches="tight", pad_inches=0) + + if draw_skele: + for i in range(len(joint_pairs)): + limb = joint_pairs[i] + xs, ys, zs = [np.array([j3d[limb[0], j], j3d[limb[1], j]]) for j in range(3)] + ax.plot(-xs, -zs, -ys, c=[0,0,0], lw=3, marker='o', markerfacecolor='w', markersize=3, markeredgewidth=2) # axis transformation for visualization + elif draw_face: + ax.plot_trisurf(j3d[:, 0], j3d[:, 1], triangles=smpl_faces, Z=j3d[:, 2], color=(166/255.0,188/255.0,218/255.0,0.9)) + else: + ax.scatter(j3d[:, 0], j3d[:, 1], j3d[:, 2], s=3, c='w', edgecolors='grey') + frame_vis = get_img_from_fig(fig, dpi=128) + plt.cla() + videowriter.append_data(frame_vis) + plt.close() + videowriter.close() + +def save_image(image_numpy, image_path): + image_pil = Image.fromarray(image_numpy) + image_pil.save(image_path) + +def bounding_box(img): + a = 
np.where(img != 0) + bbox = np.min(a[0]), np.max(a[0]), np.min(a[1]), np.max(a[1]) + return bbox diff --git a/mmpose/.mim/configs b/mmpose/.mim/configs new file mode 120000 index 0000000..5992d10 --- /dev/null +++ b/mmpose/.mim/configs @@ -0,0 +1 @@ +../../configs \ No newline at end of file diff --git a/mmpose/.mim/demo b/mmpose/.mim/demo new file mode 120000 index 0000000..bf71256 --- /dev/null +++ b/mmpose/.mim/demo @@ -0,0 +1 @@ +../../demo \ No newline at end of file diff --git a/mmpose/.mim/model-index.yml b/mmpose/.mim/model-index.yml new file mode 120000 index 0000000..a18c0b3 --- /dev/null +++ b/mmpose/.mim/model-index.yml @@ -0,0 +1 @@ +../../model-index.yml \ No newline at end of file diff --git a/mmpose/.mim/tools b/mmpose/.mim/tools new file mode 120000 index 0000000..31941e9 --- /dev/null +++ b/mmpose/.mim/tools @@ -0,0 +1 @@ +../../tools \ No newline at end of file diff --git a/mmpose/__init__.py b/mmpose/__init__.py new file mode 100644 index 0000000..e52beb9 --- /dev/null +++ b/mmpose/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv + +from .version import __version__, short_version + + +def digit_version(version_str): + digit_version = [] + for x in version_str.split('.'): + if x.isdigit(): + digit_version.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + digit_version.append(int(patch_version[0]) - 1) + digit_version.append(int(patch_version[1])) + return digit_version + + +mmcv_minimum_version = '1.3.8' +mmcv_maximum_version = '1.5.0' +mmcv_version = digit_version(mmcv.__version__) + + +assert (mmcv_version >= digit_version(mmcv_minimum_version) + and mmcv_version <= digit_version(mmcv_maximum_version)), \ + f'MMCV=={mmcv.__version__} is used but incompatible. ' \ + f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.' + +__all__ = ['__version__', 'short_version'] diff --git a/mmpose/apis/__init__.py b/mmpose/apis/__init__.py new file mode 100644 index 0000000..0e263ed --- /dev/null +++ b/mmpose/apis/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .inference import (inference_bottom_up_pose_model, + inference_top_down_pose_model, init_pose_model, + process_mmdet_results, vis_pose_result) +from .inference_3d import (extract_pose_sequence, inference_interhand_3d_model, + inference_mesh_model, inference_pose_lifter_model, + vis_3d_mesh_result, vis_3d_pose_result) +from .inference_tracking import get_track_id, vis_pose_tracking_result +from .test import multi_gpu_test, single_gpu_test +from .train import init_random_seed, train_model + +__all__ = [ + 'train_model', 'init_pose_model', 'inference_top_down_pose_model', + 'inference_bottom_up_pose_model', 'multi_gpu_test', 'single_gpu_test', + 'vis_pose_result', 'get_track_id', 'vis_pose_tracking_result', + 'inference_pose_lifter_model', 'vis_3d_pose_result', + 'inference_interhand_3d_model', 'extract_pose_sequence', + 'inference_mesh_model', 'vis_3d_mesh_result', 'process_mmdet_results', + 'init_random_seed' +] diff --git a/mmpose/apis/inference.py b/mmpose/apis/inference.py new file mode 100644 index 0000000..5363d40 --- /dev/null +++ b/mmpose/apis/inference.py @@ -0,0 +1,833 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
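digit_version above turns a version string into a list of ints that compares correctly under Python's lexicographic list ordering, treating an rc tag as a pre-release of the preceding patch level; that is what makes the mmcv range assertion work. For example:

print(digit_version('1.3.8'))     # [1, 3, 8]
print(digit_version('1.5.0rc1'))  # [1, 5, -1, 1] -> sorts below the final 1.5.0 release
print(digit_version('1.3.8') <= digit_version('1.4.2') <= digit_version('1.5.0'))  # True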
+import os +import warnings + +import mmcv +import numpy as np +import torch +from mmcv.parallel import collate, scatter +from mmcv.runner import load_checkpoint +from PIL import Image + +from mmpose.core.post_processing import oks_nms +from mmpose.datasets.dataset_info import DatasetInfo +from mmpose.datasets.pipelines import Compose +from mmpose.models import build_posenet +from mmpose.utils.hooks import OutputHook + +os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE' + + +def init_pose_model(config, checkpoint=None, device='cuda:0'): + """Initialize a pose model from config file. + + Args: + config (str or :obj:`mmcv.Config`): Config file path or the config + object. + checkpoint (str, optional): Checkpoint path. If left as None, the model + will not load any weights. + + Returns: + nn.Module: The constructed detector. + """ + if isinstance(config, str): + config = mmcv.Config.fromfile(config) + elif not isinstance(config, mmcv.Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + config.model.pretrained = None + model = build_posenet(config.model) + if checkpoint is not None: + # load model checkpoint + load_checkpoint(model, checkpoint, map_location='cpu') + # save the config in the model for convenience + model.cfg = config + model.to(device) + model.eval() + return model + + +def _xyxy2xywh(bbox_xyxy): + """Transform the bbox format from x1y1x2y2 to xywh. + + Args: + bbox_xyxy (np.ndarray): Bounding boxes (with scores), shaped (n, 4) or + (n, 5). (left, top, right, bottom, [score]) + + Returns: + np.ndarray: Bounding boxes (with scores), + shaped (n, 4) or (n, 5). (left, top, width, height, [score]) + """ + bbox_xywh = bbox_xyxy.copy() + bbox_xywh[:, 2] = bbox_xywh[:, 2] - bbox_xywh[:, 0] + 1 + bbox_xywh[:, 3] = bbox_xywh[:, 3] - bbox_xywh[:, 1] + 1 + + return bbox_xywh + + +def _xywh2xyxy(bbox_xywh): + """Transform the bbox format from xywh to x1y1x2y2. + + Args: + bbox_xywh (ndarray): Bounding boxes (with scores), + shaped (n, 4) or (n, 5). (left, top, width, height, [score]) + Returns: + np.ndarray: Bounding boxes (with scores), shaped (n, 4) or + (n, 5). (left, top, right, bottom, [score]) + """ + bbox_xyxy = bbox_xywh.copy() + bbox_xyxy[:, 2] = bbox_xyxy[:, 2] + bbox_xyxy[:, 0] - 1 + bbox_xyxy[:, 3] = bbox_xyxy[:, 3] + bbox_xyxy[:, 1] - 1 + + return bbox_xyxy + + +def _box2cs(cfg, box): + """This encodes bbox(x,y,w,h) into (center, scale) + + Args: + x, y, w, h + + Returns: + tuple: A tuple containing center and scale. + + - np.ndarray[float32](2,): Center of the bbox (x, y). + - np.ndarray[float32](2,): Scale of the bbox w & h. + """ + + x, y, w, h = box[:4] + input_size = cfg.data_cfg['image_size'] + aspect_ratio = input_size[0] / input_size[1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + scale = scale * 1.25 + + return center, scale + + +def _inference_single_pose_model(model, + img_or_path, + bboxes, + dataset='TopDownCocoDataset', + dataset_info=None, + return_heatmap=False): + """Inference human bounding boxes. + + Note: + - num_bboxes: N + - num_keypoints: K + + Args: + model (nn.Module): The loaded pose model. + img_or_path (str | np.ndarray): Image filename or loaded image. + bboxes (list | np.ndarray): All bounding boxes (with scores), + shaped (N, 4) or (N, 5). 
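The two private helpers convert between the detector's corner format (x1, y1, x2, y2) and the (x, y, w, h) format used internally, and _box2cs then turns a box into the center/scale pair consumed by the top-down pipeline (pixel std 200, scale padded by 1.25). A small round-trip sketch:

import numpy as np

boxes_xyxy = np.array([[100., 150., 300., 550., 0.98]])   # one detection with score
boxes_xywh = _xyxy2xywh(boxes_xyxy)                        # [[100., 150., 201., 401., 0.98]]
boxes_back = _xywh2xyxy(boxes_xywh)                        # recovers the original corners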
(left, top, width, height, [score]) + where N is number of bounding boxes. + dataset (str): Dataset name. Deprecated. + dataset_info (DatasetInfo): A class containing all dataset info. + outputs (list[str] | tuple[str]): Names of layers whose output is + to be returned, default: None + + Returns: + ndarray[NxKx3]: Predicted pose x, y, score. + heatmap[N, K, H, W]: Model output heatmap. + """ + + cfg = model.cfg + device = next(model.parameters()).device + if device.type == 'cpu': + device = -1 + + # build the data pipeline + test_pipeline = Compose(cfg.test_pipeline) + + assert len(bboxes[0]) in [4, 5] + + if dataset_info is not None: + dataset_name = dataset_info.dataset_name + flip_pairs = dataset_info.flip_pairs + else: + warnings.warn( + 'dataset is deprecated.' + 'Please set `dataset_info` in the config.' + 'Check https://github.com/open-mmlab/mmpose/pull/663 for details.', + DeprecationWarning) + # TODO: These will be removed in the later versions. + if dataset in ('TopDownCocoDataset', 'TopDownOCHumanDataset', + 'AnimalMacaqueDataset'): + flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], + [13, 14], [15, 16]] + elif dataset == 'TopDownCocoWholeBodyDataset': + body = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], + [13, 14], [15, 16]] + foot = [[17, 20], [18, 21], [19, 22]] + + face = [[23, 39], [24, 38], [25, 37], [26, 36], [27, 35], [28, 34], + [29, 33], [30, 32], [40, 49], [41, 48], [42, 47], [43, 46], + [44, 45], [54, 58], [55, 57], [59, 68], [60, 67], [61, 66], + [62, 65], [63, 70], [64, 69], [71, 77], [72, 76], [73, 75], + [78, 82], [79, 81], [83, 87], [84, 86], [88, 90]] + + hand = [[91, 112], [92, 113], [93, 114], [94, 115], [95, 116], + [96, 117], [97, 118], [98, 119], [99, 120], [100, 121], + [101, 122], [102, 123], [103, 124], [104, 125], [105, 126], + [106, 127], [107, 128], [108, 129], [109, 130], [110, 131], + [111, 132]] + flip_pairs = body + foot + face + hand + elif dataset == 'TopDownAicDataset': + flip_pairs = [[0, 3], [1, 4], [2, 5], [6, 9], [7, 10], [8, 11]] + elif dataset == 'TopDownMpiiDataset': + flip_pairs = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13]] + elif dataset == 'TopDownMpiiTrbDataset': + flip_pairs = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], + [14, 15], [16, 22], [28, 34], [17, 23], [29, 35], + [18, 24], [30, 36], [19, 25], [31, 37], [20, 26], + [32, 38], [21, 27], [33, 39]] + elif dataset in ('OneHand10KDataset', 'FreiHandDataset', + 'PanopticDataset', 'InterHand2DDataset'): + flip_pairs = [] + elif dataset in 'Face300WDataset': + flip_pairs = [[0, 16], [1, 15], [2, 14], [3, 13], [4, 12], [5, 11], + [6, 10], [7, 9], [17, 26], [18, 25], [19, 24], + [20, 23], [21, 22], [31, 35], [32, 34], [36, 45], + [37, 44], [38, 43], [39, 42], [40, 47], [41, 46], + [48, 54], [49, 53], [50, 52], [61, 63], [60, 64], + [67, 65], [58, 56], [59, 55]] + + elif dataset in 'FaceAFLWDataset': + flip_pairs = [[0, 5], [1, 4], [2, 3], [6, 11], [7, 10], [8, 9], + [12, 14], [15, 17]] + + elif dataset in 'FaceCOFWDataset': + flip_pairs = [[0, 1], [4, 6], [2, 3], [5, 7], [8, 9], [10, 11], + [12, 14], [16, 17], [13, 15], [18, 19], [22, 23]] + + elif dataset in 'FaceWFLWDataset': + flip_pairs = [[0, 32], [1, 31], [2, 30], [3, 29], [4, 28], [5, 27], + [6, 26], [7, 25], [8, 24], [9, 23], [10, 22], + [11, 21], [12, 20], [13, 19], [14, 18], [15, 17], + [33, 46], [34, 45], [35, 44], [36, 43], [37, 42], + [38, 50], [39, 49], [40, 48], [41, 47], [60, 72], + [61, 71], [62, 70], [63, 69], [64, 68], [65, 75], + [66, 74], [67, 73], [55, 59], [56, 58], 
[76, 82], + [77, 81], [78, 80], [87, 83], [86, 84], [88, 92], + [89, 91], [95, 93], [96, 97]] + + elif dataset in 'AnimalFlyDataset': + flip_pairs = [[1, 2], [6, 18], [7, 19], [8, 20], [9, 21], [10, 22], + [11, 23], [12, 24], [13, 25], [14, 26], [15, 27], + [16, 28], [17, 29], [30, 31]] + elif dataset in 'AnimalHorse10Dataset': + flip_pairs = [] + + elif dataset in 'AnimalLocustDataset': + flip_pairs = [[5, 20], [6, 21], [7, 22], [8, 23], [9, 24], + [10, 25], [11, 26], [12, 27], [13, 28], [14, 29], + [15, 30], [16, 31], [17, 32], [18, 33], [19, 34]] + + elif dataset in 'AnimalZebraDataset': + flip_pairs = [[3, 4], [5, 6]] + + elif dataset in 'AnimalPoseDataset': + flip_pairs = [[0, 1], [2, 3], [8, 9], [10, 11], [12, 13], [14, 15], + [16, 17], [18, 19]] + else: + raise NotImplementedError() + dataset_name = dataset + + batch_data = [] + for bbox in bboxes: + center, scale = _box2cs(cfg, bbox) + + # prepare data + data = { + 'center': + center, + 'scale': + scale, + 'bbox_score': + bbox[4] if len(bbox) == 5 else 1, + 'bbox_id': + 0, # need to be assigned if batch_size > 1 + 'dataset': + dataset_name, + 'joints_3d': + np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32), + 'joints_3d_visible': + np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32), + 'rotation': + 0, + 'ann_info': { + 'image_size': np.array(cfg.data_cfg['image_size']), + 'num_joints': cfg.data_cfg['num_joints'], + 'flip_pairs': flip_pairs + } + } + if isinstance(img_or_path, np.ndarray): + data['img'] = img_or_path + else: + data['image_file'] = img_or_path + + data = test_pipeline(data) + batch_data.append(data) + + batch_data = collate(batch_data, samples_per_gpu=len(batch_data)) + batch_data = scatter(batch_data, [device])[0] + + # forward the model + with torch.no_grad(): + result = model( + img=batch_data['img'], + img_metas=batch_data['img_metas'], + return_loss=False, + return_heatmap=return_heatmap) + + return result['preds'], result['output_heatmap'] + + +def inference_top_down_pose_model(model, + img_or_path, + person_results=None, + bbox_thr=None, + format='xywh', + dataset='TopDownCocoDataset', + dataset_info=None, + return_heatmap=False, + outputs=None): + """Inference a single image with a list of person bounding boxes. + + Note: + - num_people: P + - num_keypoints: K + - bbox height: H + - bbox width: W + + Args: + model (nn.Module): The loaded pose model. + img_or_path (str| np.ndarray): Image filename or loaded image. + person_results (list(dict), optional): a list of detected persons that + contains ``bbox`` and/or ``track_id``: + + - ``bbox`` (4, ) or (5, ): The person bounding box, which contains + 4 box coordinates (and score). + - ``track_id`` (int): The unique id for each human instance. If + not provided, a dummy person result with a bbox covering + the entire image will be used. Default: None. + bbox_thr (float | None): Threshold for bounding boxes. Only bboxes + with higher scores will be fed into the pose detector. + If bbox_thr is None, all boxes will be used. + format (str): bbox format ('xyxy' | 'xywh'). Default: 'xywh'. + + - `xyxy` means (left, top, right, bottom), + - `xywh` means (left, top, width, height). + dataset (str): Dataset name, e.g. 'TopDownCocoDataset'. + It is deprecated. Please use dataset_info instead. + dataset_info (DatasetInfo): A class containing all dataset info. + return_heatmap (bool) : Flag to return heatmap, default: False + outputs (list(str) | tuple(str)) : Names of layers whose outputs + need to be returned. Default: None. 
+ + Returns: + tuple: + - pose_results (list[dict]): The bbox & pose info. \ + Each item in the list is a dictionary, \ + containing the bbox: (left, top, right, bottom, [score]) \ + and the pose (ndarray[Kx3]): x, y, score. + - returned_outputs (list[dict[np.ndarray[N, K, H, W] | \ + torch.Tensor[N, K, H, W]]]): \ + Output feature maps from layers specified in `outputs`. \ + Includes 'heatmap' if `return_heatmap` is True. + """ + # get dataset info + if (dataset_info is None and hasattr(model, 'cfg') + and 'dataset_info' in model.cfg): + dataset_info = DatasetInfo(model.cfg.dataset_info) + if dataset_info is None: + warnings.warn( + 'dataset is deprecated.' + 'Please set `dataset_info` in the config.' + 'Check https://github.com/open-mmlab/mmpose/pull/663' + ' for details.', DeprecationWarning) + + # only two kinds of bbox format is supported. + assert format in ['xyxy', 'xywh'] + + pose_results = [] + returned_outputs = [] + + if person_results is None: + # create dummy person results + if isinstance(img_or_path, str): + width, height = Image.open(img_or_path).size + else: + height, width = img_or_path.shape[:2] + person_results = [{'bbox': np.array([0, 0, width, height])}] + + if len(person_results) == 0: + return pose_results, returned_outputs + + # Change for-loop preprocess each bbox to preprocess all bboxes at once. + bboxes = np.array([box['bbox'] for box in person_results]) + + # Select bboxes by score threshold + if bbox_thr is not None: + assert bboxes.shape[1] == 5 + valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0] + bboxes = bboxes[valid_idx] + person_results = [person_results[i] for i in valid_idx] + + if format == 'xyxy': + bboxes_xyxy = bboxes + bboxes_xywh = _xyxy2xywh(bboxes) + else: + # format is already 'xywh' + bboxes_xywh = bboxes + bboxes_xyxy = _xywh2xyxy(bboxes) + + # if bbox_thr remove all bounding box + if len(bboxes_xywh) == 0: + return [], [] + + with OutputHook(model, outputs=outputs, as_tensor=False) as h: + # poses is results['pred'] # N x 17x 3 + poses, heatmap = _inference_single_pose_model( + model, + img_or_path, + bboxes_xywh, + dataset=dataset, + dataset_info=dataset_info, + return_heatmap=return_heatmap) + + if return_heatmap: + h.layer_outputs['heatmap'] = heatmap + + returned_outputs.append(h.layer_outputs) + + assert len(poses) == len(person_results), print( + len(poses), len(person_results), len(bboxes_xyxy)) + for pose, person_result, bbox_xyxy in zip(poses, person_results, + bboxes_xyxy): + pose_result = person_result.copy() + pose_result['keypoints'] = pose + pose_result['bbox'] = bbox_xyxy + pose_results.append(pose_result) + + return pose_results, returned_outputs + + +def inference_bottom_up_pose_model(model, + img_or_path, + dataset='BottomUpCocoDataset', + dataset_info=None, + pose_nms_thr=0.9, + return_heatmap=False, + outputs=None): + """Inference a single image with a bottom-up pose model. + + Note: + - num_people: P + - num_keypoints: K + - bbox height: H + - bbox width: W + + Args: + model (nn.Module): The loaded pose model. + img_or_path (str| np.ndarray): Image filename or loaded image. + dataset (str): Dataset name, e.g. 'BottomUpCocoDataset'. + It is deprecated. Please use dataset_info instead. + dataset_info (DatasetInfo): A class containing all dataset info. + pose_nms_thr (float): retain oks overlap < pose_nms_thr, default: 0.9. + return_heatmap (bool) : Flag to return heatmap, default: False. + outputs (list(str) | tuple(str)) : Names of layers whose outputs + need to be returned, default: None. 
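+
+    Example (illustrative sketch; the config, checkpoint and image paths are
+        placeholders)::
+
+        from mmpose.apis import (init_pose_model,
+                                 inference_bottom_up_pose_model)
+
+        pose_model = init_pose_model(
+            'path/to/bottom_up_config.py',       # placeholder config
+            'path/to/bottom_up_checkpoint.pth',  # placeholder checkpoint
+            device='cuda:0')
+        pose_results, _ = inference_bottom_up_pose_model(
+            pose_model, 'demo.jpg', pose_nms_thr=0.9)
+        # pose_results[i] holds 'keypoints' (ndarray[K, 3]), 'score' and 'area'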
+ + Returns: + tuple: + - pose_results (list[np.ndarray]): The predicted pose info. \ + The length of the list is the number of people (P). \ + Each item in the list is a ndarray, containing each \ + person's pose (np.ndarray[Kx3]): x, y, score. + - returned_outputs (list[dict[np.ndarray[N, K, H, W] | \ + torch.Tensor[N, K, H, W]]]): \ + Output feature maps from layers specified in `outputs`. \ + Includes 'heatmap' if `return_heatmap` is True. + """ + # get dataset info + if (dataset_info is None and hasattr(model, 'cfg') + and 'dataset_info' in model.cfg): + dataset_info = DatasetInfo(model.cfg.dataset_info) + + if dataset_info is not None: + dataset_name = dataset_info.dataset_name + flip_index = dataset_info.flip_index + sigmas = getattr(dataset_info, 'sigmas', None) + else: + warnings.warn( + 'dataset is deprecated.' + 'Please set `dataset_info` in the config.' + 'Check https://github.com/open-mmlab/mmpose/pull/663 for details.', + DeprecationWarning) + assert (dataset == 'BottomUpCocoDataset') + dataset_name = dataset + flip_index = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15] + sigmas = None + + pose_results = [] + returned_outputs = [] + + cfg = model.cfg + device = next(model.parameters()).device + if device.type == 'cpu': + device = -1 + + # build the data pipeline + test_pipeline = Compose(cfg.test_pipeline) + + # prepare data + data = { + 'dataset': dataset_name, + 'ann_info': { + 'image_size': np.array(cfg.data_cfg['image_size']), + 'num_joints': cfg.data_cfg['num_joints'], + 'flip_index': flip_index, + } + } + if isinstance(img_or_path, np.ndarray): + data['img'] = img_or_path + else: + data['image_file'] = img_or_path + + data = test_pipeline(data) + data = collate([data], samples_per_gpu=1) + data = scatter(data, [device])[0] + + with OutputHook(model, outputs=outputs, as_tensor=False) as h: + # forward the model + with torch.no_grad(): + result = model( + img=data['img'], + img_metas=data['img_metas'], + return_loss=False, + return_heatmap=return_heatmap) + + if return_heatmap: + h.layer_outputs['heatmap'] = result['output_heatmap'] + + returned_outputs.append(h.layer_outputs) + + for idx, pred in enumerate(result['preds']): + area = (np.max(pred[:, 0]) - np.min(pred[:, 0])) * ( + np.max(pred[:, 1]) - np.min(pred[:, 1])) + pose_results.append({ + 'keypoints': pred[:, :3], + 'score': result['scores'][idx], + 'area': area, + }) + + # pose nms + score_per_joint = cfg.model.test_cfg.get('score_per_joint', False) + keep = oks_nms( + pose_results, + pose_nms_thr, + sigmas, + score_per_joint=score_per_joint) + pose_results = [pose_results[_keep] for _keep in keep] + + return pose_results, returned_outputs + + +def vis_pose_result(model, + img, + result, + radius=4, + thickness=1, + kpt_score_thr=0.3, + bbox_color='green', + dataset='TopDownCocoDataset', + dataset_info=None, + show=False, + out_file=None): + """Visualize the detection results on the image. + + Args: + model (nn.Module): The loaded detector. + img (str | np.ndarray): Image filename or loaded image. + result (list[dict]): The results to draw over `img` + (bbox_result, pose_result). + radius (int): Radius of circles. + thickness (int): Thickness of lines. + kpt_score_thr (float): The threshold to visualize the keypoints. + skeleton (list[tuple()]): Default None. + show (bool): Whether to show the image. Default True. + out_file (str|None): The filename of the output visualization image. 
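+
+    Example (illustrative sketch; assumes ``pose_model`` and ``pose_results``
+        come from ``inference_top_down_pose_model`` above, and the image and
+        output paths are placeholders)::
+
+        vis_pose_result(
+            pose_model, 'demo.jpg', pose_results,
+            radius=4, thickness=1, kpt_score_thr=0.3,
+            show=False, out_file='vis_demo.jpg')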
+ """ + + # get dataset info + if (dataset_info is None and hasattr(model, 'cfg') + and 'dataset_info' in model.cfg): + dataset_info = DatasetInfo(model.cfg.dataset_info) + + if dataset_info is not None: + skeleton = dataset_info.skeleton + pose_kpt_color = dataset_info.pose_kpt_color + pose_link_color = dataset_info.pose_link_color + else: + warnings.warn( + 'dataset is deprecated.' + 'Please set `dataset_info` in the config.' + 'Check https://github.com/open-mmlab/mmpose/pull/663 for details.', + DeprecationWarning) + # TODO: These will be removed in the later versions. + palette = np.array([[255, 128, 0], [255, 153, 51], [255, 178, 102], + [230, 230, 0], [255, 153, 255], [153, 204, 255], + [255, 102, 255], [255, 51, 255], [102, 178, 255], + [51, 153, 255], [255, 153, 153], [255, 102, 102], + [255, 51, 51], [153, 255, 153], [102, 255, 102], + [51, 255, 51], [0, 255, 0], [0, 0, 255], + [255, 0, 0], [255, 255, 255]]) + + if dataset in ('TopDownCocoDataset', 'BottomUpCocoDataset', + 'TopDownOCHumanDataset', 'AnimalMacaqueDataset'): + # show the results + skeleton = [[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], + [5, 11], [6, 12], [5, 6], [5, 7], [6, 8], [7, 9], + [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], + [3, 5], [4, 6]] + + pose_link_color = palette[[ + 0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16 + ]] + pose_kpt_color = palette[[ + 16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0 + ]] + + elif dataset == 'TopDownCocoWholeBodyDataset': + # show the results + skeleton = [[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], + [5, 11], [6, 12], [5, 6], [5, 7], [6, 8], [7, 9], + [8, 10], [1, 2], [0, 1], [0, 2], + [1, 3], [2, 4], [3, 5], [4, 6], [15, 17], [15, 18], + [15, 19], [16, 20], [16, 21], [16, 22], [91, 92], + [92, 93], [93, 94], [94, 95], [91, 96], [96, 97], + [97, 98], [98, 99], [91, 100], [100, 101], [101, 102], + [102, 103], [91, 104], [104, 105], [105, 106], + [106, 107], [91, 108], [108, 109], [109, 110], + [110, 111], [112, 113], [113, 114], [114, 115], + [115, 116], [112, 117], [117, 118], [118, 119], + [119, 120], [112, 121], [121, 122], [122, 123], + [123, 124], [112, 125], [125, 126], [126, 127], + [127, 128], [112, 129], [129, 130], [130, 131], + [131, 132]] + + pose_link_color = palette[[ + 0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16 + ] + [16, 16, 16, 16, 16, 16] + [ + 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16, 16, + 16 + ] + [ + 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16, 16, + 16 + ]] + pose_kpt_color = palette[ + [16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0] + + [0, 0, 0, 0, 0, 0] + [19] * (68 + 42)] + + elif dataset == 'TopDownAicDataset': + skeleton = [[2, 1], [1, 0], [0, 13], [13, 3], [3, 4], [4, 5], + [8, 7], [7, 6], [6, 9], [9, 10], [10, 11], [12, 13], + [0, 6], [3, 9]] + + pose_link_color = palette[[ + 9, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 0, 7, 7 + ]] + pose_kpt_color = palette[[ + 9, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 0, 0 + ]] + + elif dataset == 'TopDownMpiiDataset': + skeleton = [[0, 1], [1, 2], [2, 6], [6, 3], [3, 4], [4, 5], [6, 7], + [7, 8], [8, 9], [8, 12], [12, 11], [11, 10], [8, 13], + [13, 14], [14, 15]] + + pose_link_color = palette[[ + 16, 16, 16, 16, 16, 16, 7, 7, 0, 9, 9, 9, 9, 9, 9 + ]] + pose_kpt_color = palette[[ + 16, 16, 16, 16, 16, 16, 7, 7, 0, 0, 9, 9, 9, 9, 9, 9 + ]] + + elif dataset == 'TopDownMpiiTrbDataset': + skeleton = [[12, 13], [13, 0], [13, 1], [0, 2], [1, 3], [2, 4], + [3, 5], [0, 6], [1, 7], [6, 7], [6, 8], [7, + 9], [8, 10], + [9, 
11], [14, 15], [16, 17], [18, 19], [20, 21], + [22, 23], [24, 25], [26, 27], [28, 29], [30, 31], + [32, 33], [34, 35], [36, 37], [38, 39]] + + pose_link_color = palette[[16] * 14 + [19] * 13] + pose_kpt_color = palette[[16] * 14 + [0] * 26] + + elif dataset in ('OneHand10KDataset', 'FreiHandDataset', + 'PanopticDataset'): + skeleton = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], + [7, 8], [0, 9], [9, 10], [10, 11], [11, 12], [0, 13], + [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], + [18, 19], [19, 20]] + + pose_link_color = palette[[ + 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16, 16, + 16 + ]] + pose_kpt_color = palette[[ + 0, 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16, + 16, 16 + ]] + + elif dataset == 'InterHand2DDataset': + skeleton = [[0, 1], [1, 2], [2, 3], [4, 5], [5, 6], [6, 7], [8, 9], + [9, 10], [10, 11], [12, 13], [13, 14], [14, 15], + [16, 17], [17, 18], [18, 19], [3, 20], [7, 20], + [11, 20], [15, 20], [19, 20]] + + pose_link_color = palette[[ + 0, 0, 0, 4, 4, 4, 8, 8, 8, 12, 12, 12, 16, 16, 16, 0, 4, 8, 12, + 16 + ]] + pose_kpt_color = palette[[ + 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16, 16, + 16, 0 + ]] + + elif dataset == 'Face300WDataset': + # show the results + skeleton = [] + + pose_link_color = palette[[]] + pose_kpt_color = palette[[19] * 68] + kpt_score_thr = 0 + + elif dataset == 'FaceAFLWDataset': + # show the results + skeleton = [] + + pose_link_color = palette[[]] + pose_kpt_color = palette[[19] * 19] + kpt_score_thr = 0 + + elif dataset == 'FaceCOFWDataset': + # show the results + skeleton = [] + + pose_link_color = palette[[]] + pose_kpt_color = palette[[19] * 29] + kpt_score_thr = 0 + + elif dataset == 'FaceWFLWDataset': + # show the results + skeleton = [] + + pose_link_color = palette[[]] + pose_kpt_color = palette[[19] * 98] + kpt_score_thr = 0 + + elif dataset == 'AnimalHorse10Dataset': + skeleton = [[0, 1], [1, 12], [12, 16], [16, 21], [21, 17], + [17, 11], [11, 10], [10, 8], [8, 9], [9, 12], [2, 3], + [3, 4], [5, 6], [6, 7], [13, 14], [14, 15], [18, 19], + [19, 20]] + + pose_link_color = palette[[4] * 10 + [6] * 2 + [6] * 2 + [7] * 2 + + [7] * 2] + pose_kpt_color = palette[[ + 4, 4, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 7, 7, 7, 4, 4, 7, 7, 7, + 4 + ]] + + elif dataset == 'AnimalFlyDataset': + skeleton = [[1, 0], [2, 0], [3, 0], [4, 3], [5, 4], [7, 6], [8, 7], + [9, 8], [11, 10], [12, 11], [13, 12], [15, 14], + [16, 15], [17, 16], [19, 18], [20, 19], [21, 20], + [23, 22], [24, 23], [25, 24], [27, 26], [28, 27], + [29, 28], [30, 3], [31, 3]] + + pose_link_color = palette[[0] * 25] + pose_kpt_color = palette[[0] * 32] + + elif dataset == 'AnimalLocustDataset': + skeleton = [[1, 0], [2, 1], [3, 2], [4, 3], [6, 5], [7, 6], [9, 8], + [10, 9], [11, 10], [13, 12], [14, 13], [15, 14], + [17, 16], [18, 17], [19, 18], [21, 20], [22, 21], + [24, 23], [25, 24], [26, 25], [28, 27], [29, 28], + [30, 29], [32, 31], [33, 32], [34, 33]] + + pose_link_color = palette[[0] * 26] + pose_kpt_color = palette[[0] * 35] + + elif dataset == 'AnimalZebraDataset': + skeleton = [[1, 0], [2, 1], [3, 2], [4, 2], [5, 7], [6, 7], [7, 2], + [8, 7]] + + pose_link_color = palette[[0] * 8] + pose_kpt_color = palette[[0] * 9] + + elif dataset in 'AnimalPoseDataset': + skeleton = [[0, 1], [0, 2], [1, 3], [0, 4], [1, 4], [4, 5], [5, 7], + [6, 7], [5, 8], [8, 12], [12, 16], [5, 9], [9, 13], + [13, 17], [6, 10], [10, 14], [14, 18], [6, 11], + [11, 15], [15, 19]] + + pose_link_color = palette[[0] * 20] + pose_kpt_color = palette[[0] * 20] 
+ else: + NotImplementedError() + + if hasattr(model, 'module'): + model = model.module + + img = model.show_result( + img, + result, + skeleton, + radius=radius, + thickness=thickness, + pose_kpt_color=pose_kpt_color, + pose_link_color=pose_link_color, + kpt_score_thr=kpt_score_thr, + bbox_color=bbox_color, + show=show, + out_file=out_file) + + return img + + +def process_mmdet_results(mmdet_results, cat_id=1): + """Process mmdet results, and return a list of bboxes. + + Args: + mmdet_results (list|tuple): mmdet results. + cat_id (int): category id (default: 1 for human) + + Returns: + person_results (list): a list of detected bounding boxes + """ + if isinstance(mmdet_results, tuple): + det_results = mmdet_results[0] + else: + det_results = mmdet_results + + bboxes = det_results[cat_id - 1] + + person_results = [] + for bbox in bboxes: + person = {} + person['bbox'] = bbox + person_results.append(person) + + return person_results diff --git a/mmpose/apis/inference_3d.py b/mmpose/apis/inference_3d.py new file mode 100644 index 0000000..f59f20a --- /dev/null +++ b/mmpose/apis/inference_3d.py @@ -0,0 +1,791 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np +import torch +from mmcv.parallel import collate, scatter + +from mmpose.datasets.pipelines import Compose +from .inference import _box2cs, _xywh2xyxy, _xyxy2xywh + + +def extract_pose_sequence(pose_results, frame_idx, causal, seq_len, step=1): + """Extract the target frame from 2D pose results, and pad the sequence to a + fixed length. + + Args: + pose_results (list[list[dict]]): Multi-frame pose detection results + stored in a nested list. Each element of the outer list is the + pose detection results of a single frame, and each element of the + inner list is the pose information of one person, which contains: + + - keypoints (ndarray[K, 2 or 3]): x, y, [score] + - track_id (int): unique id of each person, required \ + when ``with_track_id==True``. + - bbox ((4, ) or (5, )): left, right, top, bottom, [score] + + frame_idx (int): The index of the frame in the original video. + causal (bool): If True, the target frame is the last frame in + a sequence. Otherwise, the target frame is in the middle of + a sequence. + seq_len (int): The number of frames in the input sequence. + step (int): Step size to extract frames from the video. + + Returns: + list[list[dict]]: Multi-frame pose detection results stored \ + in a nested list with a length of seq_len. + """ + + if causal: + frames_left = seq_len - 1 + frames_right = 0 + else: + frames_left = (seq_len - 1) // 2 + frames_right = frames_left + num_frames = len(pose_results) + + # get the padded sequence + pad_left = max(0, frames_left - frame_idx // step) + pad_right = max(0, frames_right - (num_frames - 1 - frame_idx) // step) + start = max(frame_idx % step, frame_idx - frames_left * step) + end = min(num_frames - (num_frames - 1 - frame_idx) % step, + frame_idx + frames_right * step + 1) + pose_results_seq = [pose_results[0]] * pad_left + \ + pose_results[start:end:step] + [pose_results[-1]] * pad_right + return pose_results_seq + + +def _gather_pose_lifter_inputs(pose_results, + bbox_center, + bbox_scale, + norm_pose_2d=False): + """Gather input data (keypoints and track_id) for pose lifter model. 
+ + Note: + - The temporal length of the pose detection results: T + - The number of the person instances: N + - The number of the keypoints: K + - The channel number of each keypoint: C + + Args: + pose_results (List[List[Dict]]): Multi-frame pose detection results + stored in a nested list. Each element of the outer list is the + pose detection results of a single frame, and each element of the + inner list is the pose information of one person, which contains: + + - keypoints (ndarray[K, 2 or 3]): x, y, [score] + - track_id (int): unique id of each person, required when + ``with_track_id==True``` + - bbox ((4, ) or (5, )): left, right, top, bottom, [score] + + bbox_center (ndarray[1, 2]): x, y. The average center coordinate of the + bboxes in the dataset. + bbox_scale (int|float): The average scale of the bboxes in the dataset. + norm_pose_2d (bool): If True, scale the bbox (along with the 2D + pose) to bbox_scale, and move the bbox (along with the 2D pose) to + bbox_center. Default: False. + + Returns: + list[list[dict]]: Multi-frame pose detection results + stored in a nested list. Each element of the outer list is the + pose detection results of a single frame, and each element of the + inner list is the pose information of one person, which contains: + + - keypoints (ndarray[K, 2 or 3]): x, y, [score] + - track_id (int): unique id of each person, required when + ``with_track_id==True`` + """ + sequence_inputs = [] + for frame in pose_results: + frame_inputs = [] + for res in frame: + inputs = dict() + + if norm_pose_2d: + bbox = res['bbox'] + center = np.array([[(bbox[0] + bbox[2]) / 2, + (bbox[1] + bbox[3]) / 2]]) + scale = max(bbox[2] - bbox[0], bbox[3] - bbox[1]) + inputs['keypoints'] = (res['keypoints'][:, :2] - center) \ + / scale * bbox_scale + bbox_center + else: + inputs['keypoints'] = res['keypoints'][:, :2] + + if res['keypoints'].shape[1] == 3: + inputs['keypoints'] = np.concatenate( + [inputs['keypoints'], res['keypoints'][:, 2:]], axis=1) + + if 'track_id' in res: + inputs['track_id'] = res['track_id'] + frame_inputs.append(inputs) + sequence_inputs.append(frame_inputs) + return sequence_inputs + + +def _collate_pose_sequence(pose_results, with_track_id=True, target_frame=-1): + """Reorganize multi-frame pose detection results into individual pose + sequences. + + Note: + - The temporal length of the pose detection results: T + - The number of the person instances: N + - The number of the keypoints: K + - The channel number of each keypoint: C + + Args: + pose_results (List[List[Dict]]): Multi-frame pose detection results + stored in a nested list. Each element of the outer list is the + pose detection results of a single frame, and each element of the + inner list is the pose information of one person, which contains: + + - keypoints (ndarray[K, 2 or 3]): x, y, [score] + - track_id (int): unique id of each person, required when + ``with_track_id==True``` + + with_track_id (bool): If True, the element in pose_results is expected + to contain "track_id", which will be used to gather the pose + sequence of a person from multiple frames. Otherwise, the pose + results in each frame are expected to have a consistent number and + order of identities. Default is True. + target_frame (int): The index of the target frame. Default: -1. 
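+
+    Returns:
+        list[dict]: Per-person pose sequences. Each element keeps the static
+        information of that person in the target frame and adds a
+        "keypoints" array of shape [T, K, C] gathered across the frames.
+
+    Example (illustrative sketch with toy values)::
+
+        # two frames, one person tracked with id 7
+        frame0 = [{'track_id': 7, 'keypoints': np.zeros((17, 3))}]
+        frame1 = [{'track_id': 7, 'keypoints': np.ones((17, 3))}]
+        seqs = _collate_pose_sequence([frame0, frame1],
+                                      with_track_id=True, target_frame=-1)
+        # seqs[0]['keypoints'].shape == (2, 17, 3)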
+ """ + T = len(pose_results) + assert T > 0 + + target_frame = (T + target_frame) % T # convert negative index to positive + + N = len(pose_results[target_frame]) # use identities in the target frame + if N == 0: + return [] + + K, C = pose_results[target_frame][0]['keypoints'].shape + + track_ids = None + if with_track_id: + track_ids = [res['track_id'] for res in pose_results[target_frame]] + + pose_sequences = [] + for idx in range(N): + pose_seq = dict() + # gather static information + for k, v in pose_results[target_frame][idx].items(): + if k != 'keypoints': + pose_seq[k] = v + # gather keypoints + if not with_track_id: + pose_seq['keypoints'] = np.stack( + [frame[idx]['keypoints'] for frame in pose_results]) + else: + keypoints = np.zeros((T, K, C), dtype=np.float32) + keypoints[target_frame] = pose_results[target_frame][idx][ + 'keypoints'] + # find the left most frame containing track_ids[idx] + for frame_idx in range(target_frame - 1, -1, -1): + contains_idx = False + for res in pose_results[frame_idx]: + if res['track_id'] == track_ids[idx]: + keypoints[frame_idx] = res['keypoints'] + contains_idx = True + break + if not contains_idx: + # replicate the left most frame + keypoints[:frame_idx + 1] = keypoints[frame_idx + 1] + break + # find the right most frame containing track_idx[idx] + for frame_idx in range(target_frame + 1, T): + contains_idx = False + for res in pose_results[frame_idx]: + if res['track_id'] == track_ids[idx]: + keypoints[frame_idx] = res['keypoints'] + contains_idx = True + break + if not contains_idx: + # replicate the right most frame + keypoints[frame_idx + 1:] = keypoints[frame_idx] + break + pose_seq['keypoints'] = keypoints + pose_sequences.append(pose_seq) + + return pose_sequences + + +def inference_pose_lifter_model(model, + pose_results_2d, + dataset=None, + dataset_info=None, + with_track_id=True, + image_size=None, + norm_pose_2d=False): + """Inference 3D pose from 2D pose sequences using a pose lifter model. + + Args: + model (nn.Module): The loaded pose lifter model + pose_results_2d (list[list[dict]]): The 2D pose sequences stored in a + nested list. Each element of the outer list is the 2D pose results + of a single frame, and each element of the inner list is the 2D + pose of one person, which contains: + + - "keypoints" (ndarray[K, 2 or 3]): x, y, [score] + - "track_id" (int) + dataset (str): Dataset name, e.g. 'Body3DH36MDataset' + with_track_id: If True, the element in pose_results_2d is expected to + contain "track_id", which will be used to gather the pose sequence + of a person from multiple frames. Otherwise, the pose results in + each frame are expected to have a consistent number and order of + identities. Default is True. + image_size (tuple|list): image width, image height. If None, image size + will not be contained in dict ``data``. + norm_pose_2d (bool): If True, scale the bbox (along with the 2D + pose) to the average bbox scale of the dataset, and move the bbox + (along with the 2D pose) to the average bbox center of the dataset. + + Returns: + list[dict]: 3D pose inference results. Each element is the result of \ + an instance, which contains: + + - "keypoints_3d" (ndarray[K, 3]): predicted 3D keypoints + - "keypoints" (ndarray[K, 2 or 3]): from the last frame in \ + ``pose_results_2d``. + - "track_id" (int): from the last frame in ``pose_results_2d``. \ + If there is no valid instance, an empty list will be \ + returned. 
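+
+    Example (illustrative sketch; ``lift_model`` and the per-frame 2D pose
+        results are assumed to come from a pose lifter checkpoint and the 2D
+        inference/tracking functions, and are placeholders here)::
+
+        pose_seq_2d = extract_pose_sequence(
+            pose_results_2d_all_frames, frame_idx=10,
+            causal=lift_model.causal, seq_len=27, step=1)
+        pose_results_3d = inference_pose_lifter_model(
+            lift_model, pose_seq_2d, dataset='Body3DH36MDataset',
+            with_track_id=True, image_size=(1920, 1080),
+            norm_pose_2d=True)
+        # each result carries 'keypoints_3d' (ndarray[K, 4]): x, y, z, score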
+ """ + cfg = model.cfg + test_pipeline = Compose(cfg.test_pipeline) + + device = next(model.parameters()).device + if device.type == 'cpu': + device = -1 + + if dataset_info is not None: + flip_pairs = dataset_info.flip_pairs + assert 'stats_info' in dataset_info._dataset_info + bbox_center = dataset_info._dataset_info['stats_info']['bbox_center'] + bbox_scale = dataset_info._dataset_info['stats_info']['bbox_scale'] + else: + warnings.warn( + 'dataset is deprecated.' + 'Please set `dataset_info` in the config.' + 'Check https://github.com/open-mmlab/mmpose/pull/663 for details.', + DeprecationWarning) + # TODO: These will be removed in the later versions. + if dataset == 'Body3DH36MDataset': + flip_pairs = [[1, 4], [2, 5], [3, 6], [11, 14], [12, 15], [13, 16]] + bbox_center = np.array([[528, 427]], dtype=np.float32) + bbox_scale = 400 + else: + raise NotImplementedError() + + target_idx = -1 if model.causal else len(pose_results_2d) // 2 + pose_lifter_inputs = _gather_pose_lifter_inputs(pose_results_2d, + bbox_center, bbox_scale, + norm_pose_2d) + pose_sequences_2d = _collate_pose_sequence(pose_lifter_inputs, + with_track_id, target_idx) + + if not pose_sequences_2d: + return [] + + batch_data = [] + for seq in pose_sequences_2d: + pose_2d = seq['keypoints'].astype(np.float32) + T, K, C = pose_2d.shape + + input_2d = pose_2d[..., :2] + input_2d_visible = pose_2d[..., 2:3] + if C > 2: + input_2d_visible = pose_2d[..., 2:3] + else: + input_2d_visible = np.ones((T, K, 1), dtype=np.float32) + + # TODO: Will be removed in the later versions + # Dummy 3D input + # This is for compatibility with configs in mmpose<=v0.14.0, where a + # 3D input is required to generate denormalization parameters. This + # part will be removed in the future. + target = np.zeros((K, 3), dtype=np.float32) + target_visible = np.ones((K, 1), dtype=np.float32) + + # Dummy image path + # This is for compatibility with configs in mmpose<=v0.14.0, where + # target_image_path is required. This part will be removed in the + # future. + target_image_path = None + + data = { + 'input_2d': input_2d, + 'input_2d_visible': input_2d_visible, + 'target': target, + 'target_visible': target_visible, + 'target_image_path': target_image_path, + 'ann_info': { + 'num_joints': K, + 'flip_pairs': flip_pairs + } + } + + if image_size is not None: + assert len(image_size) == 2 + data['image_width'] = image_size[0] + data['image_height'] = image_size[1] + + data = test_pipeline(data) + batch_data.append(data) + + batch_data = collate(batch_data, samples_per_gpu=len(batch_data)) + batch_data = scatter(batch_data, target_gpus=[device])[0] + + with torch.no_grad(): + result = model( + input=batch_data['input'], + metas=batch_data['metas'], + return_loss=False) + + poses_3d = result['preds'] + if poses_3d.shape[-1] != 4: + assert poses_3d.shape[-1] == 3 + dummy_score = np.ones( + poses_3d.shape[:-1] + (1, ), dtype=poses_3d.dtype) + poses_3d = np.concatenate((poses_3d, dummy_score), axis=-1) + pose_results = [] + for pose_2d, pose_3d in zip(pose_sequences_2d, poses_3d): + pose_result = pose_2d.copy() + pose_result['keypoints_3d'] = pose_3d + pose_results.append(pose_result) + + return pose_results + + +def vis_3d_pose_result(model, + result, + img=None, + dataset='Body3DH36MDataset', + dataset_info=None, + kpt_score_thr=0.3, + radius=8, + thickness=2, + num_instances=-1, + show=False, + out_file=None): + """Visualize the 3D pose estimation results. + + Args: + model (nn.Module): The loaded model. 
+ result (list[dict]) + """ + + if dataset_info is not None: + skeleton = dataset_info.skeleton + pose_kpt_color = dataset_info.pose_kpt_color + pose_link_color = dataset_info.pose_link_color + else: + warnings.warn( + 'dataset is deprecated.' + 'Please set `dataset_info` in the config.' + 'Check https://github.com/open-mmlab/mmpose/pull/663 for details.', + DeprecationWarning) + # TODO: These will be removed in the later versions. + palette = np.array([[255, 128, 0], [255, 153, 51], [255, 178, 102], + [230, 230, 0], [255, 153, 255], [153, 204, 255], + [255, 102, 255], [255, 51, 255], [102, 178, 255], + [51, 153, 255], [255, 153, 153], [255, 102, 102], + [255, 51, 51], [153, 255, 153], [102, 255, 102], + [51, 255, 51], [0, 255, 0], [0, 0, 255], + [255, 0, 0], [255, 255, 255]]) + + if dataset == 'Body3DH36MDataset': + skeleton = [[0, 1], [1, 2], [2, 3], [0, 4], [4, 5], [5, 6], [0, 7], + [7, 8], [8, 9], [9, 10], [8, 11], [11, 12], [12, 13], + [8, 14], [14, 15], [15, 16]] + + pose_kpt_color = palette[[ + 9, 0, 0, 0, 16, 16, 16, 9, 9, 9, 9, 16, 16, 16, 0, 0, 0 + ]] + pose_link_color = palette[[ + 0, 0, 0, 16, 16, 16, 9, 9, 9, 9, 16, 16, 16, 0, 0, 0 + ]] + + elif dataset == 'InterHand3DDataset': + skeleton = [[0, 1], [1, 2], [2, 3], [3, 20], [4, 5], [5, 6], + [6, 7], [7, 20], [8, 9], [9, 10], [10, 11], [11, 20], + [12, 13], [13, 14], [14, 15], [15, 20], [16, 17], + [17, 18], [18, 19], [19, 20], [21, 22], [22, 23], + [23, 24], [24, 41], [25, 26], [26, 27], [27, 28], + [28, 41], [29, 30], [30, 31], [31, 32], [32, 41], + [33, 34], [34, 35], [35, 36], [36, 41], [37, 38], + [38, 39], [39, 40], [40, 41]] + + pose_kpt_color = [[14, 128, 250], [14, 128, 250], [14, 128, 250], + [14, 128, 250], [80, 127, 255], [80, 127, 255], + [80, 127, 255], [80, 127, 255], [71, 99, 255], + [71, 99, 255], [71, 99, 255], [71, 99, 255], + [0, 36, 255], [0, 36, 255], [0, 36, 255], + [0, 36, 255], [0, 0, 230], [0, 0, 230], + [0, 0, 230], [0, 0, 230], [0, 0, 139], + [237, 149, 100], [237, 149, 100], + [237, 149, 100], [237, 149, 100], [230, 128, 77], + [230, 128, 77], [230, 128, 77], [230, 128, 77], + [255, 144, 30], [255, 144, 30], [255, 144, 30], + [255, 144, 30], [153, 51, 0], [153, 51, 0], + [153, 51, 0], [153, 51, 0], [255, 51, 13], + [255, 51, 13], [255, 51, 13], [255, 51, 13], + [103, 37, 8]] + + pose_link_color = [[14, 128, 250], [14, 128, 250], [14, 128, 250], + [14, 128, 250], [80, 127, 255], [80, 127, 255], + [80, 127, 255], [80, 127, 255], [71, 99, 255], + [71, 99, 255], [71, 99, 255], [71, 99, 255], + [0, 36, 255], [0, 36, 255], [0, 36, 255], + [0, 36, 255], [0, 0, 230], [0, 0, 230], + [0, 0, 230], [0, 0, 230], [237, 149, 100], + [237, 149, 100], [237, 149, 100], + [237, 149, 100], [230, 128, 77], [230, 128, 77], + [230, 128, 77], [230, 128, 77], [255, 144, 30], + [255, 144, 30], [255, 144, 30], [255, 144, 30], + [153, 51, 0], [153, 51, 0], [153, 51, 0], + [153, 51, 0], [255, 51, 13], [255, 51, 13], + [255, 51, 13], [255, 51, 13]] + else: + raise NotImplementedError + + if hasattr(model, 'module'): + model = model.module + + img = model.show_result( + result, + img, + skeleton, + radius=radius, + thickness=thickness, + pose_kpt_color=pose_kpt_color, + pose_link_color=pose_link_color, + num_instances=num_instances, + show=show, + out_file=out_file) + + return img + + +def inference_interhand_3d_model(model, + img_or_path, + det_results, + bbox_thr=None, + format='xywh', + dataset='InterHand3DDataset'): + """Inference a single image with a list of hand bounding boxes. 
+ + Note: + - num_bboxes: N + - num_keypoints: K + + Args: + model (nn.Module): The loaded pose model. + img_or_path (str | np.ndarray): Image filename or loaded image. + det_results (list[dict]): The 2D bbox sequences stored in a list. + Each each element of the list is the bbox of one person, whose + shape is (ndarray[4 or 5]), containing 4 box coordinates + (and score). + dataset (str): Dataset name. + format: bbox format ('xyxy' | 'xywh'). Default: 'xywh'. + 'xyxy' means (left, top, right, bottom), + 'xywh' means (left, top, width, height). + + Returns: + list[dict]: 3D pose inference results. Each element is the result \ + of an instance, which contains the predicted 3D keypoints with \ + shape (ndarray[K,3]). If there is no valid instance, an \ + empty list will be returned. + """ + + assert format in ['xyxy', 'xywh'] + + pose_results = [] + + if len(det_results) == 0: + return pose_results + + # Change for-loop preprocess each bbox to preprocess all bboxes at once. + bboxes = np.array([box['bbox'] for box in det_results]) + + # Select bboxes by score threshold + if bbox_thr is not None: + assert bboxes.shape[1] == 5 + valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0] + bboxes = bboxes[valid_idx] + det_results = [det_results[i] for i in valid_idx] + + if format == 'xyxy': + bboxes_xyxy = bboxes + bboxes_xywh = _xyxy2xywh(bboxes) + else: + # format is already 'xywh' + bboxes_xywh = bboxes + bboxes_xyxy = _xywh2xyxy(bboxes) + + # if bbox_thr remove all bounding box + if len(bboxes_xywh) == 0: + return [] + + cfg = model.cfg + device = next(model.parameters()).device + if device.type == 'cpu': + device = -1 + + # build the data pipeline + test_pipeline = Compose(cfg.test_pipeline) + + assert len(bboxes[0]) in [4, 5] + + if dataset == 'InterHand3DDataset': + flip_pairs = [[i, 21 + i] for i in range(21)] + else: + raise NotImplementedError() + + batch_data = [] + for bbox in bboxes: + center, scale = _box2cs(cfg, bbox) + + # prepare data + data = { + 'center': + center, + 'scale': + scale, + 'bbox_score': + bbox[4] if len(bbox) == 5 else 1, + 'bbox_id': + 0, # need to be assigned if batch_size > 1 + 'dataset': + dataset, + 'joints_3d': + np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32), + 'joints_3d_visible': + np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32), + 'rotation': + 0, + 'ann_info': { + 'image_size': np.array(cfg.data_cfg['image_size']), + 'num_joints': cfg.data_cfg['num_joints'], + 'flip_pairs': flip_pairs, + 'heatmap3d_depth_bound': cfg.data_cfg['heatmap3d_depth_bound'], + 'heatmap_size_root': cfg.data_cfg['heatmap_size_root'], + 'root_depth_bound': cfg.data_cfg['root_depth_bound'] + } + } + + if isinstance(img_or_path, np.ndarray): + data['img'] = img_or_path + else: + data['image_file'] = img_or_path + + data = test_pipeline(data) + batch_data.append(data) + + batch_data = collate(batch_data, samples_per_gpu=len(batch_data)) + batch_data = scatter(batch_data, [device])[0] + + # forward the model + with torch.no_grad(): + result = model( + img=batch_data['img'], + img_metas=batch_data['img_metas'], + return_loss=False) + + poses_3d = result['preds'] + rel_root_depth = result['rel_root_depth'] + hand_type = result['hand_type'] + if poses_3d.shape[-1] != 4: + assert poses_3d.shape[-1] == 3 + dummy_score = np.ones( + poses_3d.shape[:-1] + (1, ), dtype=poses_3d.dtype) + poses_3d = np.concatenate((poses_3d, dummy_score), axis=-1) + + # add relative root depth to left hand joints + poses_3d[:, 21:, 2] += rel_root_depth + + # set joint scores according to hand type + 
poses_3d[:, :21, 3] *= hand_type[:, [0]] + poses_3d[:, 21:, 3] *= hand_type[:, [1]] + + pose_results = [] + for pose_3d, person_res, bbox_xyxy in zip(poses_3d, det_results, + bboxes_xyxy): + pose_res = person_res.copy() + pose_res['keypoints_3d'] = pose_3d + pose_res['bbox'] = bbox_xyxy + pose_results.append(pose_res) + + return pose_results + + +def inference_mesh_model(model, + img_or_path, + det_results, + bbox_thr=None, + format='xywh', + dataset='MeshH36MDataset'): + """Inference a single image with a list of bounding boxes. + + Note: + - num_bboxes: N + - num_keypoints: K + - num_vertices: V + - num_faces: F + + Args: + model (nn.Module): The loaded pose model. + img_or_path (str | np.ndarray): Image filename or loaded image. + det_results (list[dict]): The 2D bbox sequences stored in a list. + Each element of the list is the bbox of one person. + "bbox" (ndarray[4 or 5]): The person bounding box, + which contains 4 box coordinates (and score). + bbox_thr (float | None): Threshold for bounding boxes. + Only bboxes with higher scores will be fed into the pose + detector. If bbox_thr is None, all boxes will be used. + format (str): bbox format ('xyxy' | 'xywh'). Default: 'xywh'. + + - 'xyxy' means (left, top, right, bottom), + - 'xywh' means (left, top, width, height). + dataset (str): Dataset name. + + Returns: + list[dict]: 3D pose inference results. Each element \ + is the result of an instance, which contains: + + - 'bbox' (ndarray[4]): instance bounding bbox + - 'center' (ndarray[2]): bbox center + - 'scale' (ndarray[2]): bbox scale + - 'keypoints_3d' (ndarray[K,3]): predicted 3D keypoints + - 'camera' (ndarray[3]): camera parameters + - 'vertices' (ndarray[V, 3]): predicted 3D vertices + - 'faces' (ndarray[F, 3]): mesh faces + + If there is no valid instance, an empty list + will be returned. + """ + + assert format in ['xyxy', 'xywh'] + + pose_results = [] + + if len(det_results) == 0: + return pose_results + + # Change for-loop preprocess each bbox to preprocess all bboxes at once. 
+ bboxes = np.array([box['bbox'] for box in det_results]) + + # Select bboxes by score threshold + if bbox_thr is not None: + assert bboxes.shape[1] == 5 + valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0] + bboxes = bboxes[valid_idx] + det_results = [det_results[i] for i in valid_idx] + + if format == 'xyxy': + bboxes_xyxy = bboxes + bboxes_xywh = _xyxy2xywh(bboxes) + else: + # format is already 'xywh' + bboxes_xywh = bboxes + bboxes_xyxy = _xywh2xyxy(bboxes) + + # if bbox_thr remove all bounding box + if len(bboxes_xywh) == 0: + return [] + + cfg = model.cfg + device = next(model.parameters()).device + if device.type == 'cpu': + device = -1 + + # build the data pipeline + test_pipeline = Compose(cfg.test_pipeline) + + assert len(bboxes[0]) in [4, 5] + + if dataset == 'MeshH36MDataset': + flip_pairs = [[0, 5], [1, 4], [2, 3], [6, 11], [7, 10], [8, 9], + [20, 21], [22, 23]] + else: + raise NotImplementedError() + + batch_data = [] + for bbox in bboxes: + center, scale = _box2cs(cfg, bbox) + + # prepare data + data = { + 'image_file': + img_or_path, + 'center': + center, + 'scale': + scale, + 'rotation': + 0, + 'bbox_score': + bbox[4] if len(bbox) == 5 else 1, + 'dataset': + dataset, + 'joints_2d': + np.zeros((cfg.data_cfg.num_joints, 2), dtype=np.float32), + 'joints_2d_visible': + np.zeros((cfg.data_cfg.num_joints, 1), dtype=np.float32), + 'joints_3d': + np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32), + 'joints_3d_visible': + np.zeros((cfg.data_cfg.num_joints, 3), dtype=np.float32), + 'pose': + np.zeros(72, dtype=np.float32), + 'beta': + np.zeros(10, dtype=np.float32), + 'has_smpl': + 0, + 'ann_info': { + 'image_size': np.array(cfg.data_cfg['image_size']), + 'num_joints': cfg.data_cfg['num_joints'], + 'flip_pairs': flip_pairs, + } + } + + data = test_pipeline(data) + batch_data.append(data) + + batch_data = collate(batch_data, samples_per_gpu=len(batch_data)) + batch_data = scatter(batch_data, target_gpus=[device])[0] + + # forward the model + with torch.no_grad(): + preds = model( + img=batch_data['img'], + img_metas=batch_data['img_metas'], + return_loss=False, + return_vertices=True, + return_faces=True) + + for idx in range(len(det_results)): + pose_res = det_results[idx].copy() + pose_res['bbox'] = bboxes_xyxy[idx] + pose_res['center'] = batch_data['img_metas'][idx]['center'] + pose_res['scale'] = batch_data['img_metas'][idx]['scale'] + pose_res['keypoints_3d'] = preds['keypoints_3d'][idx] + pose_res['camera'] = preds['camera'][idx] + pose_res['vertices'] = preds['vertices'][idx] + pose_res['faces'] = preds['faces'] + pose_results.append(pose_res) + return pose_results + + +def vis_3d_mesh_result(model, result, img=None, show=False, out_file=None): + """Visualize the 3D mesh estimation results. + + Args: + model (nn.Module): The loaded model. + result (list[dict]): 3D mesh estimation results. + """ + if hasattr(model, 'module'): + model = model.module + + img = model.show_result(result, img, show=show, out_file=out_file) + + return img diff --git a/mmpose/apis/inference_tracking.py b/mmpose/apis/inference_tracking.py new file mode 100644 index 0000000..9494fba --- /dev/null +++ b/mmpose/apis/inference_tracking.py @@ -0,0 +1,347 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np + +from mmpose.core import OneEuroFilter, oks_iou + + +def _compute_iou(bboxA, bboxB): + """Compute the Intersection over Union (IoU) between two boxes . + + Args: + bboxA (list): The first bbox info (left, top, right, bottom, score). 
+ bboxB (list): The second bbox info (left, top, right, bottom, score). + + Returns: + float: The IoU value. + """ + + x1 = max(bboxA[0], bboxB[0]) + y1 = max(bboxA[1], bboxB[1]) + x2 = min(bboxA[2], bboxB[2]) + y2 = min(bboxA[3], bboxB[3]) + + inter_area = max(0, x2 - x1) * max(0, y2 - y1) + + bboxA_area = (bboxA[2] - bboxA[0]) * (bboxA[3] - bboxA[1]) + bboxB_area = (bboxB[2] - bboxB[0]) * (bboxB[3] - bboxB[1]) + union_area = float(bboxA_area + bboxB_area - inter_area) + if union_area == 0: + union_area = 1e-5 + warnings.warn('union_area=0 is unexpected') + + iou = inter_area / union_area + + return iou + + +def _track_by_iou(res, results_last, thr): + """Get track id using IoU tracking greedily. + + Args: + res (dict): The bbox & pose results of the person instance. + results_last (list[dict]): The bbox & pose & track_id info of the + last frame (bbox_result, pose_result, track_id). + thr (float): The threshold for iou tracking. + + Returns: + int: The track id for the new person instance. + list[dict]: The bbox & pose & track_id info of the persons + that have not been matched on the last frame. + dict: The matched person instance on the last frame. + """ + + bbox = list(res['bbox']) + + max_iou_score = -1 + max_index = -1 + match_result = {} + for index, res_last in enumerate(results_last): + bbox_last = list(res_last['bbox']) + + iou_score = _compute_iou(bbox, bbox_last) + if iou_score > max_iou_score: + max_iou_score = iou_score + max_index = index + + if max_iou_score > thr: + track_id = results_last[max_index]['track_id'] + match_result = results_last[max_index] + del results_last[max_index] + else: + track_id = -1 + + return track_id, results_last, match_result + + +def _track_by_oks(res, results_last, thr): + """Get track id using OKS tracking greedily. + + Args: + res (dict): The pose results of the person instance. + results_last (list[dict]): The pose & track_id info of the + last frame (pose_result, track_id). + thr (float): The threshold for oks tracking. + + Returns: + int: The track id for the new person instance. + list[dict]: The pose & track_id info of the persons + that have not been matched on the last frame. + dict: The matched person instance on the last frame. + """ + pose = res['keypoints'].reshape((-1)) + area = res['area'] + max_index = -1 + match_result = {} + + if len(results_last) == 0: + return -1, results_last, match_result + + pose_last = np.array( + [res_last['keypoints'].reshape((-1)) for res_last in results_last]) + area_last = np.array([res_last['area'] for res_last in results_last]) + + oks_score = oks_iou(pose, pose_last, area, area_last) + + max_index = np.argmax(oks_score) + + if oks_score[max_index] > thr: + track_id = results_last[max_index]['track_id'] + match_result = results_last[max_index] + del results_last[max_index] + else: + track_id = -1 + + return track_id, results_last, match_result + + +def _get_area(results): + """Get bbox for each person instance on the current frame. + + Args: + results (list[dict]): The pose results of the current frame + (pose_result). + Returns: + list[dict]: The bbox & pose info of the current frame + (bbox_result, pose_result, area). 
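+
+    Example (illustrative sketch with toy values)::
+
+        results = [{'bbox': np.array([10., 20., 110., 220., 0.9]),
+                    'keypoints': np.zeros((17, 3))}]
+        results = _get_area(results)
+        # results[0]['area'] == (110 - 10) * (220 - 20) == 20000.0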
+    """
+    for result in results:
+        if 'bbox' in result:
+            result['area'] = ((result['bbox'][2] - result['bbox'][0]) *
+                              (result['bbox'][3] - result['bbox'][1]))
+        else:
+            xmin = np.min(
+                result['keypoints'][:, 0][result['keypoints'][:, 0] > 0],
+                initial=1e10)
+            xmax = np.max(result['keypoints'][:, 0])
+            ymin = np.min(
+                result['keypoints'][:, 1][result['keypoints'][:, 1] > 0],
+                initial=1e10)
+            ymax = np.max(result['keypoints'][:, 1])
+            result['area'] = (xmax - xmin) * (ymax - ymin)
+            result['bbox'] = np.array([xmin, ymin, xmax, ymax])
+    return results
+
+
+def _temporal_refine(result, match_result, fps=None):
+    """Refine keypoints using the tracked person instance on the last frame.
+
+    Args:
+        result (dict): The pose result of the current frame
+            (pose_result).
+        match_result (dict): The matched pose result of the last frame
+            (match_result).
+    Returns:
+        (array): The refined person keypoints.
+    """
+    if 'one_euro' in match_result:
+        result['keypoints'][:, :2] = match_result['one_euro'](
+            result['keypoints'][:, :2])
+        result['one_euro'] = match_result['one_euro']
+    else:
+        result['one_euro'] = OneEuroFilter(result['keypoints'][:, :2], fps=fps)
+    return result['keypoints']
+
+
+def get_track_id(results,
+                 results_last,
+                 next_id,
+                 min_keypoints=3,
+                 use_oks=False,
+                 tracking_thr=0.3,
+                 use_one_euro=False,
+                 fps=None):
+    """Get track id for each person instance on the current frame.
+
+    Args:
+        results (list[dict]): The bbox & pose results of the current frame
+            (bbox_result, pose_result).
+        results_last (list[dict]): The bbox & pose & track_id info of the
+            last frame (bbox_result, pose_result, track_id).
+        next_id (int): The track id for the new person instance.
+        min_keypoints (int): Minimum number of keypoints recognized as
+            a person. Default: 3.
+        use_oks (bool): Flag to use OKS tracking. Default: False.
+        tracking_thr (float): The threshold for tracking.
+        use_one_euro (bool): Option to use the one-euro filter.
+            Default: False.
+        fps (optional): Frame rate of the video input, used to set the
+            ``d_cutoff`` parameter of the one-euro filter.
+
+    Returns:
+        tuple:
+        - results (list[dict]): The bbox & pose & track_id info of the \
+            current frame (bbox_result, pose_result, track_id).
+        - next_id (int): The track id for the new person instance.
+    """
+    results = _get_area(results)
+
+    if use_oks:
+        _track = _track_by_oks
+    else:
+        _track = _track_by_iou
+
+    for result in results:
+        track_id, results_last, match_result = _track(result, results_last,
+                                                      tracking_thr)
+        if track_id == -1:
+            if np.count_nonzero(result['keypoints'][:, 1]) > min_keypoints:
+                result['track_id'] = next_id
+                next_id += 1
+            else:
+                # If the number of keypoints detected is small,
+                # delete that person instance.
+                result['keypoints'][:, 1] = -10
+                result['bbox'] *= 0
+                result['track_id'] = -1
+        else:
+            result['track_id'] = track_id
+        if use_one_euro:
+            result['keypoints'] = _temporal_refine(
+                result, match_result, fps=fps)
+        del match_result
+
+    return results, next_id
+
+
+def vis_pose_tracking_result(model,
+                             img,
+                             result,
+                             radius=4,
+                             thickness=1,
+                             kpt_score_thr=0.3,
+                             dataset='TopDownCocoDataset',
+                             dataset_info=None,
+                             show=False,
+                             out_file=None):
+    """Visualize the pose tracking results on the image.
+
+    Args:
+        model (nn.Module): The loaded detector.
+        img (str | np.ndarray): Image filename or loaded image.
+        result (list[dict]): The results to draw over `img`
+            (bbox_result, pose_result).
+        radius (int): Radius of circles.
+        thickness (int): Thickness of lines.
+        kpt_score_thr (float): The threshold to visualize the keypoints.
+ skeleton (list[tuple]): Default None. + show (bool): Whether to show the image. Default True. + out_file (str|None): The filename of the output visualization image. + """ + if hasattr(model, 'module'): + model = model.module + + palette = np.array([[255, 128, 0], [255, 153, 51], [255, 178, 102], + [230, 230, 0], [255, 153, 255], [153, 204, 255], + [255, 102, 255], [255, 51, 255], [102, 178, 255], + [51, 153, 255], [255, 153, 153], [255, 102, 102], + [255, 51, 51], [153, 255, 153], [102, 255, 102], + [51, 255, 51], [0, 255, 0], [0, 0, 255], [255, 0, 0], + [255, 255, 255]]) + + if dataset_info is None and dataset is not None: + warnings.warn( + 'dataset is deprecated.' + 'Please set `dataset_info` in the config.' + 'Check https://github.com/open-mmlab/mmpose/pull/663 for details.', + DeprecationWarning) + # TODO: These will be removed in the later versions. + if dataset in ('TopDownCocoDataset', 'BottomUpCocoDataset', + 'TopDownOCHumanDataset'): + kpt_num = 17 + skeleton = [[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], + [5, 11], [6, 12], [5, 6], [5, 7], [6, 8], [7, 9], + [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], + [3, 5], [4, 6]] + + elif dataset == 'TopDownCocoWholeBodyDataset': + kpt_num = 133 + skeleton = [[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], + [5, 11], [6, 12], [5, 6], [5, 7], [6, 8], [7, 9], + [8, 10], [1, 2], [0, 1], [0, 2], + [1, 3], [2, 4], [3, 5], [4, 6], [15, 17], [15, 18], + [15, 19], [16, 20], [16, 21], [16, 22], [91, 92], + [92, 93], [93, 94], [94, 95], [91, 96], [96, 97], + [97, 98], [98, 99], [91, 100], [100, 101], [101, 102], + [102, 103], [91, 104], [104, 105], [105, 106], + [106, 107], [91, 108], [108, 109], [109, 110], + [110, 111], [112, 113], [113, 114], [114, 115], + [115, 116], [112, 117], [117, 118], [118, 119], + [119, 120], [112, 121], [121, 122], [122, 123], + [123, 124], [112, 125], [125, 126], [126, 127], + [127, 128], [112, 129], [129, 130], [130, 131], + [131, 132]] + radius = 1 + + elif dataset == 'TopDownAicDataset': + kpt_num = 14 + skeleton = [[2, 1], [1, 0], [0, 13], [13, 3], [3, 4], [4, 5], + [8, 7], [7, 6], [6, 9], [9, 10], [10, 11], [12, 13], + [0, 6], [3, 9]] + + elif dataset == 'TopDownMpiiDataset': + kpt_num = 16 + skeleton = [[0, 1], [1, 2], [2, 6], [6, 3], [3, 4], [4, 5], [6, 7], + [7, 8], [8, 9], [8, 12], [12, 11], [11, 10], [8, 13], + [13, 14], [14, 15]] + + elif dataset in ('OneHand10KDataset', 'FreiHandDataset', + 'PanopticDataset'): + kpt_num = 21 + skeleton = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], + [7, 8], [0, 9], [9, 10], [10, 11], [11, 12], [0, 13], + [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], + [18, 19], [19, 20]] + + elif dataset == 'InterHand2DDataset': + kpt_num = 21 + skeleton = [[0, 1], [1, 2], [2, 3], [4, 5], [5, 6], [6, 7], [8, 9], + [9, 10], [10, 11], [12, 13], [13, 14], [14, 15], + [16, 17], [17, 18], [18, 19], [3, 20], [7, 20], + [11, 20], [15, 20], [19, 20]] + + else: + raise NotImplementedError() + + elif dataset_info is not None: + kpt_num = dataset_info.keypoint_num + skeleton = dataset_info.skeleton + + for res in result: + track_id = res['track_id'] + bbox_color = palette[track_id % len(palette)] + pose_kpt_color = palette[[track_id % len(palette)] * kpt_num] + pose_link_color = palette[[track_id % len(palette)] * len(skeleton)] + img = model.show_result( + img, [res], + skeleton, + radius=radius, + thickness=thickness, + pose_kpt_color=pose_kpt_color, + pose_link_color=pose_link_color, + bbox_color=tuple(bbox_color.tolist()), + kpt_score_thr=kpt_score_thr, + show=show, + 
out_file=out_file) + + return img diff --git a/mmpose/apis/test.py b/mmpose/apis/test.py new file mode 100644 index 0000000..3843b5a --- /dev/null +++ b/mmpose/apis/test.py @@ -0,0 +1,191 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import pickle +import shutil +import tempfile + +import mmcv +import torch +import torch.distributed as dist +from mmcv.runner import get_dist_info + + +def single_gpu_test(model, data_loader): + """Test model with a single gpu. + + This method tests model with a single gpu and displays test progress bar. + + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + + + Returns: + list: The prediction results. + """ + + model.eval() + results = [] + dataset = data_loader.dataset + prog_bar = mmcv.ProgressBar(len(dataset)) + for data in data_loader: + with torch.no_grad(): + result = model(return_loss=False, **data) + results.append(result) + + # use the first key as main key to calculate the batch size + batch_size = len(next(iter(data.values()))) + for _ in range(batch_size): + prog_bar.update() + return results + + +def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + + Returns: + list: The prediction results. + """ + model.eval() + results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + for data in data_loader: + with torch.no_grad(): + result = model(return_loss=False, **data) + results.append(result) + + if rank == 0: + # use the first key as main key to calculate the batch size + batch_size = len(next(iter(data.values()))) + for _ in range(batch_size * world_size): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + results = collect_results_gpu(results, len(dataset)) + else: + results = collect_results_cpu(results, len(dataset), tmpdir) + return results + + +def collect_results_cpu(result_part, size, tmpdir=None): + """Collect results in cpu mode. + + It saves the results on different gpus to 'tmpdir' and collects + them by the rank 0 worker. + + Args: + result_part (list): Results to be collected + size (int): Result size. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. Default: None + + Returns: + list: Ordered results. 
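+
+    Example (illustrative sketch; mirrors how ``multi_gpu_test`` above calls
+        this helper)::
+
+        # every rank passes its own partial results; only rank 0 receives
+        # the merged, ordered list, all other ranks get None
+        ordered_results = collect_results_cpu(
+            result_part, size=len(dataset), tmpdir=None)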
+ """ + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + mmcv.mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # synchronizes all processes to make sure tmpdir exist + dist.barrier() + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + # synchronizes all processes for loading pickle file + dist.barrier() + # collect all parts + if rank != 0: + return None + + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(mmcv.load(part_file)) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + """Collect results in gpu mode. + + It encodes results to gpu tensors and use gpu communication for results + collection. + + Args: + result_part (list): Results to be collected + size (int): Result size. + + Returns: + list: Ordered results. + """ + + rank, world_size = get_dist_info() + # dump result part to tensor with pickle + part_tensor = torch.tensor( + bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') + # gather all result part tensor shape + shape_tensor = torch.tensor(part_tensor.shape, device='cuda') + shape_list = [shape_tensor.clone() for _ in range(world_size)] + dist.all_gather(shape_list, shape_tensor) + # padding result part tensor to max length + shape_max = torch.tensor(shape_list).max() + part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') + part_send[:shape_tensor[0]] = part_tensor + part_recv_list = [ + part_tensor.new_zeros(shape_max) for _ in range(world_size) + ] + # gather all result part + dist.all_gather(part_recv_list, part_send) + + if rank == 0: + part_list = [] + for recv, shape in zip(part_recv_list, shape_list): + part_list.append( + pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + return ordered_results + return None diff --git a/mmpose/apis/train.py b/mmpose/apis/train.py new file mode 100644 index 0000000..7c31f8b --- /dev/null +++ b/mmpose/apis/train.py @@ -0,0 +1,200 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import warnings + +import mmcv +import numpy as np +import torch +import torch.distributed as dist +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import (DistSamplerSeedHook, EpochBasedRunner, OptimizerHook, + get_dist_info) +from mmcv.utils import digit_version + +from mmpose.core import DistEvalHook, EvalHook, build_optimizers +from mmpose.core.distributed_wrapper import DistributedDataParallelWrapper +from mmpose.datasets import build_dataloader, build_dataset +from mmpose.utils import get_root_logger + +try: + from mmcv.runner import Fp16OptimizerHook +except ImportError: + warnings.warn( + 'Fp16OptimizerHook from mmpose will be deprecated from ' + 'v0.15.0. Please install mmcv>=1.1.4', DeprecationWarning) + from mmpose.core import Fp16OptimizerHook + + +def init_random_seed(seed=None, device='cuda'): + """Initialize random seed. + + If the seed is not set, the seed will be automatically randomized, + and then broadcast to all processes to prevent some potential bugs. + + Args: + seed (int, Optional): The seed. Default to None. + device (str): The device where the seed will be put on. + Default to 'cuda'. + + Returns: + int: Seed to be used. + """ + if seed is not None: + return seed + + # Make sure all ranks share the same random seed to prevent + # some potential bugs. Please refer to + # https://github.com/open-mmlab/mmdetection/issues/6339 + rank, world_size = get_dist_info() + seed = np.random.randint(2**31) + if world_size == 1: + return seed + + if rank == 0: + random_num = torch.tensor(seed, dtype=torch.int32, device=device) + else: + random_num = torch.tensor(0, dtype=torch.int32, device=device) + dist.broadcast(random_num, src=0) + return random_num.item() + + +def train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + """Train model entry function. + + Args: + model (nn.Module): The model to be trained. + dataset (Dataset): Train dataset. + cfg (dict): The config dict for training. + distributed (bool): Whether to use distributed training. + Default: False. + validate (bool): Whether to do evaluation. Default: False. + timestamp (str | None): Local time for runner. Default: None. + meta (dict | None): Meta dict to record some important information. 
+ Default: None + """ + logger = get_root_logger(cfg.log_level) + + # prepare data loaders + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + # step 1: give default values and override (if exist) from cfg.data + loader_cfg = { + **dict( + seed=cfg.get('seed'), + drop_last=False, + dist=distributed, + num_gpus=len(cfg.gpu_ids)), + **({} if torch.__version__ != 'parrots' else dict( + prefetch_num=2, + pin_memory=False, + )), + **dict((k, cfg.data[k]) for k in [ + 'samples_per_gpu', + 'workers_per_gpu', + 'shuffle', + 'seed', + 'drop_last', + 'prefetch_num', + 'pin_memory', + 'persistent_workers', + ] if k in cfg.data) + } + + # step 2: cfg.data.train_dataloader has highest priority + train_loader_cfg = dict(loader_cfg, **cfg.data.get('train_dataloader', {})) + + data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset] + + # determine whether use adversarial training precess or not + use_adverserial_train = cfg.get('use_adversarial_train', False) + + # put model on gpus + if distributed: + find_unused_parameters = cfg.get('find_unused_parameters', False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + + if use_adverserial_train: + # Use DistributedDataParallelWrapper for adversarial training + model = DistributedDataParallelWrapper( + model, + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + if digit_version(mmcv.__version__) >= digit_version( + '1.4.4') or torch.cuda.is_available(): + model = MMDataParallel(model, device_ids=cfg.gpu_ids) + else: + warnings.warn( + 'We recommend to use MMCV >= 1.4.4 for CPU training. ' + 'See https://github.com/open-mmlab/mmpose/pull/1157 for ' + 'details.') + + # build runner + optimizer = build_optimizers(model, cfg.optimizer) + + runner = EpochBasedRunner( + model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta) + # an ugly workaround to make .log and .log.json filenames the same + runner.timestamp = timestamp + + if use_adverserial_train: + # The optimizer step process is included in the train_step function + # of the model, so the runner should NOT include optimizer hook. 
+ optimizer_config = None + else: + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + **cfg.optimizer_config, **fp16_cfg, distributed=distributed) + elif distributed and 'type' not in cfg.optimizer_config: + optimizer_config = OptimizerHook(**cfg.optimizer_config) + else: + optimizer_config = cfg.optimizer_config + + # register hooks + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + if distributed: + runner.register_hook(DistSamplerSeedHook()) + + # register eval hooks + if validate: + eval_cfg = cfg.get('evaluation', {}) + val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) + dataloader_setting = dict( + samples_per_gpu=1, + workers_per_gpu=cfg.data.get('workers_per_gpu', 1), + # cfg.gpus will be ignored if distributed + num_gpus=len(cfg.gpu_ids), + dist=distributed, + drop_last=False, + shuffle=False) + dataloader_setting = dict(dataloader_setting, + **cfg.data.get('val_dataloader', {})) + val_dataloader = build_dataloader(val_dataset, **dataloader_setting) + eval_hook = DistEvalHook if distributed else EvalHook + runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow, cfg.total_epochs) diff --git a/mmpose/core/__init__.py b/mmpose/core/__init__.py new file mode 100644 index 0000000..66185b7 --- /dev/null +++ b/mmpose/core/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .camera import * # noqa: F401, F403 +from .evaluation import * # noqa: F401, F403 +from .fp16 import * # noqa: F401, F403 +from .optimizer import * # noqa: F401, F403 +from .post_processing import * # noqa: F401, F403 +from .utils import * # noqa: F401, F403 +from .visualization import * # noqa: F401, F403 diff --git a/mmpose/core/camera/__init__.py b/mmpose/core/camera/__init__.py new file mode 100644 index 0000000..a4a3c55 --- /dev/null +++ b/mmpose/core/camera/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .camera_base import CAMERAS +from .single_camera import SimpleCamera +from .single_camera_torch import SimpleCameraTorch + +__all__ = ['CAMERAS', 'SimpleCamera', 'SimpleCameraTorch'] diff --git a/mmpose/core/camera/camera_base.py b/mmpose/core/camera/camera_base.py new file mode 100644 index 0000000..28b23e7 --- /dev/null +++ b/mmpose/core/camera/camera_base.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + +from mmcv.utils import Registry + +CAMERAS = Registry('camera') + + +class SingleCameraBase(metaclass=ABCMeta): + """Base class for single camera model. 
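For orientation, the `train_model` entry point defined in train.py above is normally driven by a thin training script that loads a config, builds the model and datasets, and hands everything over. A condensed sketch under assumed config fields; `build_posenet` is assumed to be the model builder exposed by `mmpose.models` in this code base:

import mmcv
from mmpose.datasets import build_dataset
from mmpose.models import build_posenet  # assumed model builder

cfg = mmcv.Config.fromfile('configs/pose/ViTPose_base_coco_256x192.py')
cfg.work_dir = 'work_dirs/vitpose_base'
cfg.gpu_ids = range(1)
cfg.seed = init_random_seed(cfg.get('seed'))

model = build_posenet(cfg.model)
datasets = [build_dataset(cfg.data.train)]
train_model(model, datasets, cfg, distributed=False, validate=True,
            meta=dict(seed=cfg.seed))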
+ + Args: + param (dict): Camera parameters + + Methods: + world_to_camera: Project points from world coordinates to camera + coordinates + camera_to_world: Project points from camera coordinates to world + coordinates + camera_to_pixel: Project points from camera coordinates to pixel + coordinates + world_to_pixel: Project points from world coordinates to pixel + coordinates + """ + + @abstractmethod + def __init__(self, param): + """Load camera parameters and check validity.""" + + def world_to_camera(self, X): + """Project points from world coordinates to camera coordinates.""" + raise NotImplementedError + + def camera_to_world(self, X): + """Project points from camera coordinates to world coordinates.""" + raise NotImplementedError + + def camera_to_pixel(self, X): + """Project points from camera coordinates to pixel coordinates.""" + raise NotImplementedError + + def world_to_pixel(self, X): + """Project points from world coordinates to pixel coordinates.""" + _X = self.world_to_camera(X) + return self.camera_to_pixel(_X) diff --git a/mmpose/core/camera/single_camera.py b/mmpose/core/camera/single_camera.py new file mode 100644 index 0000000..cabd799 --- /dev/null +++ b/mmpose/core/camera/single_camera.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + +from .camera_base import CAMERAS, SingleCameraBase + + +@CAMERAS.register_module() +class SimpleCamera(SingleCameraBase): + """Camera model to calculate coordinate transformation with given + intrinsic/extrinsic camera parameters. + + Note: + The keypoint coordinate should be an np.ndarray with a shape of + [...,J, C] where J is the keypoint number of an instance, and C is + the coordinate dimension. For example: + + [J, C]: shape of joint coordinates of a person with J joints. + [N, J, C]: shape of a batch of person joint coordinates. + [N, T, J, C]: shape of a batch of pose sequences. + + Args: + param (dict): camera parameters including: + - R: 3x3, camera rotation matrix (camera-to-world) + - T: 3x1, camera translation (camera-to-world) + - K: (optional) 2x3, camera intrinsic matrix + - k: (optional) nx1, camera radial distortion coefficients + - p: (optional) mx1, camera tangential distortion coefficients + - f: (optional) 2x1, camera focal length + - c: (optional) 2x1, camera center + if K is not provided, it will be calculated from f and c. + + Methods: + world_to_camera: Project points from world coordinates to camera + coordinates + camera_to_pixel: Project points from camera coordinates to pixel + coordinates + world_to_pixel: Project points from world coordinates to pixel + coordinates + """ + + def __init__(self, param): + + self.param = {} + # extrinsic param + R = np.array(param['R'], dtype=np.float32) + T = np.array(param['T'], dtype=np.float32) + assert R.shape == (3, 3) + assert T.shape == (3, 1) + # The camera matrices are transposed in advance because the joint + # coordinates are stored as row vectors. 
+ self.param['R_c2w'] = R.T + self.param['T_c2w'] = T.T + self.param['R_w2c'] = R + self.param['T_w2c'] = -self.param['T_c2w'] @ self.param['R_w2c'] + + # intrinsic param + if 'K' in param: + K = np.array(param['K'], dtype=np.float32) + assert K.shape == (2, 3) + self.param['K'] = K.T + self.param['f'] = np.array([K[0, 0], K[1, 1]])[:, np.newaxis] + self.param['c'] = np.array([K[0, 2], K[1, 2]])[:, np.newaxis] + elif 'f' in param and 'c' in param: + f = np.array(param['f'], dtype=np.float32) + c = np.array(param['c'], dtype=np.float32) + assert f.shape == (2, 1) + assert c.shape == (2, 1) + self.param['K'] = np.concatenate((np.diagflat(f), c), axis=-1).T + self.param['f'] = f + self.param['c'] = c + else: + raise ValueError('Camera intrinsic parameters are missing. ' + 'Either "K" or "f"&"c" should be provided.') + + # distortion param + if 'k' in param and 'p' in param: + self.undistortion = True + self.param['k'] = np.array(param['k'], dtype=np.float32).flatten() + self.param['p'] = np.array(param['p'], dtype=np.float32).flatten() + assert self.param['k'].size in {3, 6} + assert self.param['p'].size == 2 + else: + self.undistortion = False + + def world_to_camera(self, X): + assert isinstance(X, np.ndarray) + assert X.ndim >= 2 and X.shape[-1] == 3 + return X @ self.param['R_w2c'] + self.param['T_w2c'] + + def camera_to_world(self, X): + assert isinstance(X, np.ndarray) + assert X.ndim >= 2 and X.shape[-1] == 3 + return X @ self.param['R_c2w'] + self.param['T_c2w'] + + def camera_to_pixel(self, X): + assert isinstance(X, np.ndarray) + assert X.ndim >= 2 and X.shape[-1] == 3 + + _X = X / X[..., 2:] + + if self.undistortion: + k = self.param['k'] + p = self.param['p'] + _X_2d = _X[..., :2] + r2 = (_X_2d**2).sum(-1) + radial = 1 + sum(ki * r2**(i + 1) for i, ki in enumerate(k[:3])) + if k.size == 6: + radial /= 1 + sum( + (ki * r2**(i + 1) for i, ki in enumerate(k[3:]))) + + tangential = 2 * (p[1] * _X[..., 0] + p[0] * _X[..., 1]) + + _X[..., :2] = _X_2d * (radial + tangential)[..., None] + np.outer( + r2, p[::-1]).reshape(_X_2d.shape) + return _X @ self.param['K'] + + def pixel_to_camera(self, X): + assert isinstance(X, np.ndarray) + assert X.ndim >= 2 and X.shape[-1] == 3 + _X = X.copy() + _X[:, :2] = (X[:, :2] - self.param['c'].T) / self.param['f'].T * X[:, + [2]] + return _X diff --git a/mmpose/core/camera/single_camera_torch.py b/mmpose/core/camera/single_camera_torch.py new file mode 100644 index 0000000..22eb72f --- /dev/null +++ b/mmpose/core/camera/single_camera_torch.py @@ -0,0 +1,118 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from .camera_base import CAMERAS, SingleCameraBase + + +@CAMERAS.register_module() +class SimpleCameraTorch(SingleCameraBase): + """Camera model to calculate coordinate transformation with given + intrinsic/extrinsic camera parameters. + + Notes: + The keypoint coordinate should be an np.ndarray with a shape of + [...,J, C] where J is the keypoint number of an instance, and C is + the coordinate dimension. For example: + + [J, C]: shape of joint coordinates of a person with J joints. + [N, J, C]: shape of a batch of person joint coordinates. + [N, T, J, C]: shape of a batch of pose sequences. 
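A self-contained sketch of the projection chain implemented by SimpleCamera above, using made-up parameters (identity rotation, zero translation, 1000 px focal length, principal point at (512, 512)):

import numpy as np

param = dict(
    R=np.eye(3),                      # camera-to-world rotation
    T=np.zeros((3, 1)),               # camera-to-world translation
    f=np.array([[1000.], [1000.]]),   # focal lengths, shape (2, 1)
    c=np.array([[512.], [512.]]))     # principal point, shape (2, 1)
cam = SimpleCamera(param)

joints_world = np.random.rand(17, 3) + [0., 0., 3.]   # 17 joints in front of the camera
joints_cam = cam.world_to_camera(joints_world)        # [17, 3] camera coordinates
joints_uv = cam.camera_to_pixel(joints_cam)           # [17, 2] pixel coordinates
assert np.allclose(cam.world_to_pixel(joints_world), joints_uv)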
+ + Args: + param (dict): camera parameters including: + - R: 3x3, camera rotation matrix (camera-to-world) + - T: 3x1, camera translation (camera-to-world) + - K: (optional) 2x3, camera intrinsic matrix + - k: (optional) nx1, camera radial distortion coefficients + - p: (optional) mx1, camera tangential distortion coefficients + - f: (optional) 2x1, camera focal length + - c: (optional) 2x1, camera center + if K is not provided, it will be calculated from f and c. + + Methods: + world_to_camera: Project points from world coordinates to camera + coordinates + camera_to_pixel: Project points from camera coordinates to pixel + coordinates + world_to_pixel: Project points from world coordinates to pixel + coordinates + """ + + def __init__(self, param, device): + + self.param = {} + # extrinsic param + R = torch.tensor(param['R'], device=device) + T = torch.tensor(param['T'], device=device) + + assert R.shape == (3, 3) + assert T.shape == (3, 1) + # The camera matrices are transposed in advance because the joint + # coordinates are stored as row vectors. + self.param['R_c2w'] = R.T + self.param['T_c2w'] = T.T + self.param['R_w2c'] = R + self.param['T_w2c'] = -self.param['T_c2w'] @ self.param['R_w2c'] + + # intrinsic param + if 'K' in param: + K = torch.tensor(param['K'], device=device) + assert K.shape == (2, 3) + self.param['K'] = K.T + self.param['f'] = torch.tensor([[K[0, 0]], [K[1, 1]]], + device=device) + self.param['c'] = torch.tensor([[K[0, 2]], [K[1, 2]]], + device=device) + elif 'f' in param and 'c' in param: + f = torch.tensor(param['f'], device=device) + c = torch.tensor(param['c'], device=device) + assert f.shape == (2, 1) + assert c.shape == (2, 1) + self.param['K'] = torch.cat([torch.diagflat(f), c], dim=-1).T + self.param['f'] = f + self.param['c'] = c + else: + raise ValueError('Camera intrinsic parameters are missing. ' + 'Either "K" or "f"&"c" should be provided.') + + # distortion param + if 'k' in param and 'p' in param: + self.undistortion = True + self.param['k'] = torch.tensor(param['k'], device=device).view(-1) + self.param['p'] = torch.tensor(param['p'], device=device).view(-1) + assert len(self.param['k']) in {3, 6} + assert len(self.param['p']) == 2 + else: + self.undistortion = False + + def world_to_camera(self, X): + assert isinstance(X, torch.Tensor) + assert X.ndim >= 2 and X.shape[-1] == 3 + return X @ self.param['R_w2c'] + self.param['T_w2c'] + + def camera_to_world(self, X): + assert isinstance(X, torch.Tensor) + assert X.ndim >= 2 and X.shape[-1] == 3 + return X @ self.param['R_c2w'] + self.param['T_c2w'] + + def camera_to_pixel(self, X): + assert isinstance(X, torch.Tensor) + assert X.ndim >= 2 and X.shape[-1] == 3 + + _X = X / X[..., 2:] + + if self.undistortion: + k = self.param['k'] + p = self.param['p'] + _X_2d = _X[..., :2] + r2 = (_X_2d**2).sum(-1) + radial = 1 + sum(ki * r2**(i + 1) for i, ki in enumerate(k[:3])) + if k.size == 6: + radial /= 1 + sum( + (ki * r2**(i + 1) for i, ki in enumerate(k[3:]))) + + tangential = 2 * (p[1] * _X[..., 0] + p[0] * _X[..., 1]) + + _X[..., :2] = _X_2d * (radial + tangential)[..., None] + torch.ger( + r2, p.flip([0])).reshape(_X_2d.shape) + return _X @ self.param['K'] diff --git a/mmpose/core/distributed_wrapper.py b/mmpose/core/distributed_wrapper.py new file mode 100644 index 0000000..c67acee --- /dev/null +++ b/mmpose/core/distributed_wrapper.py @@ -0,0 +1,143 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +import torch.nn as nn +from mmcv.parallel import MODULE_WRAPPERS as MMCV_MODULE_WRAPPERS +from mmcv.parallel import MMDistributedDataParallel +from mmcv.parallel.scatter_gather import scatter_kwargs +from mmcv.utils import Registry +from torch.cuda._utils import _get_device_index + +MODULE_WRAPPERS = Registry('module wrapper', parent=MMCV_MODULE_WRAPPERS) + + +@MODULE_WRAPPERS.register_module() +class DistributedDataParallelWrapper(nn.Module): + """A DistributedDataParallel wrapper for models in 3D mesh estimation task. + + In 3D mesh estimation task, there is a need to wrap different modules in + the models with separate DistributedDataParallel. Otherwise, it will cause + errors for GAN training. + More specific, the GAN model, usually has two sub-modules: + generator and discriminator. If we wrap both of them in one + standard DistributedDataParallel, it will cause errors during training, + because when we update the parameters of the generator (or discriminator), + the parameters of the discriminator (or generator) is not updated, which is + not allowed for DistributedDataParallel. + So we design this wrapper to separately wrap DistributedDataParallel + for generator and discriminator. + + In this wrapper, we perform two operations: + 1. Wrap the modules in the models with separate MMDistributedDataParallel. + Note that only modules with parameters will be wrapped. + 2. Do scatter operation for 'forward', 'train_step' and 'val_step'. + + Note that the arguments of this wrapper is the same as those in + `torch.nn.parallel.distributed.DistributedDataParallel`. + + Args: + module (nn.Module): Module that needs to be wrapped. + device_ids (list[int | `torch.device`]): Same as that in + `torch.nn.parallel.distributed.DistributedDataParallel`. + dim (int, optional): Same as that in the official scatter function in + pytorch. Defaults to 0. + broadcast_buffers (bool): Same as that in + `torch.nn.parallel.distributed.DistributedDataParallel`. + Defaults to False. + find_unused_parameters (bool, optional): Same as that in + `torch.nn.parallel.distributed.DistributedDataParallel`. + Traverse the autograd graph of all tensors contained in returned + value of the wrapped module’s forward function. Defaults to False. + kwargs (dict): Other arguments used in + `torch.nn.parallel.distributed.DistributedDataParallel`. + """ + + def __init__(self, + module, + device_ids, + dim=0, + broadcast_buffers=False, + find_unused_parameters=False, + **kwargs): + super().__init__() + assert len(device_ids) == 1, ( + 'Currently, DistributedDataParallelWrapper only supports one' + 'single CUDA device for each process.' + f'The length of device_ids must be 1, but got {len(device_ids)}.') + self.module = module + self.dim = dim + self.to_ddp( + device_ids=device_ids, + dim=dim, + broadcast_buffers=broadcast_buffers, + find_unused_parameters=find_unused_parameters, + **kwargs) + self.output_device = _get_device_index(device_ids[0], True) + + def to_ddp(self, device_ids, dim, broadcast_buffers, + find_unused_parameters, **kwargs): + """Wrap models with separate MMDistributedDataParallel. + + It only wraps the modules with parameters. 
+ """ + for name, module in self.module._modules.items(): + if next(module.parameters(), None) is None: + module = module.cuda() + elif all(not p.requires_grad for p in module.parameters()): + module = module.cuda() + else: + module = MMDistributedDataParallel( + module.cuda(), + device_ids=device_ids, + dim=dim, + broadcast_buffers=broadcast_buffers, + find_unused_parameters=find_unused_parameters, + **kwargs) + self.module._modules[name] = module + + def scatter(self, inputs, kwargs, device_ids): + """Scatter function. + + Args: + inputs (Tensor): Input Tensor. + kwargs (dict): Args for + ``mmcv.parallel.scatter_gather.scatter_kwargs``. + device_ids (int): Device id. + """ + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) + + def forward(self, *inputs, **kwargs): + """Forward function. + + Args: + inputs (tuple): Input data. + kwargs (dict): Args for + ``mmcv.parallel.scatter_gather.scatter_kwargs``. + """ + inputs, kwargs = self.scatter(inputs, kwargs, + [torch.cuda.current_device()]) + return self.module(*inputs[0], **kwargs[0]) + + def train_step(self, *inputs, **kwargs): + """Train step function. + + Args: + inputs (Tensor): Input Tensor. + kwargs (dict): Args for + ``mmcv.parallel.scatter_gather.scatter_kwargs``. + """ + inputs, kwargs = self.scatter(inputs, kwargs, + [torch.cuda.current_device()]) + output = self.module.train_step(*inputs[0], **kwargs[0]) + return output + + def val_step(self, *inputs, **kwargs): + """Validation step function. + + Args: + inputs (tuple): Input data. + kwargs (dict): Args for ``scatter_kwargs``. + """ + inputs, kwargs = self.scatter(inputs, kwargs, + [torch.cuda.current_device()]) + output = self.module.val_step(*inputs[0], **kwargs[0]) + return output diff --git a/mmpose/core/evaluation/__init__.py b/mmpose/core/evaluation/__init__.py new file mode 100644 index 0000000..5f93784 --- /dev/null +++ b/mmpose/core/evaluation/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bottom_up_eval import (aggregate_scale, aggregate_stage_flip, + flip_feature_maps, get_group_preds, + split_ae_outputs) +from .eval_hooks import DistEvalHook, EvalHook +from .mesh_eval import compute_similarity_transform +from .pose3d_eval import keypoint_3d_auc, keypoint_3d_pck, keypoint_mpjpe +from .top_down_eval import (keypoint_auc, keypoint_epe, keypoint_pck_accuracy, + keypoints_from_heatmaps, keypoints_from_heatmaps3d, + keypoints_from_regression, + multilabel_classification_accuracy, + pose_pck_accuracy, post_dark_udp) + +__all__ = [ + 'EvalHook', 'DistEvalHook', 'pose_pck_accuracy', 'keypoints_from_heatmaps', + 'keypoints_from_regression', 'keypoint_pck_accuracy', 'keypoint_3d_pck', + 'keypoint_3d_auc', 'keypoint_auc', 'keypoint_epe', 'get_group_preds', + 'split_ae_outputs', 'flip_feature_maps', 'aggregate_stage_flip', + 'aggregate_scale', 'compute_similarity_transform', 'post_dark_udp', + 'keypoint_mpjpe', 'keypoints_from_heatmaps3d', + 'multilabel_classification_accuracy' +] diff --git a/mmpose/core/evaluation/bottom_up_eval.py b/mmpose/core/evaluation/bottom_up_eval.py new file mode 100644 index 0000000..7b37d7c --- /dev/null +++ b/mmpose/core/evaluation/bottom_up_eval.py @@ -0,0 +1,333 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmpose.core.post_processing import (get_warp_matrix, transform_preds, + warp_affine_joints) + + +def split_ae_outputs(outputs, num_joints, with_heatmaps, with_ae, + select_output_index): + """Split multi-stage outputs into heatmaps & tags. 
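A rough sketch of how the wrapper above is meant to be used for a two-branch (generator/discriminator style) model. It assumes torch.distributed has already been initialised for the current process (e.g. via mmcv's init_dist); the module and tensor names are made up for illustration:

import torch
import torch.nn as nn

class TwoBranchModel(nn.Module):
    """Toy model whose two sub-modules get wrapped independently."""

    def __init__(self):
        super().__init__()
        self.generator = nn.Linear(64, 64)
        self.discriminator = nn.Linear(64, 1)

    def train_step(self, data, optimizer=None):
        return {'loss': self.discriminator(self.generator(data)).mean()}

model = DistributedDataParallelWrapper(
    TwoBranchModel(),                          # each sub-module gets its own DDP
    device_ids=[torch.cuda.current_device()],
    broadcast_buffers=False,
    find_unused_parameters=False)
outputs = model.train_step(torch.randn(8, 64).cuda())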
+ + Args: + outputs (list(Tensor)): Outputs of network + num_joints (int): Number of joints + with_heatmaps (list[bool]): Option to output + heatmaps for different stages. + with_ae (list[bool]): Option to output + ae tags for different stages. + select_output_index (list[int]): Output keep the selected index + + Returns: + tuple: A tuple containing multi-stage outputs. + + - list[Tensor]: multi-stage heatmaps. + - list[Tensor]: multi-stage tags. + """ + + heatmaps = [] + tags = [] + + # aggregate heatmaps from different stages + for i, output in enumerate(outputs): + if i not in select_output_index: + continue + # staring index of the associative embeddings + offset_feat = num_joints if with_heatmaps[i] else 0 + if with_heatmaps[i]: + heatmaps.append(output[:, :num_joints]) + if with_ae[i]: + tags.append(output[:, offset_feat:]) + + return heatmaps, tags + + +def flip_feature_maps(feature_maps, flip_index=None): + """Flip the feature maps and swap the channels. + + Args: + feature_maps (list[Tensor]): Feature maps. + flip_index (list[int] | None): Channel-flip indexes. + If None, do not flip channels. + + Returns: + list[Tensor]: Flipped feature_maps. + """ + flipped_feature_maps = [] + for feature_map in feature_maps: + feature_map = torch.flip(feature_map, [3]) + if flip_index is not None: + flipped_feature_maps.append(feature_map[:, flip_index, :, :]) + else: + flipped_feature_maps.append(feature_map) + + return flipped_feature_maps + + +def _resize_average(feature_maps, align_corners, index=-1, resize_size=None): + """Resize the feature maps and compute the average. + + Args: + feature_maps (list[Tensor]): Feature maps. + align_corners (bool): Align corners when performing interpolation. + index (int): Only used when `resize_size' is None. + If `resize_size' is None, the target size is the size + of the indexed feature maps. + resize_size (list[int, int]): The target size [w, h]. + + Returns: + list[Tensor]: Averaged feature_maps. + """ + + if feature_maps is None: + return None + feature_maps_avg = 0 + + feature_map_list = _resize_concate( + feature_maps, align_corners, index=index, resize_size=resize_size) + for feature_map in feature_map_list: + feature_maps_avg += feature_map + + feature_maps_avg /= len(feature_map_list) + return [feature_maps_avg] + + +def _resize_unsqueeze_concat(feature_maps, + align_corners, + index=-1, + resize_size=None): + """Resize, unsqueeze and concatenate the feature_maps. + + Args: + feature_maps (list[Tensor]): Feature maps. + align_corners (bool): Align corners when performing interpolation. + index (int): Only used when `resize_size' is None. + If `resize_size' is None, the target size is the size + of the indexed feature maps. + resize_size (list[int, int]): The target size [w, h]. + + Returns: + list[Tensor]: Averaged feature_maps. + """ + if feature_maps is None: + return None + feature_map_list = _resize_concate( + feature_maps, align_corners, index=index, resize_size=resize_size) + + feat_dim = len(feature_map_list[0].shape) - 1 + output_feature_maps = torch.cat( + [torch.unsqueeze(fmap, dim=feat_dim + 1) for fmap in feature_map_list], + dim=feat_dim + 1) + return [output_feature_maps] + + +def _resize_concate(feature_maps, align_corners, index=-1, resize_size=None): + """Resize and concatenate the feature_maps. + + Args: + feature_maps (list[Tensor]): Feature maps. + align_corners (bool): Align corners when performing interpolation. + index (int): Only used when `resize_size' is None. 
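A small sketch of the splitting and flipping helpers above, applied to fake two-stage network outputs for 17 COCO keypoints (the flip index below is the usual COCO left/right swap and is an assumption of this example):

import torch

# Stage 0 predicts 17 heatmap channels followed by 17 tag channels;
# stage 1 predicts heatmaps only, at a higher resolution.
outputs = [torch.randn(1, 34, 128, 128), torch.randn(1, 17, 256, 256)]
heatmaps, tags = split_ae_outputs(
    outputs, num_joints=17,
    with_heatmaps=[True, True], with_ae=[True, False],
    select_output_index=[0, 1])
# heatmaps -> two tensors with 17 channels each; tags -> one tensor with 17 channels

# Swap left/right keypoint channels when horizontally flipping the maps.
flip_index = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
heatmaps_flipped = flip_feature_maps(heatmaps, flip_index=flip_index)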
+ If `resize_size' is None, the target size is the size + of the indexed feature maps. + resize_size (list[int, int]): The target size [w, h]. + + Returns: + list[Tensor]: Averaged feature_maps. + """ + if feature_maps is None: + return None + + feature_map_list = [] + + if index < 0: + index += len(feature_maps) + + if resize_size is None: + resize_size = (feature_maps[index].size(2), + feature_maps[index].size(3)) + + for feature_map in feature_maps: + ori_size = (feature_map.size(2), feature_map.size(3)) + if ori_size != resize_size: + feature_map = torch.nn.functional.interpolate( + feature_map, + size=resize_size, + mode='bilinear', + align_corners=align_corners) + + feature_map_list.append(feature_map) + + return feature_map_list + + +def aggregate_stage_flip(feature_maps, + feature_maps_flip, + index=-1, + project2image=True, + size_projected=None, + align_corners=False, + aggregate_stage='concat', + aggregate_flip='average'): + """Inference the model to get multi-stage outputs (heatmaps & tags), and + resize them to base sizes. + + Args: + feature_maps (list[Tensor]): feature_maps can be heatmaps, + tags, and pafs. + feature_maps_flip (list[Tensor] | None): flipped feature_maps. + feature maps can be heatmaps, tags, and pafs. + project2image (bool): Option to resize to base scale. + size_projected (list[int, int]): Base size of heatmaps [w, h]. + align_corners (bool): Align corners when performing interpolation. + aggregate_stage (str): Methods to aggregate multi-stage feature maps. + Options: 'concat', 'average'. Default: 'concat. + + - 'concat': Concatenate the original and the flipped feature maps. + - 'average': Get the average of the original and the flipped + feature maps. + aggregate_flip (str): Methods to aggregate the original and + the flipped feature maps. Options: 'concat', 'average', 'none'. + Default: 'average. + + - 'concat': Concatenate the original and the flipped feature maps. + - 'average': Get the average of the original and the flipped + feature maps.. + - 'none': no flipped feature maps. + + Returns: + list[Tensor]: Aggregated feature maps with shape [NxKxWxH]. 
+ """ + + if feature_maps_flip is None: + aggregate_flip = 'none' + + output_feature_maps = [] + + if aggregate_stage == 'average': + _aggregate_stage_func = _resize_average + elif aggregate_stage == 'concat': + _aggregate_stage_func = _resize_concate + else: + NotImplementedError() + + if project2image and size_projected: + _origin = _aggregate_stage_func( + feature_maps, + align_corners, + index=index, + resize_size=(size_projected[1], size_projected[0])) + + _flipped = _aggregate_stage_func( + feature_maps_flip, + align_corners, + index=index, + resize_size=(size_projected[1], size_projected[0])) + else: + _origin = _aggregate_stage_func( + feature_maps, align_corners, index=index, resize_size=None) + _flipped = _aggregate_stage_func( + feature_maps_flip, align_corners, index=index, resize_size=None) + + if aggregate_flip == 'average': + assert feature_maps_flip is not None + for _ori, _fli in zip(_origin, _flipped): + output_feature_maps.append((_ori + _fli) / 2.0) + + elif aggregate_flip == 'concat': + assert feature_maps_flip is not None + output_feature_maps.append(*_origin) + output_feature_maps.append(*_flipped) + + elif aggregate_flip == 'none': + if isinstance(_origin, list): + output_feature_maps.append(*_origin) + else: + output_feature_maps.append(_origin) + else: + NotImplementedError() + + return output_feature_maps + + +def aggregate_scale(feature_maps_list, + align_corners=False, + aggregate_scale='average'): + """Aggregate multi-scale outputs. + + Note: + batch size: N + keypoints num : K + heatmap width: W + heatmap height: H + + Args: + feature_maps_list (list[Tensor]): Aggregated feature maps. + project2image (bool): Option to resize to base scale. + align_corners (bool): Align corners when performing interpolation. + aggregate_scale (str): Methods to aggregate multi-scale feature maps. + Options: 'average', 'unsqueeze_concat'. + + - 'average': Get the average of the feature maps. + - 'unsqueeze_concat': Concatenate the feature maps along new axis. + Default: 'average. + + Returns: + Tensor: Aggregated feature maps. + """ + + if aggregate_scale == 'average': + output_feature_maps = _resize_average( + feature_maps_list, align_corners, index=0, resize_size=None) + + elif aggregate_scale == 'unsqueeze_concat': + output_feature_maps = _resize_unsqueeze_concat( + feature_maps_list, align_corners, index=0, resize_size=None) + else: + NotImplementedError() + + return output_feature_maps[0] + + +def get_group_preds(grouped_joints, + center, + scale, + heatmap_size, + use_udp=False): + """Transform the grouped joints back to the image. + + Args: + grouped_joints (list): Grouped person joints. + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + heatmap_size (np.ndarray[2, ]): Size of the destination heatmaps. + use_udp (bool): Unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR'2020). + + Returns: + list: List of the pose result for each person. 
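Continuing the flip-test idea, a hedged sketch of how the two aggregation helpers above are typically chained; the tensors, sizes and flip index are made up:

import torch

flip_index = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
heatmaps = [torch.randn(1, 17, 128, 128)]
heatmaps_flipped = flip_feature_maps(heatmaps, flip_index=flip_index)

# Average original and flipped predictions and project them to the
# network input resolution (w=512, h=512).
aggregated = aggregate_stage_flip(
    heatmaps, heatmaps_flipped,
    index=-1, project2image=True, size_projected=(512, 512),
    align_corners=False, aggregate_stage='average', aggregate_flip='average')

# If inference ran at several image scales, average the per-scale maps.
final_heatmaps = aggregate_scale(aggregated, align_corners=False,
                                 aggregate_scale='average')   # Tensor [1, 17, 512, 512]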
+ """ + if len(grouped_joints) == 0: + return [] + + if use_udp: + if grouped_joints[0].shape[0] > 0: + heatmap_size_t = np.array(heatmap_size, dtype=np.float32) - 1.0 + trans = get_warp_matrix( + theta=0, + size_input=heatmap_size_t, + size_dst=scale, + size_target=heatmap_size_t) + grouped_joints[0][..., :2] = \ + warp_affine_joints(grouped_joints[0][..., :2], trans) + results = [person for person in grouped_joints[0]] + else: + results = [] + for person in grouped_joints[0]: + joints = transform_preds(person, center, scale, heatmap_size) + results.append(joints) + + return results diff --git a/mmpose/core/evaluation/eval_hooks.py b/mmpose/core/evaluation/eval_hooks.py new file mode 100644 index 0000000..cf36a03 --- /dev/null +++ b/mmpose/core/evaluation/eval_hooks.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv.runner import DistEvalHook as _DistEvalHook +from mmcv.runner import EvalHook as _EvalHook + +MMPOSE_GREATER_KEYS = [ + 'acc', 'ap', 'ar', 'pck', 'auc', '3dpck', 'p-3dpck', '3dauc', 'p-3dauc' +] +MMPOSE_LESS_KEYS = ['loss', 'epe', 'nme', 'mpjpe', 'p-mpjpe', 'n-mpjpe'] + + +class EvalHook(_EvalHook): + + def __init__(self, + dataloader, + start=None, + interval=1, + by_epoch=True, + save_best=None, + rule=None, + test_fn=None, + greater_keys=MMPOSE_GREATER_KEYS, + less_keys=MMPOSE_LESS_KEYS, + **eval_kwargs): + + if test_fn is None: + from mmpose.apis import single_gpu_test + test_fn = single_gpu_test + + # to be compatible with the config before v0.16.0 + + # remove "gpu_collect" from eval_kwargs + if 'gpu_collect' in eval_kwargs: + warnings.warn( + '"gpu_collect" will be deprecated in EvalHook.' + 'Please remove it from the config.', DeprecationWarning) + _ = eval_kwargs.pop('gpu_collect') + + # update "save_best" according to "key_indicator" and remove the + # latter from eval_kwargs + if 'key_indicator' in eval_kwargs or isinstance(save_best, bool): + warnings.warn( + '"key_indicator" will be deprecated in EvalHook.' + 'Please use "save_best" to specify the metric key,' + 'e.g., save_best="AP".', DeprecationWarning) + + key_indicator = eval_kwargs.pop('key_indicator', 'AP') + if save_best is True and key_indicator is None: + raise ValueError('key_indicator should not be None, when ' + 'save_best is set to True.') + save_best = key_indicator + + super().__init__(dataloader, start, interval, by_epoch, save_best, + rule, test_fn, greater_keys, less_keys, **eval_kwargs) + + +class DistEvalHook(_DistEvalHook): + + def __init__(self, + dataloader, + start=None, + interval=1, + by_epoch=True, + save_best=None, + rule=None, + test_fn=None, + greater_keys=MMPOSE_GREATER_KEYS, + less_keys=MMPOSE_LESS_KEYS, + broadcast_bn_buffer=True, + tmpdir=None, + gpu_collect=False, + **eval_kwargs): + + if test_fn is None: + from mmpose.apis import multi_gpu_test + test_fn = multi_gpu_test + + # to be compatible with the config before v0.16.0 + + # update "save_best" according to "key_indicator" and remove the + # latter from eval_kwargs + if 'key_indicator' in eval_kwargs or isinstance(save_best, bool): + warnings.warn( + '"key_indicator" will be deprecated in EvalHook.' 
+ 'Please use "save_best" to specify the metric key,' + 'e.g., save_best="AP".', DeprecationWarning) + + key_indicator = eval_kwargs.pop('key_indicator', 'AP') + if save_best is True and key_indicator is None: + raise ValueError('key_indicator should not be None, when ' + 'save_best is set to True.') + save_best = key_indicator + + super().__init__(dataloader, start, interval, by_epoch, save_best, + rule, test_fn, greater_keys, less_keys, + broadcast_bn_buffer, tmpdir, gpu_collect, + **eval_kwargs) diff --git a/mmpose/core/evaluation/mesh_eval.py b/mmpose/core/evaluation/mesh_eval.py new file mode 100644 index 0000000..683b453 --- /dev/null +++ b/mmpose/core/evaluation/mesh_eval.py @@ -0,0 +1,66 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/akanazawa/hmr +# Original licence: Copyright (c) 2018 akanazawa, under the MIT License. +# ------------------------------------------------------------------------------ + +import numpy as np + + +def compute_similarity_transform(source_points, target_points): + """Computes a similarity transform (sR, t) that takes a set of 3D points + source_points (N x 3) closest to a set of 3D points target_points, where R + is an 3x3 rotation matrix, t 3x1 translation, s scale. And return the + transformed 3D points source_points_hat (N x 3). i.e. solves the orthogonal + Procrutes problem. + + Note: + Points number: N + + Args: + source_points (np.ndarray): Source point set with shape [N, 3]. + target_points (np.ndarray): Target point set with shape [N, 3]. + + Returns: + np.ndarray: Transformed source point set with shape [N, 3]. + """ + + assert target_points.shape[0] == source_points.shape[0] + assert target_points.shape[1] == 3 and source_points.shape[1] == 3 + + source_points = source_points.T + target_points = target_points.T + + # 1. Remove mean. + mu1 = source_points.mean(axis=1, keepdims=True) + mu2 = target_points.mean(axis=1, keepdims=True) + X1 = source_points - mu1 + X2 = target_points - mu2 + + # 2. Compute variance of X1 used for scale. + var1 = np.sum(X1**2) + + # 3. The outer product of X1 and X2. + K = X1.dot(X2.T) + + # 4. Solution that Maximizes trace(R'K) is R=U*V', where U, V are + # singular vectors of K. + U, _, Vh = np.linalg.svd(K) + V = Vh.T + # Construct Z that fixes the orientation of R to get det(R)=1. + Z = np.eye(U.shape[0]) + Z[-1, -1] *= np.sign(np.linalg.det(U.dot(V.T))) + # Construct R. + R = V.dot(Z.dot(U.T)) + + # 5. Recover scale. + scale = np.trace(R.dot(K)) / var1 + + # 6. Recover translation. + t = mu2 - scale * (R.dot(mu1)) + + # 7. Transform the source points: + source_points_hat = scale * R.dot(source_points) + t + + source_points_hat = source_points_hat.T + + return source_points_hat diff --git a/mmpose/core/evaluation/pose3d_eval.py b/mmpose/core/evaluation/pose3d_eval.py new file mode 100644 index 0000000..545778c --- /dev/null +++ b/mmpose/core/evaluation/pose3d_eval.py @@ -0,0 +1,171 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + +from .mesh_eval import compute_similarity_transform + + +def keypoint_mpjpe(pred, gt, mask, alignment='none'): + """Calculate the mean per-joint position error (MPJPE) and the error after + rigid alignment with the ground truth (P-MPJPE). + + Note: + - batch_size: N + - num_keypoints: K + - keypoint_dims: C + + Args: + pred (np.ndarray): Predicted keypoint location with shape [N, K, C]. + gt (np.ndarray): Groundtruth keypoint location with shape [N, K, C]. 
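A quick numerical check of the Procrustes alignment above: build a scaled, rotated, translated copy of a point set and verify that the alignment maps it back onto the original (all names are illustrative only):

import numpy as np

target = np.random.rand(17, 3)                     # reference 3D joints
Q, _ = np.linalg.qr(np.random.randn(3, 3))
if np.linalg.det(Q) < 0:
    Q[:, 0] *= -1                                  # force a proper rotation
source = 0.5 * target @ Q.T + np.array([1.0, -2.0, 0.3])

aligned = compute_similarity_transform(source, target)
assert np.allclose(aligned, target)                # scale/rotation/translation recovered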
+ mask (np.ndarray): Visibility of the target with shape [N, K]. + False for invisible joints, and True for visible. + Invisible joints will be ignored for accuracy calculation. + alignment (str, optional): method to align the prediction with the + groundtruth. Supported options are: + + - ``'none'``: no alignment will be applied + - ``'scale'``: align in the least-square sense in scale + - ``'procrustes'``: align in the least-square sense in + scale, rotation and translation. + Returns: + tuple: A tuple containing joint position errors + + - (float | np.ndarray): mean per-joint position error (mpjpe). + - (float | np.ndarray): mpjpe after rigid alignment with the + ground truth (p-mpjpe). + """ + assert mask.any() + + if alignment == 'none': + pass + elif alignment == 'procrustes': + pred = np.stack([ + compute_similarity_transform(pred_i, gt_i) + for pred_i, gt_i in zip(pred, gt) + ]) + elif alignment == 'scale': + pred_dot_pred = np.einsum('nkc,nkc->n', pred, pred) + pred_dot_gt = np.einsum('nkc,nkc->n', pred, gt) + scale_factor = pred_dot_gt / pred_dot_pred + pred = pred * scale_factor[:, None, None] + else: + raise ValueError(f'Invalid value for alignment: {alignment}') + + error = np.linalg.norm(pred - gt, ord=2, axis=-1)[mask].mean() + + return error + + +def keypoint_3d_pck(pred, gt, mask, alignment='none', threshold=0.15): + """Calculate the Percentage of Correct Keypoints (3DPCK) w. or w/o rigid + alignment. + + Paper ref: `Monocular 3D Human Pose Estimation In The Wild Using Improved + CNN Supervision' 3DV'2017. `__ . + + Note: + - batch_size: N + - num_keypoints: K + - keypoint_dims: C + + Args: + pred (np.ndarray[N, K, C]): Predicted keypoint location. + gt (np.ndarray[N, K, C]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + alignment (str, optional): method to align the prediction with the + groundtruth. Supported options are: + + - ``'none'``: no alignment will be applied + - ``'scale'``: align in the least-square sense in scale + - ``'procrustes'``: align in the least-square sense in scale, + rotation and translation. + + threshold: If L2 distance between the prediction and the groundtruth + is less then threshold, the predicted result is considered as + correct. Default: 0.15 (m). + + Returns: + pck: percentage of correct keypoints. + """ + assert mask.any() + + if alignment == 'none': + pass + elif alignment == 'procrustes': + pred = np.stack([ + compute_similarity_transform(pred_i, gt_i) + for pred_i, gt_i in zip(pred, gt) + ]) + elif alignment == 'scale': + pred_dot_pred = np.einsum('nkc,nkc->n', pred, pred) + pred_dot_gt = np.einsum('nkc,nkc->n', pred, gt) + scale_factor = pred_dot_gt / pred_dot_pred + pred = pred * scale_factor[:, None, None] + else: + raise ValueError(f'Invalid value for alignment: {alignment}') + + error = np.linalg.norm(pred - gt, ord=2, axis=-1) + pck = (error < threshold).astype(np.float32)[mask].mean() * 100 + + return pck + + +def keypoint_3d_auc(pred, gt, mask, alignment='none'): + """Calculate the Area Under the Curve (3DAUC) computed for a range of 3DPCK + thresholds. + + Paper ref: `Monocular 3D Human Pose Estimation In The Wild Using Improved + CNN Supervision' 3DV'2017. `__ . + This implementation is derived from mpii_compute_3d_pck.m, which is + provided as part of the MPI-INF-3DHP test data release. 
+ + Note: + batch_size: N + num_keypoints: K + keypoint_dims: C + + Args: + pred (np.ndarray[N, K, C]): Predicted keypoint location. + gt (np.ndarray[N, K, C]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + alignment (str, optional): method to align the prediction with the + groundtruth. Supported options are: + + - ``'none'``: no alignment will be applied + - ``'scale'``: align in the least-square sense in scale + - ``'procrustes'``: align in the least-square sense in scale, + rotation and translation. + + Returns: + auc: AUC computed for a range of 3DPCK thresholds. + """ + assert mask.any() + + if alignment == 'none': + pass + elif alignment == 'procrustes': + pred = np.stack([ + compute_similarity_transform(pred_i, gt_i) + for pred_i, gt_i in zip(pred, gt) + ]) + elif alignment == 'scale': + pred_dot_pred = np.einsum('nkc,nkc->n', pred, pred) + pred_dot_gt = np.einsum('nkc,nkc->n', pred, gt) + scale_factor = pred_dot_gt / pred_dot_pred + pred = pred * scale_factor[:, None, None] + else: + raise ValueError(f'Invalid value for alignment: {alignment}') + + error = np.linalg.norm(pred - gt, ord=2, axis=-1) + + thresholds = np.linspace(0., 0.15, 31) + pck_values = np.zeros(len(thresholds)) + for i in range(len(thresholds)): + pck_values[i] = (error < thresholds[i]).astype(np.float32)[mask].mean() + + auc = pck_values.mean() * 100 + + return auc diff --git a/mmpose/core/evaluation/top_down_eval.py b/mmpose/core/evaluation/top_down_eval.py new file mode 100644 index 0000000..ee6a250 --- /dev/null +++ b/mmpose/core/evaluation/top_down_eval.py @@ -0,0 +1,684 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import cv2 +import numpy as np + +from mmpose.core.post_processing import transform_preds + + +def _calc_distances(preds, targets, mask, normalize): + """Calculate the normalized distances between preds and target. + + Note: + batch_size: N + num_keypoints: K + dimension of keypoints: D (normally, D=2 or D=3) + + Args: + preds (np.ndarray[N, K, D]): Predicted keypoint location. + targets (np.ndarray[N, K, D]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + normalize (np.ndarray[N, D]): Typical value is heatmap_size + + Returns: + np.ndarray[K, N]: The normalized distances. \ + If target keypoints are missing, the distance is -1. + """ + N, K, _ = preds.shape + # set mask=0 when normalize==0 + _mask = mask.copy() + _mask[np.where((normalize == 0).sum(1))[0], :] = False + distances = np.full((N, K), -1, dtype=np.float32) + # handle invalid values + normalize[np.where(normalize <= 0)] = 1e6 + distances[_mask] = np.linalg.norm( + ((preds - targets) / normalize[:, None, :])[_mask], axis=-1) + return distances.T + + +def _distance_acc(distances, thr=0.5): + """Return the percentage below the distance threshold, while ignoring + distances values with -1. + + Note: + batch_size: N + Args: + distances (np.ndarray[N, ]): The normalized distances. + thr (float): Threshold of the distances. + + Returns: + float: Percentage of distances below the threshold. \ + If all target keypoints are missing, return -1. 
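The three 3D pose metrics above share the same input layout; a minimal sketch on synthetic data (treating the coordinates as metres is an assumption of this example):

import numpy as np

gt = np.random.rand(32, 17, 3)                      # 32 poses, 17 joints
pred = gt + 0.01 * np.random.randn(32, 17, 3)       # roughly 1 cm of noise
mask = np.ones((32, 17), dtype=bool)

mpjpe = keypoint_mpjpe(pred, gt, mask, alignment='none')
p_mpjpe = keypoint_mpjpe(pred, gt, mask, alignment='procrustes')
pck = keypoint_3d_pck(pred, gt, mask, threshold=0.15)    # percentage in [0, 100]
auc = keypoint_3d_auc(pred, gt, mask)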
+ """ + distance_valid = distances != -1 + num_distance_valid = distance_valid.sum() + if num_distance_valid > 0: + return (distances[distance_valid] < thr).sum() / num_distance_valid + return -1 + + +def _get_max_preds(heatmaps): + """Get keypoint predictions from score maps. + + Note: + batch_size: N + num_keypoints: K + heatmap height: H + heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. + + Returns: + tuple: A tuple containing aggregated results. + + - preds (np.ndarray[N, K, 2]): Predicted keypoint location. + - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. + """ + assert isinstance(heatmaps, + np.ndarray), ('heatmaps should be numpy.ndarray') + assert heatmaps.ndim == 4, 'batch_images should be 4-ndim' + + N, K, _, W = heatmaps.shape + heatmaps_reshaped = heatmaps.reshape((N, K, -1)) + idx = np.argmax(heatmaps_reshaped, 2).reshape((N, K, 1)) + maxvals = np.amax(heatmaps_reshaped, 2).reshape((N, K, 1)) + + preds = np.tile(idx, (1, 1, 2)).astype(np.float32) + preds[:, :, 0] = preds[:, :, 0] % W + preds[:, :, 1] = preds[:, :, 1] // W + + preds = np.where(np.tile(maxvals, (1, 1, 2)) > 0.0, preds, -1) + return preds, maxvals + + +def _get_max_preds_3d(heatmaps): + """Get keypoint predictions from 3D score maps. + + Note: + batch size: N + num keypoints: K + heatmap depth size: D + heatmap height: H + heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, D, H, W]): model predicted heatmaps. + + Returns: + tuple: A tuple containing aggregated results. + + - preds (np.ndarray[N, K, 3]): Predicted keypoint location. + - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. + """ + assert isinstance(heatmaps, np.ndarray), \ + ('heatmaps should be numpy.ndarray') + assert heatmaps.ndim == 5, 'heatmaps should be 5-ndim' + + N, K, D, H, W = heatmaps.shape + heatmaps_reshaped = heatmaps.reshape((N, K, -1)) + idx = np.argmax(heatmaps_reshaped, 2).reshape((N, K, 1)) + maxvals = np.amax(heatmaps_reshaped, 2).reshape((N, K, 1)) + + preds = np.zeros((N, K, 3), dtype=np.float32) + _idx = idx[..., 0] + preds[..., 2] = _idx // (H * W) + preds[..., 1] = (_idx // W) % H + preds[..., 0] = _idx % W + + preds = np.where(maxvals > 0.0, preds, -1) + return preds, maxvals + + +def pose_pck_accuracy(output, target, mask, thr=0.05, normalize=None): + """Calculate the pose accuracy of PCK for each individual keypoint and the + averaged accuracy across all keypoints from heatmaps. + + Note: + PCK metric measures accuracy of the localization of the body joints. + The distances between predicted positions and the ground-truth ones + are typically normalized by the bounding box size. + The threshold (thr) of the normalized distance is commonly set + as 0.05, 0.1 or 0.2 etc. + + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + output (np.ndarray[N, K, H, W]): Model output heatmaps. + target (np.ndarray[N, K, H, W]): Groundtruth heatmaps. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + thr (float): Threshold of PCK calculation. Default 0.05. + normalize (np.ndarray[N, 2]): Normalization factor for H&W. + + Returns: + tuple: A tuple containing keypoint accuracy. + + - np.ndarray[K]: Accuracy of each keypoint. + - float: Averaged accuracy across all keypoints. + - int: Number of valid keypoints. 
+ """ + N, K, H, W = output.shape + if K == 0: + return None, 0, 0 + if normalize is None: + normalize = np.tile(np.array([[H, W]]), (N, 1)) + + pred, _ = _get_max_preds(output) + gt, _ = _get_max_preds(target) + return keypoint_pck_accuracy(pred, gt, mask, thr, normalize) + + +def keypoint_pck_accuracy(pred, gt, mask, thr, normalize): + """Calculate the pose accuracy of PCK for each individual keypoint and the + averaged accuracy across all keypoints for coordinates. + + Note: + PCK metric measures accuracy of the localization of the body joints. + The distances between predicted positions and the ground-truth ones + are typically normalized by the bounding box size. + The threshold (thr) of the normalized distance is commonly set + as 0.05, 0.1 or 0.2 etc. + + - batch_size: N + - num_keypoints: K + + Args: + pred (np.ndarray[N, K, 2]): Predicted keypoint location. + gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + thr (float): Threshold of PCK calculation. + normalize (np.ndarray[N, 2]): Normalization factor for H&W. + + Returns: + tuple: A tuple containing keypoint accuracy. + + - acc (np.ndarray[K]): Accuracy of each keypoint. + - avg_acc (float): Averaged accuracy across all keypoints. + - cnt (int): Number of valid keypoints. + """ + distances = _calc_distances(pred, gt, mask, normalize) + + acc = np.array([_distance_acc(d, thr) for d in distances]) + valid_acc = acc[acc >= 0] + cnt = len(valid_acc) + avg_acc = valid_acc.mean() if cnt > 0 else 0 + return acc, avg_acc, cnt + + +def keypoint_auc(pred, gt, mask, normalize, num_step=20): + """Calculate the pose accuracy of PCK for each individual keypoint and the + averaged accuracy across all keypoints for coordinates. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + pred (np.ndarray[N, K, 2]): Predicted keypoint location. + gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + normalize (float): Normalization factor. + + Returns: + float: Area under curve. + """ + nor = np.tile(np.array([[normalize, normalize]]), (pred.shape[0], 1)) + x = [1.0 * i / num_step for i in range(num_step)] + y = [] + for thr in x: + _, avg_acc, _ = keypoint_pck_accuracy(pred, gt, mask, thr, nor) + y.append(avg_acc) + + auc = 0 + for i in range(num_step): + auc += 1.0 / num_step * y[i] + return auc + + +def keypoint_nme(pred, gt, mask, normalize_factor): + """Calculate the normalized mean error (NME). + + Note: + - batch_size: N + - num_keypoints: K + + Args: + pred (np.ndarray[N, K, 2]): Predicted keypoint location. + gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + normalize_factor (np.ndarray[N, 2]): Normalization factor. + + Returns: + float: normalized mean error + """ + distances = _calc_distances(pred, gt, mask, normalize_factor) + distance_valid = distances[distances != -1] + return distance_valid.sum() / max(1, len(distance_valid)) + + +def keypoint_epe(pred, gt, mask): + """Calculate the end-point error. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + pred (np.ndarray[N, K, 2]): Predicted keypoint location. 
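The 2D keypoint metrics above follow the same pattern; a short sketch with synthetic coordinates, normalising distances by a 64-pixel reference size:

import numpy as np

pred = np.random.rand(4, 17, 2) * 64                # predicted 2D joints
gt = pred + np.random.randn(4, 17, 2)               # ground truth close to predictions
mask = np.ones((4, 17), dtype=bool)
normalize = np.full((4, 2), 64.0)                   # per-sample [w, h] normaliser

acc, avg_acc, cnt = keypoint_pck_accuracy(pred, gt, mask, thr=0.05, normalize=normalize)
auc = keypoint_auc(pred, gt, mask, normalize=64.0, num_step=20)
epe = keypoint_epe(pred, gt, mask)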
+ gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + + Returns: + float: Average end-point error. + """ + + distances = _calc_distances( + pred, gt, mask, + np.ones((pred.shape[0], pred.shape[2]), dtype=np.float32)) + distance_valid = distances[distances != -1] + return distance_valid.sum() / max(1, len(distance_valid)) + + +def _taylor(heatmap, coord): + """Distribution aware coordinate decoding method. + + Note: + - heatmap height: H + - heatmap width: W + + Args: + heatmap (np.ndarray[H, W]): Heatmap of a particular joint type. + coord (np.ndarray[2,]): Coordinates of the predicted keypoints. + + Returns: + np.ndarray[2,]: Updated coordinates. + """ + H, W = heatmap.shape[:2] + px, py = int(coord[0]), int(coord[1]) + if 1 < px < W - 2 and 1 < py < H - 2: + dx = 0.5 * (heatmap[py][px + 1] - heatmap[py][px - 1]) + dy = 0.5 * (heatmap[py + 1][px] - heatmap[py - 1][px]) + dxx = 0.25 * ( + heatmap[py][px + 2] - 2 * heatmap[py][px] + heatmap[py][px - 2]) + dxy = 0.25 * ( + heatmap[py + 1][px + 1] - heatmap[py - 1][px + 1] - + heatmap[py + 1][px - 1] + heatmap[py - 1][px - 1]) + dyy = 0.25 * ( + heatmap[py + 2 * 1][px] - 2 * heatmap[py][px] + + heatmap[py - 2 * 1][px]) + derivative = np.array([[dx], [dy]]) + hessian = np.array([[dxx, dxy], [dxy, dyy]]) + if dxx * dyy - dxy**2 != 0: + hessianinv = np.linalg.inv(hessian) + offset = -hessianinv @ derivative + offset = np.squeeze(np.array(offset.T), axis=0) + coord += offset + return coord + + +def post_dark_udp(coords, batch_heatmaps, kernel=3): + """DARK post-pocessing. Implemented by udp. Paper ref: Huang et al. The + Devil is in the Details: Delving into Unbiased Data Processing for Human + Pose Estimation (CVPR 2020). Zhang et al. Distribution-Aware Coordinate + Representation for Human Pose Estimation (CVPR 2020). + + Note: + - batch size: B + - num keypoints: K + - num persons: N + - height of heatmaps: H + - width of heatmaps: W + + B=1 for bottom_up paradigm where all persons share the same heatmap. + B=N for top_down paradigm where each person has its own heatmaps. + + Args: + coords (np.ndarray[N, K, 2]): Initial coordinates of human pose. + batch_heatmaps (np.ndarray[B, K, H, W]): batch_heatmaps + kernel (int): Gaussian kernel size (K) for modulation. + + Returns: + np.ndarray([N, K, 2]): Refined coordinates. 
+ """ + if not isinstance(batch_heatmaps, np.ndarray): + batch_heatmaps = batch_heatmaps.cpu().numpy() + B, K, H, W = batch_heatmaps.shape + N = coords.shape[0] + assert (B == 1 or B == N) + for heatmaps in batch_heatmaps: + for heatmap in heatmaps: + cv2.GaussianBlur(heatmap, (kernel, kernel), 0, heatmap) + np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps) + np.log(batch_heatmaps, batch_heatmaps) + + batch_heatmaps_pad = np.pad( + batch_heatmaps, ((0, 0), (0, 0), (1, 1), (1, 1)), + mode='edge').flatten() + + index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (W + 2) + index += (W + 2) * (H + 2) * np.arange(0, B * K).reshape(-1, K) + index = index.astype(int).reshape(-1, 1) + i_ = batch_heatmaps_pad[index] + ix1 = batch_heatmaps_pad[index + 1] + iy1 = batch_heatmaps_pad[index + W + 2] + ix1y1 = batch_heatmaps_pad[index + W + 3] + ix1_y1_ = batch_heatmaps_pad[index - W - 3] + ix1_ = batch_heatmaps_pad[index - 1] + iy1_ = batch_heatmaps_pad[index - 2 - W] + + dx = 0.5 * (ix1 - ix1_) + dy = 0.5 * (iy1 - iy1_) + derivative = np.concatenate([dx, dy], axis=1) + derivative = derivative.reshape(N, K, 2, 1) + dxx = ix1 - 2 * i_ + ix1_ + dyy = iy1 - 2 * i_ + iy1_ + dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_) + hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1) + hessian = hessian.reshape(N, K, 2, 2) + hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2)) + coords -= np.einsum('ijmn,ijnk->ijmk', hessian, derivative).squeeze() + return coords + + +def _gaussian_blur(heatmaps, kernel=11): + """Modulate heatmap distribution with Gaussian. + sigma = 0.3*((kernel_size-1)*0.5-1)+0.8 + sigma~=3 if k=17 + sigma=2 if k=11; + sigma~=1.5 if k=7; + sigma~=1 if k=3; + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. + kernel (int): Gaussian kernel size (K) for modulation, which should + match the heatmap gaussian sigma when training. + K=17 for sigma=3 and k=11 for sigma=2. + + Returns: + np.ndarray ([N, K, H, W]): Modulated heatmap distribution. + """ + assert kernel % 2 == 1 + + border = (kernel - 1) // 2 + batch_size = heatmaps.shape[0] + num_joints = heatmaps.shape[1] + height = heatmaps.shape[2] + width = heatmaps.shape[3] + for i in range(batch_size): + for j in range(num_joints): + origin_max = np.max(heatmaps[i, j]) + dr = np.zeros((height + 2 * border, width + 2 * border), + dtype=np.float32) + dr[border:-border, border:-border] = heatmaps[i, j].copy() + dr = cv2.GaussianBlur(dr, (kernel, kernel), 0) + heatmaps[i, j] = dr[border:-border, border:-border].copy() + heatmaps[i, j] *= origin_max / np.max(heatmaps[i, j]) + return heatmaps + + +def keypoints_from_regression(regression_preds, center, scale, img_size): + """Get final keypoint predictions from regression vectors and transform + them back to the image. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + regression_preds (np.ndarray[N, K, 2]): model prediction. + center (np.ndarray[N, 2]): Center of the bounding box (x, y). + scale (np.ndarray[N, 2]): Scale of the bounding box + wrt height/width. + img_size (list(img_width, img_height)): model input image size. + + Returns: + tuple: + + - preds (np.ndarray[N, K, 2]): Predicted keypoint location in images. + - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. 
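A brief sketch of the DARK/UDP refinement above: take the coarse argmax locations from _get_max_preds and refine them to sub-pixel accuracy. Since post_dark_udp modulates the heatmaps (and the coordinates) in place, copies are passed here to keep the raw arrays intact:

import numpy as np

heatmaps = np.random.rand(2, 17, 64, 48).astype(np.float32)   # [N, K, H, W]
coords, maxvals = _get_max_preds(heatmaps)                     # coarse argmax, [N, K, 2]
refined = post_dark_udp(coords.copy(), heatmaps.copy(), kernel=3)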
+ """ + N, K, _ = regression_preds.shape + preds, maxvals = regression_preds, np.ones((N, K, 1), dtype=np.float32) + + preds = preds * img_size + + # Transform back to the image + for i in range(N): + preds[i] = transform_preds(preds[i], center[i], scale[i], img_size) + + return preds, maxvals + + +def keypoints_from_heatmaps(heatmaps, + center, + scale, + unbiased=False, + post_process='default', + kernel=11, + valid_radius_factor=0.0546875, + use_udp=False, + target_type='GaussianHeatmap'): + """Get final keypoint predictions from heatmaps and transform them back to + the image. + + Note: + - batch size: N + - num keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. + center (np.ndarray[N, 2]): Center of the bounding box (x, y). + scale (np.ndarray[N, 2]): Scale of the bounding box + wrt height/width. + post_process (str/None): Choice of methods to post-process + heatmaps. Currently supported: None, 'default', 'unbiased', + 'megvii'. + unbiased (bool): Option to use unbiased decoding. Mutually + exclusive with megvii. + Note: this arg is deprecated and unbiased=True can be replaced + by post_process='unbiased' + Paper ref: Zhang et al. Distribution-Aware Coordinate + Representation for Human Pose Estimation (CVPR 2020). + kernel (int): Gaussian kernel size (K) for modulation, which should + match the heatmap gaussian sigma when training. + K=17 for sigma=3 and k=11 for sigma=2. + valid_radius_factor (float): The radius factor of the positive area + in classification heatmap for UDP. + use_udp (bool): Use unbiased data processing. + target_type (str): 'GaussianHeatmap' or 'CombinedTarget'. + GaussianHeatmap: Classification target with gaussian distribution. + CombinedTarget: The combination of classification target + (response map) and regression target (offset map). + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + + Returns: + tuple: A tuple containing keypoint predictions and scores. + + - preds (np.ndarray[N, K, 2]): Predicted keypoint location in images. + - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. 
+ """ + # Avoid being affected + heatmaps = heatmaps.copy() + + # detect conflicts + if unbiased: + assert post_process not in [False, None, 'megvii'] + if post_process in ['megvii', 'unbiased']: + assert kernel > 0 + if use_udp: + assert not post_process == 'megvii' + + # normalize configs + if post_process is False: + warnings.warn( + 'post_process=False is deprecated, ' + 'please use post_process=None instead', DeprecationWarning) + post_process = None + elif post_process is True: + if unbiased is True: + warnings.warn( + 'post_process=True, unbiased=True is deprecated,' + " please use post_process='unbiased' instead", + DeprecationWarning) + post_process = 'unbiased' + else: + warnings.warn( + 'post_process=True, unbiased=False is deprecated, ' + "please use post_process='default' instead", + DeprecationWarning) + post_process = 'default' + elif post_process == 'default': + if unbiased is True: + warnings.warn( + 'unbiased=True is deprecated, please use ' + "post_process='unbiased' instead", DeprecationWarning) + post_process = 'unbiased' + + # start processing + if post_process == 'megvii': + heatmaps = _gaussian_blur(heatmaps, kernel=kernel) + + N, K, H, W = heatmaps.shape + if use_udp: + if target_type.lower() == 'GaussianHeatMap'.lower(): + preds, maxvals = _get_max_preds(heatmaps) + preds = post_dark_udp(preds, heatmaps, kernel=kernel) + elif target_type.lower() == 'CombinedTarget'.lower(): + for person_heatmaps in heatmaps: + for i, heatmap in enumerate(person_heatmaps): + kt = 2 * kernel + 1 if i % 3 == 0 else kernel + cv2.GaussianBlur(heatmap, (kt, kt), 0, heatmap) + # valid radius is in direct proportion to the height of heatmap. + valid_radius = valid_radius_factor * H + offset_x = heatmaps[:, 1::3, :].flatten() * valid_radius + offset_y = heatmaps[:, 2::3, :].flatten() * valid_radius + heatmaps = heatmaps[:, ::3, :] + preds, maxvals = _get_max_preds(heatmaps) + index = preds[..., 0] + preds[..., 1] * W + index += W * H * np.arange(0, N * K / 3) + index = index.astype(int).reshape(N, K // 3, 1) + preds += np.concatenate((offset_x[index], offset_y[index]), axis=2) + else: + raise ValueError('target_type should be either ' + "'GaussianHeatmap' or 'CombinedTarget'") + else: + preds, maxvals = _get_max_preds(heatmaps) + if post_process == 'unbiased': # alleviate biased coordinate + # apply Gaussian distribution modulation. + heatmaps = np.log( + np.maximum(_gaussian_blur(heatmaps, kernel), 1e-10)) + for n in range(N): + for k in range(K): + preds[n][k] = _taylor(heatmaps[n][k], preds[n][k]) + elif post_process is not None: + # add +/-0.25 shift to the predicted locations for higher acc. + for n in range(N): + for k in range(K): + heatmap = heatmaps[n][k] + px = int(preds[n][k][0]) + py = int(preds[n][k][1]) + if 1 < px < W - 1 and 1 < py < H - 1: + diff = np.array([ + heatmap[py][px + 1] - heatmap[py][px - 1], + heatmap[py + 1][px] - heatmap[py - 1][px] + ]) + preds[n][k] += np.sign(diff) * .25 + if post_process == 'megvii': + preds[n][k] += 0.5 + + # Transform back to the image + for i in range(N): + preds[i] = transform_preds( + preds[i], center[i], scale[i], [W, H], use_udp=use_udp) + + if post_process == 'megvii': + maxvals = maxvals / 255.0 + 0.5 + + return preds, maxvals + + +def keypoints_from_heatmaps3d(heatmaps, center, scale): + """Get final keypoint predictions from 3d heatmaps and transform them back + to the image. 
+ + Note: + - batch size: N + - num keypoints: K + - heatmap depth size: D + - heatmap height: H + - heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, D, H, W]): model predicted heatmaps. + center (np.ndarray[N, 2]): Center of the bounding box (x, y). + scale (np.ndarray[N, 2]): Scale of the bounding box + wrt height/width. + + Returns: + tuple: A tuple containing keypoint predictions and scores. + + - preds (np.ndarray[N, K, 3]): Predicted 3d keypoint location \ + in images. + - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. + """ + N, K, D, H, W = heatmaps.shape + preds, maxvals = _get_max_preds_3d(heatmaps) + # Transform back to the image + for i in range(N): + preds[i, :, :2] = transform_preds(preds[i, :, :2], center[i], scale[i], + [W, H]) + return preds, maxvals + + +def multilabel_classification_accuracy(pred, gt, mask, thr=0.5): + """Get multi-label classification accuracy. + + Note: + - batch size: N + - label number: L + + Args: + pred (np.ndarray[N, L, 2]): model predicted labels. + gt (np.ndarray[N, L, 2]): ground-truth labels. + mask (np.ndarray[N, 1] or np.ndarray[N, L] ): reliability of + ground-truth labels. + + Returns: + float: multi-label classification accuracy. + """ + # we only compute accuracy on the samples with ground-truth of all labels. + valid = (mask > 0).min(axis=1) if mask.ndim == 2 else (mask > 0) + pred, gt = pred[valid], gt[valid] + + if pred.shape[0] == 0: + acc = 0.0 # when no sample is with gt labels, set acc to 0. + else: + # The classification of a sample is regarded as correct + # only if it's correct for all labels. + acc = (((pred - thr) * (gt - thr)) > 0).all(axis=1).mean() + return acc diff --git a/mmpose/core/fp16/__init__.py b/mmpose/core/fp16/__init__.py new file mode 100644 index 0000000..5cb0548 --- /dev/null +++ b/mmpose/core/fp16/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .decorators import auto_fp16, force_fp32 +from .hooks import Fp16OptimizerHook, wrap_fp16_model +from .utils import cast_tensor_type + +__all__ = [ + 'auto_fp16', 'force_fp32', 'Fp16OptimizerHook', 'wrap_fp16_model', + 'cast_tensor_type' +] diff --git a/mmpose/core/fp16/decorators.py b/mmpose/core/fp16/decorators.py new file mode 100644 index 0000000..2d70ddf --- /dev/null +++ b/mmpose/core/fp16/decorators.py @@ -0,0 +1,175 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import functools +import warnings +from inspect import getfullargspec + +import torch + +from .utils import cast_tensor_type + + +def auto_fp16(apply_to=None, out_fp32=False): + """Decorator to enable fp16 training automatically. + + This decorator is useful when you write custom modules and want to support + mixed precision training. If inputs arguments are fp32 tensors, they will + be converted to fp16 automatically. Arguments other than fp32 tensors are + ignored. + + Args: + apply_to (Iterable, optional): The argument names to be converted. + `None` indicates all arguments. + out_fp32 (bool): Whether to convert the output back to fp32. + + Example: + + >>> import torch.nn as nn + >>> class MyModule1(nn.Module): + >>> + >>> # Convert x and y to fp16 + >>> @auto_fp16() + >>> def forward(self, x, y): + >>> pass + + >>> import torch.nn as nn + >>> class MyModule2(nn.Module): + >>> + >>> # convert pred to fp16 + >>> @auto_fp16(apply_to=('pred', )) + >>> def do_something(self, pred, others): + >>> pass + """ + + warnings.warn( + 'auto_fp16 in mmpose will be deprecated in the next release.' 
+ 'Please use mmcv.runner.auto_fp16 instead (mmcv>=1.3.1).', + DeprecationWarning) + + def auto_fp16_wrapper(old_func): + + @functools.wraps(old_func) + def new_func(*args, **kwargs): + # check if the module has set the attribute `fp16_enabled`, if not, + # just fallback to the original method. + if not isinstance(args[0], torch.nn.Module): + raise TypeError('@auto_fp16 can only be used to decorate the ' + 'method of nn.Module') + if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): + return old_func(*args, **kwargs) + # get the arg spec of the decorated method + args_info = getfullargspec(old_func) + # get the argument names to be casted + args_to_cast = args_info.args if apply_to is None else apply_to + # convert the args that need to be processed + new_args = [] + # NOTE: default args are not taken into consideration + if args: + arg_names = args_info.args[:len(args)] + for i, arg_name in enumerate(arg_names): + if arg_name in args_to_cast: + new_args.append( + cast_tensor_type(args[i], torch.float, torch.half)) + else: + new_args.append(args[i]) + # convert the kwargs that need to be processed + new_kwargs = {} + if kwargs: + for arg_name, arg_value in kwargs.items(): + if arg_name in args_to_cast: + new_kwargs[arg_name] = cast_tensor_type( + arg_value, torch.float, torch.half) + else: + new_kwargs[arg_name] = arg_value + # apply converted arguments to the decorated method + output = old_func(*new_args, **new_kwargs) + # cast the results back to fp32 if necessary + if out_fp32: + output = cast_tensor_type(output, torch.half, torch.float) + return output + + return new_func + + return auto_fp16_wrapper + + +def force_fp32(apply_to=None, out_fp16=False): + """Decorator to convert input arguments to fp32 in force. + + This decorator is useful when you write custom modules and want to support + mixed precision training. If there are some inputs that must be processed + in fp32 mode, then this decorator can handle it. If inputs arguments are + fp16 tensors, they will be converted to fp32 automatically. Arguments other + than fp16 tensors are ignored. + + Args: + apply_to (Iterable, optional): The argument names to be converted. + `None` indicates all arguments. + out_fp16 (bool): Whether to convert the output back to fp16. + + Example: + + >>> import torch.nn as nn + >>> class MyModule1(nn.Module): + >>> + >>> # Convert x and y to fp32 + >>> @force_fp32() + >>> def loss(self, x, y): + >>> pass + + >>> import torch.nn as nn + >>> class MyModule2(nn.Module): + >>> + >>> # convert pred to fp32 + >>> @force_fp32(apply_to=('pred', )) + >>> def post_process(self, pred, others): + >>> pass + """ + warnings.warn( + 'force_fp32 in mmpose will be deprecated in the next release.' + 'Please use mmcv.runner.force_fp32 instead (mmcv>=1.3.1).', + DeprecationWarning) + + def force_fp32_wrapper(old_func): + + @functools.wraps(old_func) + def new_func(*args, **kwargs): + # check if the module has set the attribute `fp16_enabled`, if not, + # just fallback to the original method. 
+ if not isinstance(args[0], torch.nn.Module): + raise TypeError('@force_fp32 can only be used to decorate the ' + 'method of nn.Module') + if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): + return old_func(*args, **kwargs) + # get the arg spec of the decorated method + args_info = getfullargspec(old_func) + # get the argument names to be casted + args_to_cast = args_info.args if apply_to is None else apply_to + # convert the args that need to be processed + new_args = [] + if args: + arg_names = args_info.args[:len(args)] + for i, arg_name in enumerate(arg_names): + if arg_name in args_to_cast: + new_args.append( + cast_tensor_type(args[i], torch.half, torch.float)) + else: + new_args.append(args[i]) + # convert the kwargs that need to be processed + new_kwargs = dict() + if kwargs: + for arg_name, arg_value in kwargs.items(): + if arg_name in args_to_cast: + new_kwargs[arg_name] = cast_tensor_type( + arg_value, torch.half, torch.float) + else: + new_kwargs[arg_name] = arg_value + # apply converted arguments to the decorated method + output = old_func(*new_args, **new_kwargs) + # cast the results back to fp32 if necessary + if out_fp16: + output = cast_tensor_type(output, torch.float, torch.half) + return output + + return new_func + + return force_fp32_wrapper diff --git a/mmpose/core/fp16/hooks.py b/mmpose/core/fp16/hooks.py new file mode 100644 index 0000000..74081a9 --- /dev/null +++ b/mmpose/core/fp16/hooks.py @@ -0,0 +1,167 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch +import torch.nn as nn +from mmcv.runner import OptimizerHook +from mmcv.utils import _BatchNorm + +from ..utils.dist_utils import allreduce_grads +from .utils import cast_tensor_type + + +class Fp16OptimizerHook(OptimizerHook): + """FP16 optimizer hook. + + The steps of fp16 optimizer is as follows. + 1. Scale the loss value. + 2. BP in the fp16 model. + 2. Copy gradients from fp16 model to fp32 weights. + 3. Update fp32 weights. + 4. Copy updated parameters from fp32 weights to fp16 model. + + Refer to https://arxiv.org/abs/1710.03740 for more details. + + Args: + loss_scale (float): Scale factor multiplied with loss. + """ + + def __init__(self, + grad_clip=None, + coalesce=True, + bucket_size_mb=-1, + loss_scale=512., + distributed=True): + self.grad_clip = grad_clip + self.coalesce = coalesce + self.bucket_size_mb = bucket_size_mb + self.loss_scale = loss_scale + self.distributed = distributed + + def before_run(self, runner): + """Preparing steps before Mixed Precision Training. + + 1. Make a master copy of fp32 weights for optimization. + 2. Convert the main model from fp32 to fp16. + + Args: + runner (:obj:`mmcv.Runner`): The underlines training runner. 
+ """ + # keep a copy of fp32 weights + runner.optimizer.param_groups = copy.deepcopy( + runner.optimizer.param_groups) + # convert model to fp16 + wrap_fp16_model(runner.model) + + @staticmethod + def copy_grads_to_fp32(fp16_net, fp32_weights): + """Copy gradients from fp16 model to fp32 weight copy.""" + for fp32_param, fp16_param in zip(fp32_weights, fp16_net.parameters()): + if fp16_param.grad is not None: + if fp32_param.grad is None: + fp32_param.grad = fp32_param.data.new(fp32_param.size()) + fp32_param.grad.copy_(fp16_param.grad) + + @staticmethod + def copy_params_to_fp16(fp16_net, fp32_weights): + """Copy updated params from fp32 weight copy to fp16 model.""" + for fp16_param, fp32_param in zip(fp16_net.parameters(), fp32_weights): + fp16_param.data.copy_(fp32_param.data) + + def after_train_iter(self, runner): + """Backward optimization steps for Mixed Precision Training. + + 1. Scale the loss by a scale factor. + 2. Backward the loss to obtain the gradients (fp16). + 3. Copy gradients from the model to the fp32 weight copy. + 4. Scale the gradients back and update the fp32 weight copy. + 5. Copy back the params from fp32 weight copy to the fp16 model. + + Args: + runner (:obj:`mmcv.Runner`): The underlines training runner. + """ + # clear grads of last iteration + runner.model.zero_grad() + runner.optimizer.zero_grad() + # scale the loss value + scaled_loss = runner.outputs['loss'] * self.loss_scale + scaled_loss.backward() + # copy fp16 grads in the model to fp32 params in the optimizer + fp32_weights = [] + for param_group in runner.optimizer.param_groups: + fp32_weights += param_group['params'] + self.copy_grads_to_fp32(runner.model, fp32_weights) + # allreduce grads + if self.distributed: + allreduce_grads(fp32_weights, self.coalesce, self.bucket_size_mb) + # scale the gradients back + for param in fp32_weights: + if param.grad is not None: + param.grad.div_(self.loss_scale) + if self.grad_clip is not None: + self.clip_grads(fp32_weights) + # update fp32 params + runner.optimizer.step() + # copy fp32 params to the fp16 model + self.copy_params_to_fp16(runner.model, fp32_weights) + + +def wrap_fp16_model(model): + """Wrap the FP32 model to FP16. + + 1. Convert FP32 model to FP16. + 2. Remain some necessary layers to be FP32, e.g., normalization layers. + + Args: + model (nn.Module): Model in FP32. + """ + # convert model to fp16 + model.half() + # patch the normalization layers to make it work in fp32 mode + patch_norm_fp32(model) + # set `fp16_enabled` flag + for m in model.modules(): + if hasattr(m, 'fp16_enabled'): + m.fp16_enabled = True + + +def patch_norm_fp32(module): + """Recursively convert normalization layers from FP16 to FP32. + + Args: + module (nn.Module): The modules to be converted in FP16. + + Returns: + nn.Module: The converted module, the normalization layers have been + converted to FP32. + """ + if isinstance(module, (_BatchNorm, nn.GroupNorm)): + module.float() + module.forward = patch_forward_method(module.forward, torch.half, + torch.float) + for child in module.children(): + patch_norm_fp32(child) + return module + + +def patch_forward_method(func, src_type, dst_type, convert_output=True): + """Patch the forward method of a module. + + Args: + func (callable): The original forward method. + src_type (torch.dtype): Type of input arguments to be converted from. + dst_type (torch.dtype): Type of input arguments to be converted to. + convert_output (bool): Whether to convert the output back to src_type. 
+ + Returns: + callable: The patched forward method. + """ + + def new_forward(*args, **kwargs): + output = func(*cast_tensor_type(args, src_type, dst_type), + **cast_tensor_type(kwargs, src_type, dst_type)) + if convert_output: + output = cast_tensor_type(output, dst_type, src_type) + return output + + return new_forward diff --git a/mmpose/core/fp16/utils.py b/mmpose/core/fp16/utils.py new file mode 100644 index 0000000..f1ec3d3 --- /dev/null +++ b/mmpose/core/fp16/utils.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import abc + +import numpy as np +import torch + + +def cast_tensor_type(inputs, src_type, dst_type): + """Recursively convert Tensor in inputs from src_type to dst_type. + + Args: + inputs: Inputs that to be casted. + src_type (torch.dtype): Source type. + dst_type (torch.dtype): Destination type. + + Returns: + The same type with inputs, but all contained Tensors have been cast. + """ + if isinstance(inputs, torch.Tensor): + return inputs.to(dst_type) + elif isinstance(inputs, str): + return inputs + elif isinstance(inputs, np.ndarray): + return inputs + elif isinstance(inputs, abc.Mapping): + return type(inputs)({ + k: cast_tensor_type(v, src_type, dst_type) + for k, v in inputs.items() + }) + elif isinstance(inputs, abc.Iterable): + return type(inputs)( + cast_tensor_type(item, src_type, dst_type) for item in inputs) + + return inputs diff --git a/mmpose/core/optimizer/__init__.py b/mmpose/core/optimizer/__init__.py new file mode 100644 index 0000000..4340ffc --- /dev/null +++ b/mmpose/core/optimizer/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .builder import OPTIMIZERS, build_optimizers + +__all__ = ['build_optimizers', 'OPTIMIZERS'] diff --git a/mmpose/core/optimizer/builder.py b/mmpose/core/optimizer/builder.py new file mode 100644 index 0000000..7d6accd --- /dev/null +++ b/mmpose/core/optimizer/builder.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.runner import build_optimizer +from mmcv.utils import Registry + +OPTIMIZERS = Registry('optimizers') + + +def build_optimizers(model, cfgs): + """Build multiple optimizers from configs. + + If `cfgs` contains several dicts for optimizers, then a dict for each + constructed optimizers will be returned. + If `cfgs` only contains one optimizer config, the constructed optimizer + itself will be returned. + + For example, + + 1) Multiple optimizer configs: + + .. code-block:: python + + optimizer_cfg = dict( + model1=dict(type='SGD', lr=lr), + model2=dict(type='SGD', lr=lr)) + + The return dict is + ``dict('model1': torch.optim.Optimizer, 'model2': torch.optim.Optimizer)`` + + 2) Single optimizer config: + + .. code-block:: python + + optimizer_cfg = dict(type='SGD', lr=lr) + + The return is ``torch.optim.Optimizer``. + + Args: + model (:obj:`nn.Module`): The model with parameters to be optimized. + cfgs (dict): The config dict of the optimizer. + + Returns: + dict[:obj:`torch.optim.Optimizer`] | :obj:`torch.optim.Optimizer`: + The initialized optimizers. 
+ """ + optimizers = {} + if hasattr(model, 'module'): + model = model.module + # determine whether 'cfgs' has several dicts for optimizers + if all(isinstance(v, dict) for v in cfgs.values()): + for key, cfg in cfgs.items(): + cfg_ = cfg.copy() + module = getattr(model, key) + optimizers[key] = build_optimizer(module, cfg_) + return optimizers + + return build_optimizer(model, cfgs) diff --git a/mmpose/core/post_processing/__init__.py b/mmpose/core/post_processing/__init__.py new file mode 100644 index 0000000..1ee6858 --- /dev/null +++ b/mmpose/core/post_processing/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .nms import oks_iou, oks_nms, soft_oks_nms +from .one_euro_filter import OneEuroFilter +from .post_transforms import (affine_transform, flip_back, fliplr_joints, + fliplr_regression, get_affine_transform, + get_warp_matrix, rotate_point, transform_preds, + warp_affine_joints) + +__all__ = [ + 'oks_nms', 'soft_oks_nms', 'affine_transform', 'rotate_point', 'flip_back', + 'fliplr_joints', 'fliplr_regression', 'transform_preds', + 'get_affine_transform', 'get_warp_matrix', 'warp_affine_joints', + 'OneEuroFilter', 'oks_iou' +] diff --git a/mmpose/core/post_processing/group.py b/mmpose/core/post_processing/group.py new file mode 100644 index 0000000..6235dbc --- /dev/null +++ b/mmpose/core/post_processing/group.py @@ -0,0 +1,410 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/princeton-vl/pose-ae-train/ +# Original licence: Copyright (c) 2017, umich-vl, under BSD 3-Clause License. +# ------------------------------------------------------------------------------ + +import numpy as np +import torch +from munkres import Munkres + +from mmpose.core.evaluation import post_dark_udp + + +def _py_max_match(scores): + """Apply munkres algorithm to get the best match. + + Args: + scores(np.ndarray): cost matrix. + + Returns: + np.ndarray: best match. + """ + m = Munkres() + tmp = m.compute(scores) + tmp = np.array(tmp).astype(int) + return tmp + + +def _match_by_tag(inp, params): + """Match joints by tags. Use Munkres algorithm to calculate the best match + for keypoints grouping. + + Note: + number of keypoints: K + max number of people in an image: M (M=30 by default) + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + inp(tuple): + tag_k (np.ndarray[KxMxL]): tag corresponding to the + top k values of feature map per keypoint. + loc_k (np.ndarray[KxMx2]): top k locations of the + feature maps for keypoint. + val_k (np.ndarray[KxM]): top k value of the + feature maps per keypoint. + params(Params): class Params(). + + Returns: + np.ndarray: result of pose groups. 
+ """ + assert isinstance(params, _Params), 'params should be class _Params()' + + tag_k, loc_k, val_k = inp + + default_ = np.zeros((params.num_joints, 3 + tag_k.shape[2]), + dtype=np.float32) + + joint_dict = {} + tag_dict = {} + for i in range(params.num_joints): + idx = params.joint_order[i] + + tags = tag_k[idx] + joints = np.concatenate((loc_k[idx], val_k[idx, :, None], tags), 1) + mask = joints[:, 2] > params.detection_threshold + tags = tags[mask] + joints = joints[mask] + + if joints.shape[0] == 0: + continue + + if i == 0 or len(joint_dict) == 0: + for tag, joint in zip(tags, joints): + key = tag[0] + joint_dict.setdefault(key, np.copy(default_))[idx] = joint + tag_dict[key] = [tag] + else: + grouped_keys = list(joint_dict.keys())[:params.max_num_people] + grouped_tags = [np.mean(tag_dict[i], axis=0) for i in grouped_keys] + + if (params.ignore_too_much + and len(grouped_keys) == params.max_num_people): + continue + + diff = joints[:, None, 3:] - np.array(grouped_tags)[None, :, :] + diff_normed = np.linalg.norm(diff, ord=2, axis=2) + diff_saved = np.copy(diff_normed) + + if params.use_detection_val: + diff_normed = np.round(diff_normed) * 100 - joints[:, 2:3] + + num_added = diff.shape[0] + num_grouped = diff.shape[1] + + if num_added > num_grouped: + diff_normed = np.concatenate( + (diff_normed, + np.zeros((num_added, num_added - num_grouped), + dtype=np.float32) + 1e10), + axis=1) + + pairs = _py_max_match(diff_normed) + for row, col in pairs: + if (row < num_added and col < num_grouped + and diff_saved[row][col] < params.tag_threshold): + key = grouped_keys[col] + joint_dict[key][idx] = joints[row] + tag_dict[key].append(tags[row]) + else: + key = tags[row][0] + joint_dict.setdefault(key, np.copy(default_))[idx] = \ + joints[row] + tag_dict[key] = [tags[row]] + + results = np.array([joint_dict[i] for i in joint_dict]).astype(np.float32) + return results + + +class _Params: + """A class of parameter. + + Args: + cfg(Config): config. + """ + + def __init__(self, cfg): + self.num_joints = cfg['num_joints'] + self.max_num_people = cfg['max_num_people'] + + self.detection_threshold = cfg['detection_threshold'] + self.tag_threshold = cfg['tag_threshold'] + self.use_detection_val = cfg['use_detection_val'] + self.ignore_too_much = cfg['ignore_too_much'] + + if self.num_joints == 17: + self.joint_order = [ + i - 1 for i in + [1, 2, 3, 4, 5, 6, 7, 12, 13, 8, 9, 10, 11, 14, 15, 16, 17] + ] + else: + self.joint_order = list(np.arange(self.num_joints)) + + +class HeatmapParser: + """The heatmap parser for post processing.""" + + def __init__(self, cfg): + self.params = _Params(cfg) + self.tag_per_joint = cfg['tag_per_joint'] + self.pool = torch.nn.MaxPool2d(cfg['nms_kernel'], 1, + cfg['nms_padding']) + self.use_udp = cfg.get('use_udp', False) + self.score_per_joint = cfg.get('score_per_joint', False) + + def nms(self, heatmaps): + """Non-Maximum Suppression for heatmaps. + + Args: + heatmap(torch.Tensor): Heatmaps before nms. + + Returns: + torch.Tensor: Heatmaps after nms. + """ + + maxm = self.pool(heatmaps) + maxm = torch.eq(maxm, heatmaps).float() + heatmaps = heatmaps * maxm + + return heatmaps + + def match(self, tag_k, loc_k, val_k): + """Group keypoints to human poses in a batch. + + Args: + tag_k (np.ndarray[NxKxMxL]): tag corresponding to the + top k values of feature map per keypoint. + loc_k (np.ndarray[NxKxMx2]): top k locations of the + feature maps for keypoint. + val_k (np.ndarray[NxKxM]): top k value of the + feature maps per keypoint. 
+ + Returns: + list + """ + + def _match(x): + return _match_by_tag(x, self.params) + + return list(map(_match, zip(tag_k, loc_k, val_k))) + + def top_k(self, heatmaps, tags): + """Find top_k values in an image. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + max number of people: M + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + heatmaps (torch.Tensor[NxKxHxW]) + tags (torch.Tensor[NxKxHxWxL]) + + Returns: + dict: A dict containing top_k values. + + - tag_k (np.ndarray[NxKxMxL]): + tag corresponding to the top k values of + feature map per keypoint. + - loc_k (np.ndarray[NxKxMx2]): + top k location of feature map per keypoint. + - val_k (np.ndarray[NxKxM]): + top k value of feature map per keypoint. + """ + heatmaps = self.nms(heatmaps) + N, K, H, W = heatmaps.size() + heatmaps = heatmaps.view(N, K, -1) + val_k, ind = heatmaps.topk(self.params.max_num_people, dim=2) + + tags = tags.view(tags.size(0), tags.size(1), W * H, -1) + if not self.tag_per_joint: + tags = tags.expand(-1, self.params.num_joints, -1, -1) + + tag_k = torch.stack( + [torch.gather(tags[..., i], 2, ind) for i in range(tags.size(3))], + dim=3) + + x = ind % W + y = ind // W + + ind_k = torch.stack((x, y), dim=3) + + results = { + 'tag_k': tag_k.cpu().numpy(), + 'loc_k': ind_k.cpu().numpy(), + 'val_k': val_k.cpu().numpy() + } + + return results + + @staticmethod + def adjust(results, heatmaps): + """Adjust the coordinates for better accuracy. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + + Args: + results (list(np.ndarray)): Keypoint predictions. + heatmaps (torch.Tensor[NxKxHxW]): Heatmaps. + """ + _, _, H, W = heatmaps.shape + for batch_id, people in enumerate(results): + for people_id, people_i in enumerate(people): + for joint_id, joint in enumerate(people_i): + if joint[2] > 0: + x, y = joint[0:2] + xx, yy = int(x), int(y) + tmp = heatmaps[batch_id][joint_id] + if tmp[min(H - 1, yy + 1), xx] > tmp[max(0, yy - 1), + xx]: + y += 0.25 + else: + y -= 0.25 + + if tmp[yy, min(W - 1, xx + 1)] > tmp[yy, + max(0, xx - 1)]: + x += 0.25 + else: + x -= 0.25 + results[batch_id][people_id, joint_id, + 0:2] = (x + 0.5, y + 0.5) + return results + + @staticmethod + def refine(heatmap, tag, keypoints, use_udp=False): + """Given initial keypoint predictions, we identify missing joints. + + Note: + number of keypoints: K + heatmap height: H + heatmap width: W + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + heatmap: np.ndarray(K, H, W). + tag: np.ndarray(K, H, W) | np.ndarray(K, H, W, L) + keypoints: np.ndarray of size (K, 3 + L) + last dim is (x, y, score, tag). + use_udp: bool-unbiased data processing + + Returns: + np.ndarray: The refined keypoints. 
+ """ + + K, H, W = heatmap.shape + if len(tag.shape) == 3: + tag = tag[..., None] + + tags = [] + for i in range(K): + if keypoints[i, 2] > 0: + # save tag value of detected keypoint + x, y = keypoints[i][:2].astype(int) + x = np.clip(x, 0, W - 1) + y = np.clip(y, 0, H - 1) + tags.append(tag[i, y, x]) + + # mean tag of current detected people + prev_tag = np.mean(tags, axis=0) + results = [] + + for _heatmap, _tag in zip(heatmap, tag): + # distance of all tag values with mean tag of + # current detected people + distance_tag = (((_tag - + prev_tag[None, None, :])**2).sum(axis=2)**0.5) + norm_heatmap = _heatmap - np.round(distance_tag) + + # find maximum position + y, x = np.unravel_index(np.argmax(norm_heatmap), _heatmap.shape) + xx = x.copy() + yy = y.copy() + # detection score at maximum position + val = _heatmap[y, x] + if not use_udp: + # offset by 0.5 + x += 0.5 + y += 0.5 + + # add a quarter offset + if _heatmap[yy, min(W - 1, xx + 1)] > _heatmap[yy, max(0, xx - 1)]: + x += 0.25 + else: + x -= 0.25 + + if _heatmap[min(H - 1, yy + 1), xx] > _heatmap[max(0, yy - 1), xx]: + y += 0.25 + else: + y -= 0.25 + + results.append((x, y, val)) + results = np.array(results) + + if results is not None: + for i in range(K): + # add keypoint if it is not detected + if results[i, 2] > 0 and keypoints[i, 2] == 0: + keypoints[i, :3] = results[i, :3] + + return keypoints + + def parse(self, heatmaps, tags, adjust=True, refine=True): + """Group keypoints into poses given heatmap and tag. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + heatmaps (torch.Tensor[NxKxHxW]): model output heatmaps. + tags (torch.Tensor[NxKxHxWxL]): model output tagmaps. + + Returns: + tuple: A tuple containing keypoint grouping results. + + - results (list(np.ndarray)): Pose results. + - scores (list/list(np.ndarray)): Score of people. + """ + results = self.match(**self.top_k(heatmaps, tags)) + + if adjust: + if self.use_udp: + for i in range(len(results)): + if results[i].shape[0] > 0: + results[i][..., :2] = post_dark_udp( + results[i][..., :2].copy(), heatmaps[i:i + 1, :]) + else: + results = self.adjust(results, heatmaps) + + if self.score_per_joint: + scores = [i[:, 2] for i in results[0]] + else: + scores = [i[:, 2].mean() for i in results[0]] + + if refine: + results = results[0] + # for every detected person + for i in range(len(results)): + heatmap_numpy = heatmaps[0].cpu().numpy() + tag_numpy = tags[0].cpu().numpy() + if not self.tag_per_joint: + tag_numpy = np.tile(tag_numpy, + (self.params.num_joints, 1, 1, 1)) + results[i] = self.refine( + heatmap_numpy, tag_numpy, results[i], use_udp=self.use_udp) + results = [results] + + return results, scores diff --git a/mmpose/core/post_processing/nms.py b/mmpose/core/post_processing/nms.py new file mode 100644 index 0000000..86a0ab3 --- /dev/null +++ b/mmpose/core/post_processing/nms.py @@ -0,0 +1,207 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch +# Original licence: Copyright (c) Microsoft, under the MIT License. +# ------------------------------------------------------------------------------ + +import numpy as np + + +def nms(dets, thr): + """Greedily select boxes with high confidence and overlap <= thr. + + Args: + dets: [[x1, y1, x2, y2, score]]. + thr: Retain overlap < thr. + + Returns: + list: Indexes to keep. 
+ """ + if len(dets) == 0: + return [] + + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while len(order) > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thr)[0] + order = order[inds + 1] + + return keep + + +def oks_iou(g, d, a_g, a_d, sigmas=None, vis_thr=None): + """Calculate oks ious. + + Args: + g: Ground truth keypoints. + d: Detected keypoints. + a_g: Area of the ground truth object. + a_d: Area of the detected object. + sigmas: standard deviation of keypoint labelling. + vis_thr: threshold of the keypoint visibility. + + Returns: + list: The oks ious. + """ + if sigmas is None: + sigmas = np.array([ + .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, + .87, .87, .89, .89 + ]) / 10.0 + vars = (sigmas * 2)**2 + xg = g[0::3] + yg = g[1::3] + vg = g[2::3] + ious = np.zeros(len(d), dtype=np.float32) + for n_d in range(0, len(d)): + xd = d[n_d, 0::3] + yd = d[n_d, 1::3] + vd = d[n_d, 2::3] + dx = xd - xg + dy = yd - yg + e = (dx**2 + dy**2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2 + if vis_thr is not None: + ind = list(vg > vis_thr) and list(vd > vis_thr) + e = e[ind] + ious[n_d] = np.sum(np.exp(-e)) / len(e) if len(e) != 0 else 0.0 + return ious + + +def oks_nms(kpts_db, thr, sigmas=None, vis_thr=None, score_per_joint=False): + """OKS NMS implementations. + + Args: + kpts_db: keypoints. + thr: Retain overlap < thr. + sigmas: standard deviation of keypoint labelling. + vis_thr: threshold of the keypoint visibility. + score_per_joint: the input scores (in kpts_db) are per joint scores + + Returns: + np.ndarray: indexes to keep. + """ + if len(kpts_db) == 0: + return [] + + if score_per_joint: + scores = np.array([k['score'].mean() for k in kpts_db]) + else: + scores = np.array([k['score'] for k in kpts_db]) + + kpts = np.array([k['keypoints'].flatten() for k in kpts_db]) + areas = np.array([k['area'] for k in kpts_db]) + + order = scores.argsort()[::-1] + + keep = [] + while len(order) > 0: + i = order[0] + keep.append(i) + + oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], + sigmas, vis_thr) + + inds = np.where(oks_ovr <= thr)[0] + order = order[inds + 1] + + keep = np.array(keep) + + return keep + + +def _rescore(overlap, scores, thr, type='gaussian'): + """Rescoring mechanism gaussian or linear. + + Args: + overlap: calculated ious + scores: target scores. + thr: retain oks overlap < thr. + type: 'gaussian' or 'linear' + + Returns: + np.ndarray: indexes to keep + """ + assert len(overlap) == len(scores) + assert type in ['gaussian', 'linear'] + + if type == 'linear': + inds = np.where(overlap >= thr)[0] + scores[inds] = scores[inds] * (1 - overlap[inds]) + else: + scores = scores * np.exp(-overlap**2 / thr) + + return scores + + +def soft_oks_nms(kpts_db, + thr, + max_dets=20, + sigmas=None, + vis_thr=None, + score_per_joint=False): + """Soft OKS NMS implementations. + + Args: + kpts_db + thr: retain oks overlap < thr. + max_dets: max number of detections to keep. + sigmas: Keypoint labelling uncertainty. 
+ score_per_joint: the input scores (in kpts_db) are per joint scores + + Returns: + np.ndarray: indexes to keep. + """ + if len(kpts_db) == 0: + return [] + + if score_per_joint: + scores = np.array([k['score'].mean() for k in kpts_db]) + else: + scores = np.array([k['score'] for k in kpts_db]) + + kpts = np.array([k['keypoints'].flatten() for k in kpts_db]) + areas = np.array([k['area'] for k in kpts_db]) + + order = scores.argsort()[::-1] + scores = scores[order] + + keep = np.zeros(max_dets, dtype=np.intp) + keep_cnt = 0 + while len(order) > 0 and keep_cnt < max_dets: + i = order[0] + + oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], + sigmas, vis_thr) + + order = order[1:] + scores = _rescore(oks_ovr, scores[1:], thr) + + tmp = scores.argsort()[::-1] + order = order[tmp] + scores = scores[tmp] + + keep[keep_cnt] = i + keep_cnt += 1 + + keep = keep[:keep_cnt] + + return keep diff --git a/mmpose/core/post_processing/one_euro_filter.py b/mmpose/core/post_processing/one_euro_filter.py new file mode 100644 index 0000000..01ffa5f --- /dev/null +++ b/mmpose/core/post_processing/one_euro_filter.py @@ -0,0 +1,102 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/HoBeom/OneEuroFilter-Numpy +# Original licence: Copyright (c) HoBeom Jeon, under the MIT License. +# ------------------------------------------------------------------------------ +from time import time + +import numpy as np + + +def smoothing_factor(t_e, cutoff): + r = 2 * np.pi * cutoff * t_e + return r / (r + 1) + + +def exponential_smoothing(a, x, x_prev): + return a * x + (1 - a) * x_prev + + +class OneEuroFilter: + + def __init__(self, + x0, + dx0=0.0, + min_cutoff=1.7, + beta=0.3, + d_cutoff=30.0, + fps=None): + """One Euro Filter for keypoints smoothing. + + Args: + x0 (np.ndarray[K, 2]): Initialize keypoints value + dx0 (float): 0.0 + min_cutoff (float): parameter for one euro filter + beta (float): parameter for one euro filter + d_cutoff (float): Input data FPS + fps (float): Video FPS for video inference + """ + + # The parameters. + self.data_shape = x0.shape + self.min_cutoff = np.full(x0.shape, min_cutoff) + self.beta = np.full(x0.shape, beta) + self.d_cutoff = np.full(x0.shape, d_cutoff) + # Previous values. + self.x_prev = x0.astype(np.float32) + self.dx_prev = np.full(x0.shape, dx0) + self.mask_prev = np.ma.masked_where(x0 <= 0, x0) + self.realtime = True + if fps is None: + # Using in realtime inference + self.t_e = None + self.skip_frame_factor = d_cutoff + else: + # fps using video inference + self.realtime = False + self.d_cutoff = np.full(x0.shape, float(fps)) + self.t_prev = time() + + def __call__(self, x, t_e=1.0): + """Compute the filtered signal. + + Hyper-parameters (cutoff, beta) are from `VNect + `__ . + + Realtime Camera fps (d_cutoff) default 30.0 + + Args: + x (np.ndarray[K, 2]): keypoints results in frame + t_e (Optional): video skip frame count for posetrack + evaluation + """ + assert x.shape == self.data_shape + + t = 0 + if self.realtime: + t = time() + t_e = (t - self.t_prev) * self.skip_frame_factor + t_e = np.full(x.shape, t_e) + + # missing keypoints mask + mask = np.ma.masked_where(x <= 0, x) + + # The filtered derivative of the signal. + a_d = smoothing_factor(t_e, self.d_cutoff) + dx = (x - self.x_prev) / t_e + dx_hat = exponential_smoothing(a_d, dx, self.dx_prev) + + # The filtered signal. 
+ cutoff = self.min_cutoff + self.beta * np.abs(dx_hat) + a = smoothing_factor(t_e, cutoff) + x_hat = exponential_smoothing(a, x, self.x_prev) + + # missing keypoints remove + np.copyto(x_hat, -10, where=mask.mask) + + # Memorize the previous values. + self.x_prev = x_hat + self.dx_prev = dx_hat + self.t_prev = t + self.mask_prev = mask + + return x_hat diff --git a/mmpose/core/post_processing/post_transforms.py b/mmpose/core/post_processing/post_transforms.py new file mode 100644 index 0000000..93063fb --- /dev/null +++ b/mmpose/core/post_processing/post_transforms.py @@ -0,0 +1,366 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch +# Original licence: Copyright (c) Microsoft, under the MIT License. +# ------------------------------------------------------------------------------ + +import math + +import cv2 +import numpy as np +import torch + + +def fliplr_joints(joints_3d, joints_3d_visible, img_width, flip_pairs): + """Flip human joints horizontally. + + Note: + - num_keypoints: K + + Args: + joints_3d (np.ndarray([K, 3])): Coordinates of keypoints. + joints_3d_visible (np.ndarray([K, 1])): Visibility of keypoints. + img_width (int): Image width. + flip_pairs (list[tuple]): Pairs of keypoints which are mirrored + (for example, left ear and right ear). + + Returns: + tuple: Flipped human joints. + + - joints_3d_flipped (np.ndarray([K, 3])): Flipped joints. + - joints_3d_visible_flipped (np.ndarray([K, 1])): Joint visibility. + """ + + assert len(joints_3d) == len(joints_3d_visible) + assert img_width > 0 + + joints_3d_flipped = joints_3d.copy() + joints_3d_visible_flipped = joints_3d_visible.copy() + + # Swap left-right parts + for left, right in flip_pairs: + joints_3d_flipped[left, :] = joints_3d[right, :] + joints_3d_flipped[right, :] = joints_3d[left, :] + + joints_3d_visible_flipped[left, :] = joints_3d_visible[right, :] + joints_3d_visible_flipped[right, :] = joints_3d_visible[left, :] + + # Flip horizontally + joints_3d_flipped[:, 0] = img_width - 1 - joints_3d_flipped[:, 0] + joints_3d_flipped = joints_3d_flipped * joints_3d_visible_flipped + + return joints_3d_flipped, joints_3d_visible_flipped + + +def fliplr_regression(regression, + flip_pairs, + center_mode='static', + center_x=0.5, + center_index=0): + """Flip human joints horizontally. + + Note: + - batch_size: N + - num_keypoint: K + + Args: + regression (np.ndarray([..., K, C])): Coordinates of keypoints, where K + is the joint number and C is the dimension. Example shapes are: + + - [N, K, C]: a batch of keypoints where N is the batch size. + - [N, T, K, C]: a batch of pose sequences, where T is the frame + number. + flip_pairs (list[tuple()]): Pairs of keypoints which are mirrored + (for example, left ear -- right ear). + center_mode (str): The mode to set the center location on the x-axis + to flip around. Options are: + + - static: use a static x value (see center_x also) + - root: use a root joint (see center_index also) + center_x (float): Set the x-axis location of the flip center. Only used + when center_mode=static. + center_index (int): Set the index of the root joint, whose x location + will be used as the flip center. Only used when center_mode=root. + + Returns: + np.ndarray([..., K, C]): Flipped joints. 
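A sketch of horizontal flipping with COCO-style flip pairs; only three pairs are listed here for brevity, a real dataset config enumerates all of them:

    import numpy as np
    from mmpose.core.post_processing import fliplr_joints

    flip_pairs = [(1, 2), (3, 4), (5, 6)]        # e.g. left/right eye, ear, shoulder
    joints = np.zeros((17, 3), dtype=np.float32)
    joints[1] = [40., 50., 0.]                   # left-eye joint at x = 40
    vis = np.ones((17, 1), dtype=np.float32)

    flipped, vis_flipped = fliplr_joints(joints, vis, img_width=192,
                                         flip_pairs=flip_pairs)
    print(flipped[2, 0])                         # 151.0: mirrored into the right-eye slot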
+ """ + assert regression.ndim >= 2, f'Invalid pose shape {regression.shape}' + + allowed_center_mode = {'static', 'root'} + assert center_mode in allowed_center_mode, 'Get invalid center_mode ' \ + f'{center_mode}, allowed choices are {allowed_center_mode}' + + if center_mode == 'static': + x_c = center_x + elif center_mode == 'root': + assert regression.shape[-2] > center_index + x_c = regression[..., center_index:center_index + 1, 0] + + regression_flipped = regression.copy() + # Swap left-right parts + for left, right in flip_pairs: + regression_flipped[..., left, :] = regression[..., right, :] + regression_flipped[..., right, :] = regression[..., left, :] + + # Flip horizontally + regression_flipped[..., 0] = x_c * 2 - regression_flipped[..., 0] + return regression_flipped + + +def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'): + """Flip the flipped heatmaps back to the original form. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + output_flipped (np.ndarray[N, K, H, W]): The output heatmaps obtained + from the flipped images. + flip_pairs (list[tuple()): Pairs of keypoints which are mirrored + (for example, left ear -- right ear). + target_type (str): GaussianHeatmap or CombinedTarget + + Returns: + np.ndarray: heatmaps that flipped back to the original image + """ + assert output_flipped.ndim == 4, \ + 'output_flipped should be [batch_size, num_keypoints, height, width]' + shape_ori = output_flipped.shape + channels = 1 + if target_type.lower() == 'CombinedTarget'.lower(): + channels = 3 + output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...] + output_flipped = output_flipped.reshape(shape_ori[0], -1, channels, + shape_ori[2], shape_ori[3]) + output_flipped_back = output_flipped.copy() + + # Swap left-right parts + for left, right in flip_pairs: + output_flipped_back[:, left, ...] = output_flipped[:, right, ...] + output_flipped_back[:, right, ...] = output_flipped[:, left, ...] + output_flipped_back = output_flipped_back.reshape(shape_ori) + # Flip horizontally + output_flipped_back = output_flipped_back[..., ::-1] + return output_flipped_back + + +def transform_preds(coords, center, scale, output_size, use_udp=False): + """Get final keypoint predictions from heatmaps and apply scaling and + translation to map them back to the image. + + Note: + num_keypoints: K + + Args: + coords (np.ndarray[K, ndims]): + + * If ndims=2, corrds are predicted keypoint location. + * If ndims=4, corrds are composed of (x, y, scores, tags) + * If ndims=5, corrds are composed of (x, y, scores, tags, + flipped_tags) + + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + output_size (np.ndarray[2, ] | list(2,)): Size of the + destination heatmaps. + use_udp (bool): Use unbiased data processing + + Returns: + np.ndarray: Predicted coordinates in the images. + """ + assert coords.shape[1] in (2, 4, 5) + assert len(center) == 2 + assert len(scale) == 2 + assert len(output_size) == 2 + + # Recover the scale which is normalized by a factor of 200. 
+ scale = scale * 200.0 + + if use_udp: + scale_x = scale[0] / (output_size[0] - 1.0) + scale_y = scale[1] / (output_size[1] - 1.0) + else: + scale_x = scale[0] / output_size[0] + scale_y = scale[1] / output_size[1] + + target_coords = np.ones_like(coords) + target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5 + target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5 + + return target_coords + + +def get_affine_transform(center, + scale, + rot, + output_size, + shift=(0., 0.), + inv=False): + """Get the affine transform matrix, given the center/scale/rot/output_size. + + Args: + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + rot (float): Rotation angle (degree). + output_size (np.ndarray[2, ] | list(2,)): Size of the + destination heatmaps. + shift (0-100%): Shift translation ratio wrt the width/height. + Default (0., 0.). + inv (bool): Option to inverse the affine transform direction. + (inv=False: src->dst or inv=True: dst->src) + + Returns: + np.ndarray: The transform matrix. + """ + assert len(center) == 2 + assert len(scale) == 2 + assert len(output_size) == 2 + assert len(shift) == 2 + + # pixel_std is 200. + scale_tmp = scale * 200.0 + + shift = np.array(shift) + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = rotate_point([0., src_w * -0.5], rot_rad) + dst_dir = np.array([0., dst_w * -0.5]) + + src = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + src[2, :] = _get_3rd_point(src[0, :], src[1, :]) + + dst = np.zeros((3, 2), dtype=np.float32) + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +def affine_transform(pt, trans_mat): + """Apply an affine transformation to the points. + + Args: + pt (np.ndarray): a 2 dimensional point to be transformed + trans_mat (np.ndarray): 2x3 matrix of an affine transform + + Returns: + np.ndarray: Transformed points. + """ + assert len(pt) == 2 + new_pt = np.array(trans_mat) @ np.array([pt[0], pt[1], 1.]) + + return new_pt + + +def _get_3rd_point(a, b): + """To calculate the affine matrix, three pairs of points are required. This + function is used to get the 3rd point, given 2D points a & b. + + The 3rd point is defined by rotating vector `a - b` by 90 degrees + anticlockwise, using b as the rotation center. + + Args: + a (np.ndarray): point(x,y) + b (np.ndarray): point(x,y) + + Returns: + np.ndarray: The 3rd point. + """ + assert len(a) == 2 + assert len(b) == 2 + direction = a - b + third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32) + + return third_pt + + +def rotate_point(pt, angle_rad): + """Rotate a point by an angle. + + Args: + pt (list[float]): 2 dimensional point to be rotated + angle_rad (float): rotation angle by radian + + Returns: + list[float]: Rotated point. 
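A sketch of how the affine helpers are combined to crop a person box into model input space; the 256x192 input size mirrors the pose configs in this patch, while the image, center and scale values are arbitrary:

    import cv2
    import numpy as np
    from mmpose.core.post_processing import affine_transform, get_affine_transform

    img = np.zeros((480, 640, 3), dtype=np.uint8)
    center = np.array([320., 240.], dtype=np.float32)
    scale = np.array([1.0, 1.28], dtype=np.float32)    # bbox (w, h) / 200
    image_size = [192, 256]                            # model input (w, h)

    trans = get_affine_transform(center, scale, rot=0., output_size=image_size)
    crop = cv2.warpAffine(img, trans, (image_size[0], image_size[1]))
    print(crop.shape)                                  # (256, 192, 3)

    # the same matrix maps individual points, e.g. the bbox center
    print(affine_transform([320., 240.], trans))       # ~[ 96. 128.]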
+ """ + assert len(pt) == 2 + sn, cs = np.sin(angle_rad), np.cos(angle_rad) + new_x = pt[0] * cs - pt[1] * sn + new_y = pt[0] * sn + pt[1] * cs + rotated_pt = [new_x, new_y] + + return rotated_pt + + +def get_warp_matrix(theta, size_input, size_dst, size_target): + """Calculate the transformation matrix under the constraint of unbiased. + Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased + Data Processing for Human Pose Estimation (CVPR 2020). + + Args: + theta (float): Rotation angle in degrees. + size_input (np.ndarray): Size of input image [w, h]. + size_dst (np.ndarray): Size of output image [w, h]. + size_target (np.ndarray): Size of ROI in input plane [w, h]. + + Returns: + np.ndarray: A matrix for transformation. + """ + theta = np.deg2rad(theta) + matrix = np.zeros((2, 3), dtype=np.float32) + scale_x = size_dst[0] / size_target[0] + scale_y = size_dst[1] / size_target[1] + matrix[0, 0] = math.cos(theta) * scale_x + matrix[0, 1] = -math.sin(theta) * scale_x + matrix[0, 2] = scale_x * (-0.5 * size_input[0] * math.cos(theta) + + 0.5 * size_input[1] * math.sin(theta) + + 0.5 * size_target[0]) + matrix[1, 0] = math.sin(theta) * scale_y + matrix[1, 1] = math.cos(theta) * scale_y + matrix[1, 2] = scale_y * (-0.5 * size_input[0] * math.sin(theta) - + 0.5 * size_input[1] * math.cos(theta) + + 0.5 * size_target[1]) + return matrix + + +def warp_affine_joints(joints, mat): + """Apply affine transformation defined by the transform matrix on the + joints. + + Args: + joints (np.ndarray[..., 2]): Origin coordinate of joints. + mat (np.ndarray[3, 2]): The affine matrix. + + Returns: + np.ndarray[..., 2]: Result coordinate of joints. + """ + joints = np.array(joints) + shape = joints.shape + joints = joints.reshape(-1, 2) + return np.dot( + np.concatenate((joints, joints[:, 0:1] * 0 + 1), axis=1), + mat.T).reshape(shape) + + +def affine_transform_torch(pts, t): + npts = pts.shape[0] + pts_homo = torch.cat([pts, torch.ones(npts, 1, device=pts.device)], dim=1) + out = torch.mm(t, torch.t(pts_homo)) + return torch.t(out[:2, :]) diff --git a/mmpose/core/utils/__init__.py b/mmpose/core/utils/__init__.py new file mode 100644 index 0000000..bd6c027 --- /dev/null +++ b/mmpose/core/utils/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .dist_utils import allreduce_grads +from .regularizations import WeightNormClipHook + +__all__ = ['allreduce_grads', 'WeightNormClipHook'] diff --git a/mmpose/core/utils/dist_utils.py b/mmpose/core/utils/dist_utils.py new file mode 100644 index 0000000..e76e591 --- /dev/null +++ b/mmpose/core/utils/dist_utils.py @@ -0,0 +1,51 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from collections import OrderedDict + +import torch.distributed as dist +from torch._utils import (_flatten_dense_tensors, _take_tensors, + _unflatten_dense_tensors) + + +def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1): + """Allreduce parameters as a whole.""" + if bucket_size_mb > 0: + bucket_size_bytes = bucket_size_mb * 1024 * 1024 + buckets = _take_tensors(tensors, bucket_size_bytes) + else: + buckets = OrderedDict() + for tensor in tensors: + tp = tensor.type() + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(tensor) + buckets = buckets.values() + + for bucket in buckets: + flat_tensors = _flatten_dense_tensors(bucket) + dist.all_reduce(flat_tensors) + flat_tensors.div_(world_size) + for tensor, synced in zip( + bucket, _unflatten_dense_tensors(flat_tensors, bucket)): + tensor.copy_(synced) + + +def allreduce_grads(params, coalesce=True, bucket_size_mb=-1): + """Allreduce gradients. + + Args: + params (list[torch.Parameters]): List of parameters of a model + coalesce (bool, optional): Whether allreduce parameters as a whole. + Default: True. + bucket_size_mb (int, optional): Size of bucket, the unit is MB. + Default: -1. + """ + grads = [ + param.grad.data for param in params + if param.requires_grad and param.grad is not None + ] + world_size = dist.get_world_size() + if coalesce: + _allreduce_coalesced(grads, world_size, bucket_size_mb) + else: + for tensor in grads: + dist.all_reduce(tensor.div_(world_size)) diff --git a/mmpose/core/utils/regularizations.py b/mmpose/core/utils/regularizations.py new file mode 100644 index 0000000..d8c7449 --- /dev/null +++ b/mmpose/core/utils/regularizations.py @@ -0,0 +1,86 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod, abstractproperty + +import torch + + +class PytorchModuleHook(metaclass=ABCMeta): + """Base class for PyTorch module hook registers. + + An instance of a subclass of PytorchModuleHook can be used to + register hook to a pytorch module using the `register` method like: + hook_register.register(module) + + Subclasses should add/overwrite the following methods: + - __init__ + - hook + - hook_type + """ + + @abstractmethod + def hook(self, *args, **kwargs): + """Hook function.""" + + @abstractproperty + def hook_type(self) -> str: + """Hook type Subclasses should overwrite this function to return a + string value in. + + {`forward`, `forward_pre`, `backward`} + """ + + def register(self, module): + """Register the hook function to the module. + + Args: + module (pytorch module): the module to register the hook. + + Returns: + handle (torch.utils.hooks.RemovableHandle): a handle to remove + the hook by calling handle.remove() + """ + assert isinstance(module, torch.nn.Module) + + if self.hook_type == 'forward': + h = module.register_forward_hook(self.hook) + elif self.hook_type == 'forward_pre': + h = module.register_forward_pre_hook(self.hook) + elif self.hook_type == 'backward': + h = module.register_backward_hook(self.hook) + else: + raise ValueError(f'Invalid hook type {self.hook}') + + return h + + +class WeightNormClipHook(PytorchModuleHook): + """Apply weight norm clip regularization. + + The module's parameter will be clip to a given maximum norm before each + forward pass. + + Args: + max_norm (float): The maximum norm of the parameter. + module_param_names (str|list): The parameter name (or name list) to + apply weight norm clip. 
+ """ + + def __init__(self, max_norm=1.0, module_param_names='weight'): + self.module_param_names = module_param_names if isinstance( + module_param_names, list) else [module_param_names] + self.max_norm = max_norm + + @property + def hook_type(self): + return 'forward_pre' + + def hook(self, module, _input): + for name in self.module_param_names: + assert name in module._parameters, f'{name} is not a parameter' \ + f' of the module {type(module)}' + param = module._parameters[name] + + with torch.no_grad(): + m = param.norm().item() + if m > self.max_norm: + param.mul_(self.max_norm / (m + 1e-6)) diff --git a/mmpose/core/visualization/__init__.py b/mmpose/core/visualization/__init__.py new file mode 100644 index 0000000..9705494 --- /dev/null +++ b/mmpose/core/visualization/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .effects import apply_bugeye_effect, apply_sunglasses_effect +from .image import (imshow_bboxes, imshow_keypoints, imshow_keypoints_3d, + imshow_mesh_3d) + +__all__ = [ + 'imshow_keypoints', + 'imshow_keypoints_3d', + 'imshow_bboxes', + 'apply_bugeye_effect', + 'apply_sunglasses_effect', + 'imshow_mesh_3d', +] diff --git a/mmpose/core/visualization/effects.py b/mmpose/core/visualization/effects.py new file mode 100644 index 0000000..d3add7d --- /dev/null +++ b/mmpose/core/visualization/effects.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import cv2 +import numpy as np + + +def apply_bugeye_effect(img, + pose_results, + left_eye_index, + right_eye_index, + kpt_thr=0.5): + """Apply bug-eye effect. + + Args: + img (np.ndarray): Image data. + pose_results (list[dict]): The pose estimation results containing: + - "bbox" ([K, 4(or 5)]): detection bbox in + [x1, y1, x2, y2, (score)] + - "keypoints" ([K,3]): keypoint detection result in [x, y, score] + left_eye_index (int): Keypoint index of left eye + right_eye_index (int): Keypoint index of right eye + kpt_thr (float): The score threshold of required keypoints. + """ + + xx, yy = np.meshgrid(np.arange(img.shape[1]), np.arange(img.shape[0])) + xx = xx.astype(np.float32) + yy = yy.astype(np.float32) + + for pose in pose_results: + bbox = pose['bbox'] + kpts = pose['keypoints'] + + if kpts[left_eye_index, 2] < kpt_thr or kpts[right_eye_index, + 2] < kpt_thr: + continue + + kpt_leye = kpts[left_eye_index, :2] + kpt_reye = kpts[right_eye_index, :2] + for xc, yc in [kpt_leye, kpt_reye]: + + # distortion parameters + k1 = 0.001 + epe = 1e-5 + + scale = (bbox[2] - bbox[0])**2 + (bbox[3] - bbox[1])**2 + r2 = ((xx - xc)**2 + (yy - yc)**2) + r2 = (r2 + epe) / scale # normalized by bbox scale + + xx = (xx - xc) / (1 + k1 / r2) + xc + yy = (yy - yc) / (1 + k1 / r2) + yc + + img = cv2.remap( + img, + xx, + yy, + interpolation=cv2.INTER_AREA, + borderMode=cv2.BORDER_REPLICATE) + return img + + +def apply_sunglasses_effect(img, + pose_results, + sunglasses_img, + left_eye_index, + right_eye_index, + kpt_thr=0.5): + """Apply sunglasses effect. + + Args: + img (np.ndarray): Image data. + pose_results (list[dict]): The pose estimation results containing: + - "keypoints" ([K,3]): keypoint detection result in [x, y, score] + sunglasses_img (np.ndarray): Sunglasses image with white background. + left_eye_index (int): Keypoint index of left eye + right_eye_index (int): Keypoint index of right eye + kpt_thr (float): The score threshold of required keypoints. 
+ """ + + hm, wm = sunglasses_img.shape[:2] + # anchor points in the sunglasses mask + pts_src = np.array([[0.3 * wm, 0.3 * hm], [0.3 * wm, 0.7 * hm], + [0.7 * wm, 0.3 * hm], [0.7 * wm, 0.7 * hm]], + dtype=np.float32) + + for pose in pose_results: + kpts = pose['keypoints'] + + if kpts[left_eye_index, 2] < kpt_thr or kpts[right_eye_index, + 2] < kpt_thr: + continue + + kpt_leye = kpts[left_eye_index, :2] + kpt_reye = kpts[right_eye_index, :2] + # orthogonal vector to the left-to-right eyes + vo = 0.5 * (kpt_reye - kpt_leye)[::-1] * [-1, 1] + + # anchor points in the image by eye positions + pts_tar = np.vstack( + [kpt_reye + vo, kpt_reye - vo, kpt_leye + vo, kpt_leye - vo]) + + h_mat, _ = cv2.findHomography(pts_src, pts_tar) + patch = cv2.warpPerspective( + sunglasses_img, + h_mat, + dsize=(img.shape[1], img.shape[0]), + borderValue=(255, 255, 255)) + # mask the white background area in the patch with a threshold 200 + mask = cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY) + mask = (mask < 200).astype(np.uint8) + img = cv2.copyTo(patch, mask, img) + + return img diff --git a/mmpose/core/visualization/image.py b/mmpose/core/visualization/image.py new file mode 100644 index 0000000..8acd10b --- /dev/null +++ b/mmpose/core/visualization/image.py @@ -0,0 +1,442 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +import os +import warnings + +import cv2 +import mmcv +import numpy as np +from matplotlib import pyplot as plt +from mmcv.utils.misc import deprecated_api_warning +from mmcv.visualization.color import color_val + +try: + import trimesh + has_trimesh = True +except (ImportError, ModuleNotFoundError): + has_trimesh = False + +try: + os.environ['PYOPENGL_PLATFORM'] = 'osmesa' + import pyrender + has_pyrender = True +except (ImportError, ModuleNotFoundError): + has_pyrender = False + + +def imshow_bboxes(img, + bboxes, + labels=None, + colors='green', + text_color='white', + thickness=1, + font_scale=0.5, + show=True, + win_name='', + wait_time=0, + out_file=None): + """Draw bboxes with labels (optional) on an image. This is a wrapper of + mmcv.imshow_bboxes. + + Args: + img (str or ndarray): The image to be displayed. + bboxes (ndarray): ndarray of shape (k, 4), each row is a bbox in + format [x1, y1, x2, y2]. + labels (str or list[str], optional): labels of each bbox. + colors (list[str or tuple or :obj:`Color`]): A list of colors. + text_color (str or tuple or :obj:`Color`): Color of texts. + thickness (int): Thickness of lines. + font_scale (float): Font scales of texts. + show (bool): Whether to show the image. + win_name (str): The window name. + wait_time (int): Value of waitKey param. + out_file (str, optional): The filename to write the image. + + Returns: + ndarray: The image with bboxes drawn on it. 
+ """ + + # adapt to mmcv.imshow_bboxes input format + bboxes = np.split( + bboxes, bboxes.shape[0], axis=0) if bboxes.shape[0] > 0 else [] + if not isinstance(colors, list): + colors = [colors for _ in range(len(bboxes))] + colors = [mmcv.color_val(c) for c in colors] + assert len(bboxes) == len(colors) + + img = mmcv.imshow_bboxes( + img, + bboxes, + colors, + top_k=-1, + thickness=thickness, + show=False, + out_file=None) + + if labels is not None: + if not isinstance(labels, list): + labels = [labels for _ in range(len(bboxes))] + assert len(labels) == len(bboxes) + + for bbox, label, color in zip(bboxes, labels, colors): + if label is None: + continue + bbox_int = bbox[0, :4].astype(np.int32) + # roughly estimate the proper font size + text_size, text_baseline = cv2.getTextSize(label, + cv2.FONT_HERSHEY_DUPLEX, + font_scale, thickness) + text_x1 = bbox_int[0] + text_y1 = max(0, bbox_int[1] - text_size[1] - text_baseline) + text_x2 = bbox_int[0] + text_size[0] + text_y2 = text_y1 + text_size[1] + text_baseline + cv2.rectangle(img, (text_x1, text_y1), (text_x2, text_y2), color, + cv2.FILLED) + cv2.putText(img, label, (text_x1, text_y2 - text_baseline), + cv2.FONT_HERSHEY_DUPLEX, font_scale, + mmcv.color_val(text_color), thickness) + + if show: + mmcv.imshow(img, win_name, wait_time) + if out_file is not None: + mmcv.imwrite(img, out_file) + return img + + +@deprecated_api_warning({'pose_limb_color': 'pose_link_color'}) +def imshow_keypoints(img, + pose_result, + skeleton=None, + kpt_score_thr=0.3, + pose_kpt_color=None, + pose_link_color=None, + radius=4, + thickness=1, + show_keypoint_weight=False): + """Draw keypoints and links on an image. + + Args: + img (str or Tensor): The image to draw poses on. If an image array + is given, id will be modified in-place. + pose_result (list[kpts]): The poses to draw. Each element kpts is + a set of K keypoints as an Kx3 numpy.ndarray, where each + keypoint is represented as x, y, score. + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. If None, + the keypoint will not be drawn. + pose_link_color (np.array[Mx3]): Color of M links. If None, the + links will not be drawn. + thickness (int): Thickness of lines. 
+ """ + + img = mmcv.imread(img) + img_h, img_w, _ = img.shape + + for kpts in pose_result: + + kpts = np.array(kpts, copy=False) + + # draw each point on image + if pose_kpt_color is not None: + assert len(pose_kpt_color) == len(kpts) + for kid, kpt in enumerate(kpts): + x_coord, y_coord, kpt_score = int(kpt[0]), int(kpt[1]), kpt[2] + if kpt_score > kpt_score_thr: + color = tuple(int(c) for c in pose_kpt_color[kid]) + if show_keypoint_weight: + img_copy = img.copy() + cv2.circle(img_copy, (int(x_coord), int(y_coord)), + radius, color, -1) + transparency = max(0, min(1, kpt_score)) + cv2.addWeighted( + img_copy, + transparency, + img, + 1 - transparency, + 0, + dst=img) + else: + cv2.circle(img, (int(x_coord), int(y_coord)), radius, + color, -1) + + # draw links + if skeleton is not None and pose_link_color is not None: + assert len(pose_link_color) == len(skeleton) + for sk_id, sk in enumerate(skeleton): + pos1 = (int(kpts[sk[0], 0]), int(kpts[sk[0], 1])) + pos2 = (int(kpts[sk[1], 0]), int(kpts[sk[1], 1])) + if (pos1[0] > 0 and pos1[0] < img_w and pos1[1] > 0 + and pos1[1] < img_h and pos2[0] > 0 and pos2[0] < img_w + and pos2[1] > 0 and pos2[1] < img_h + and kpts[sk[0], 2] > kpt_score_thr + and kpts[sk[1], 2] > kpt_score_thr): + color = tuple(int(c) for c in pose_link_color[sk_id]) + if show_keypoint_weight: + img_copy = img.copy() + X = (pos1[0], pos2[0]) + Y = (pos1[1], pos2[1]) + mX = np.mean(X) + mY = np.mean(Y) + length = ((Y[0] - Y[1])**2 + (X[0] - X[1])**2)**0.5 + angle = math.degrees( + math.atan2(Y[0] - Y[1], X[0] - X[1])) + stickwidth = 2 + polygon = cv2.ellipse2Poly( + (int(mX), int(mY)), + (int(length / 2), int(stickwidth)), int(angle), 0, + 360, 1) + cv2.fillConvexPoly(img_copy, polygon, color) + transparency = max( + 0, min(1, 0.5 * (kpts[sk[0], 2] + kpts[sk[1], 2]))) + cv2.addWeighted( + img_copy, + transparency, + img, + 1 - transparency, + 0, + dst=img) + else: + cv2.line(img, pos1, pos2, color, thickness=thickness) + + return img + + +def imshow_keypoints_3d( + pose_result, + img=None, + skeleton=None, + pose_kpt_color=None, + pose_link_color=None, + vis_height=400, + kpt_score_thr=0.3, + num_instances=-1, + *, + axis_azimuth=70, + axis_limit=1.7, + axis_dist=10.0, + axis_elev=15.0, +): + """Draw 3D keypoints and links in 3D coordinates. + + Args: + pose_result (list[dict]): 3D pose results containing: + - "keypoints_3d" ([K,4]): 3D keypoints + - "title" (str): Optional. A string to specify the title of the + visualization of this pose result + img (str|np.ndarray): Opptional. The image or image path to show input + image and/or 2D pose. Note that the image should be given in BGR + channel order. + skeleton (list of [idx_i,idx_j]): Skeleton described by a list of + links, each is a pair of joint indices. + pose_kpt_color (np.ndarray[Nx3]`): Color of N keypoints. If None, do + not nddraw keypoints. + pose_link_color (np.array[Mx3]): Color of M links. If None, do not + draw links. + vis_height (int): The image height of the visualization. The width + will be N*vis_height depending on the number of visualized + items. + kpt_score_thr (float): Minimum score of keypoints to be shown. + Default: 0.3. + num_instances (int): Number of instances to be shown in 3D. If smaller + than 0, all the instances in the pose_result will be shown. + Otherwise, pad or truncate the pose_result to a length of + num_instances. + axis_azimuth (float): axis azimuth angle for 3D visualizations. + axis_dist (float): axis distance for 3D visualizations. 
+ axis_elev (float): axis elevation view angle for 3D visualizations. + axis_limit (float): The axis limit to visualize 3d pose. The xyz + range will be set as: + - x: [x_c - axis_limit/2, x_c + axis_limit/2] + - y: [y_c - axis_limit/2, y_c + axis_limit/2] + - z: [0, axis_limit] + Where x_c, y_c is the mean value of x and y coordinates + figsize: (float): figure size in inch. + """ + + show_img = img is not None + if num_instances < 0: + num_instances = len(pose_result) + else: + if len(pose_result) > num_instances: + pose_result = pose_result[:num_instances] + elif len(pose_result) < num_instances: + pose_result += [dict()] * (num_instances - len(pose_result)) + num_axis = num_instances + 1 if show_img else num_instances + + plt.ioff() + fig = plt.figure(figsize=(vis_height * num_axis * 0.01, vis_height * 0.01)) + + if show_img: + img = mmcv.imread(img, channel_order='bgr') + img = mmcv.bgr2rgb(img) + img = mmcv.imrescale(img, scale=vis_height / img.shape[0]) + + ax_img = fig.add_subplot(1, num_axis, 1) + ax_img.get_xaxis().set_visible(False) + ax_img.get_yaxis().set_visible(False) + ax_img.set_axis_off() + ax_img.set_title('Input') + ax_img.imshow(img, aspect='equal') + + for idx, res in enumerate(pose_result): + dummy = len(res) == 0 + kpts = np.zeros((1, 3)) if dummy else res['keypoints_3d'] + if kpts.shape[1] == 3: + kpts = np.concatenate([kpts, np.ones((kpts.shape[0], 1))], axis=1) + valid = kpts[:, 3] >= kpt_score_thr + + ax_idx = idx + 2 if show_img else idx + 1 + ax = fig.add_subplot(1, num_axis, ax_idx, projection='3d') + ax.view_init( + elev=axis_elev, + azim=axis_azimuth, + ) + x_c = np.mean(kpts[valid, 0]) if sum(valid) > 0 else 0 + y_c = np.mean(kpts[valid, 1]) if sum(valid) > 0 else 0 + ax.set_xlim3d([x_c - axis_limit / 2, x_c + axis_limit / 2]) + ax.set_ylim3d([y_c - axis_limit / 2, y_c + axis_limit / 2]) + ax.set_zlim3d([0, axis_limit]) + ax.set_aspect('auto') + ax.set_xticks([]) + ax.set_yticks([]) + ax.set_zticks([]) + ax.set_xticklabels([]) + ax.set_yticklabels([]) + ax.set_zticklabels([]) + ax.dist = axis_dist + + if not dummy and pose_kpt_color is not None: + pose_kpt_color = np.array(pose_kpt_color) + assert len(pose_kpt_color) == len(kpts) + x_3d, y_3d, z_3d = np.split(kpts[:, :3], [1, 2], axis=1) + # matplotlib uses RGB color in [0, 1] value range + _color = pose_kpt_color[..., ::-1] / 255. + ax.scatter( + x_3d[valid], + y_3d[valid], + z_3d[valid], + marker='o', + color=_color[valid], + ) + + if not dummy and skeleton is not None and pose_link_color is not None: + pose_link_color = np.array(pose_link_color) + assert len(pose_link_color) == len(skeleton) + for link, link_color in zip(skeleton, pose_link_color): + link_indices = [_i for _i in link] + xs_3d = kpts[link_indices, 0] + ys_3d = kpts[link_indices, 1] + zs_3d = kpts[link_indices, 2] + kpt_score = kpts[link_indices, 3] + if kpt_score.min() > kpt_score_thr: + # matplotlib uses RGB color in [0, 1] value range + _color = link_color[::-1] / 255. + ax.plot(xs_3d, ys_3d, zs_3d, color=_color, zdir='z') + + if 'title' in res: + ax.set_title(res['title']) + + # convert figure to numpy array + fig.tight_layout() + fig.canvas.draw() + img_w, img_h = fig.canvas.get_width_height() + img_vis = np.frombuffer( + fig.canvas.tostring_rgb(), dtype=np.uint8).reshape(img_h, img_w, -1) + img_vis = mmcv.rgb2bgr(img_vis) + + plt.close(fig) + + return img_vis + + +def imshow_mesh_3d(img, + vertices, + faces, + camera_center, + focal_length, + colors=(76, 76, 204)): + """Render 3D meshes on background image. 
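+
+    This function relies on the optional ``trimesh`` and ``pyrender``
+    packages; if either is unavailable, the input image is returned
+    unchanged with a warning.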
+ + Args: + img(np.ndarray): Background image. + vertices (list of np.ndarray): Vetrex coordinates in camera space. + faces (list of np.ndarray): Faces of meshes. + camera_center ([2]): Center pixel. + focal_length ([2]): Focal length of camera. + colors (list[str or tuple or Color]): A list of mesh colors. + """ + + H, W, C = img.shape + + if not has_pyrender: + warnings.warn('pyrender package is not installed.') + return img + + if not has_trimesh: + warnings.warn('trimesh package is not installed.') + return img + + try: + renderer = pyrender.OffscreenRenderer( + viewport_width=W, viewport_height=H) + except (ImportError, RuntimeError): + warnings.warn('pyrender package is not installed correctly.') + return img + + if not isinstance(colors, list): + colors = [colors for _ in range(len(vertices))] + colors = [color_val(c) for c in colors] + + depth_map = np.ones([H, W]) * np.inf + output_img = img + for idx in range(len(vertices)): + color = colors[idx] + color = [c / 255.0 for c in color] + color.append(1.0) + vert = vertices[idx] + face = faces[idx] + + material = pyrender.MetallicRoughnessMaterial( + metallicFactor=0.2, alphaMode='OPAQUE', baseColorFactor=color) + + mesh = trimesh.Trimesh(vert, face) + rot = trimesh.transformations.rotation_matrix( + np.radians(180), [1, 0, 0]) + mesh.apply_transform(rot) + mesh = pyrender.Mesh.from_trimesh(mesh, material=material) + + scene = pyrender.Scene(ambient_light=(0.5, 0.5, 0.5)) + scene.add(mesh, 'mesh') + + camera_pose = np.eye(4) + camera = pyrender.IntrinsicsCamera( + fx=focal_length[0], + fy=focal_length[1], + cx=camera_center[0], + cy=camera_center[1], + zfar=1e5) + scene.add(camera, pose=camera_pose) + + light = pyrender.DirectionalLight(color=[1.0, 1.0, 1.0], intensity=1) + light_pose = np.eye(4) + + light_pose[:3, 3] = np.array([0, -1, 1]) + scene.add(light, pose=light_pose) + + light_pose[:3, 3] = np.array([0, 1, 1]) + scene.add(light, pose=light_pose) + + light_pose[:3, 3] = np.array([1, 1, 2]) + scene.add(light, pose=light_pose) + + color, rend_depth = renderer.render( + scene, flags=pyrender.RenderFlags.RGBA) + + valid_mask = (rend_depth < depth_map) * (rend_depth > 0) + depth_map[valid_mask] = rend_depth[valid_mask] + valid_mask = valid_mask[:, :, None] + output_img = ( + valid_mask * color[:, :, :3] + (1 - valid_mask) * output_img) + + return output_img diff --git a/mmpose/datasets/__init__.py b/mmpose/datasets/__init__.py new file mode 100644 index 0000000..1b9e7cf --- /dev/null +++ b/mmpose/datasets/__init__.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .builder import DATASETS, PIPELINES, build_dataloader, build_dataset +from .dataset_info import DatasetInfo +from .pipelines import Compose +from .samplers import DistributedSampler + +from .datasets import ( # isort:skip + AnimalATRWDataset, AnimalFlyDataset, AnimalHorse10Dataset, + AnimalLocustDataset, AnimalMacaqueDataset, AnimalPoseDataset, + AnimalZebraDataset, Body3DH36MDataset, BottomUpAicDataset, + BottomUpCocoDataset, BottomUpCocoWholeBodyDataset, + BottomUpCrowdPoseDataset, BottomUpMhpDataset, DeepFashionDataset, + Face300WDataset, FaceAFLWDataset, FaceCocoWholeBodyDataset, + FaceCOFWDataset, FaceWFLWDataset, FreiHandDataset, + HandCocoWholeBodyDataset, InterHand2DDataset, InterHand3DDataset, + MeshAdversarialDataset, MeshH36MDataset, MeshMixDataset, MoshDataset, + OneHand10KDataset, PanopticDataset, TopDownAicDataset, TopDownCocoDataset, + TopDownCocoWholeBodyDataset, TopDownCrowdPoseDataset, + TopDownFreiHandDataset, TopDownH36MDataset, TopDownJhmdbDataset, + TopDownMhpDataset, TopDownMpiiDataset, TopDownMpiiTrbDataset, + TopDownOCHumanDataset, TopDownOneHand10KDataset, TopDownPanopticDataset, + TopDownPoseTrack18Dataset, TopDownPoseTrack18VideoDataset) + +__all__ = [ + 'TopDownCocoDataset', 'BottomUpCocoDataset', 'BottomUpMhpDataset', + 'BottomUpAicDataset', 'BottomUpCocoWholeBodyDataset', 'TopDownMpiiDataset', + 'TopDownMpiiTrbDataset', 'OneHand10KDataset', 'PanopticDataset', + 'HandCocoWholeBodyDataset', 'FreiHandDataset', 'InterHand2DDataset', + 'InterHand3DDataset', 'TopDownOCHumanDataset', 'TopDownAicDataset', + 'TopDownCocoWholeBodyDataset', 'MeshH36MDataset', 'MeshMixDataset', + 'MoshDataset', 'MeshAdversarialDataset', 'TopDownCrowdPoseDataset', + 'BottomUpCrowdPoseDataset', 'TopDownFreiHandDataset', + 'TopDownOneHand10KDataset', 'TopDownPanopticDataset', + 'TopDownPoseTrack18Dataset', 'TopDownJhmdbDataset', 'TopDownMhpDataset', + 'DeepFashionDataset', 'Face300WDataset', 'FaceAFLWDataset', + 'FaceWFLWDataset', 'FaceCOFWDataset', 'FaceCocoWholeBodyDataset', + 'Body3DH36MDataset', 'AnimalHorse10Dataset', 'AnimalMacaqueDataset', + 'AnimalFlyDataset', 'AnimalLocustDataset', 'AnimalZebraDataset', + 'AnimalATRWDataset', 'AnimalPoseDataset', 'TopDownH36MDataset', + 'TopDownPoseTrack18VideoDataset', 'build_dataloader', 'build_dataset', + 'Compose', 'DistributedSampler', 'DATASETS', 'PIPELINES', 'DatasetInfo' +] diff --git a/mmpose/datasets/builder.py b/mmpose/datasets/builder.py new file mode 100644 index 0000000..990ba85 --- /dev/null +++ b/mmpose/datasets/builder.py @@ -0,0 +1,162 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +import platform +import random +from functools import partial + +import numpy as np +from mmcv.parallel import collate +from mmcv.runner import get_dist_info +from mmcv.utils import Registry, build_from_cfg, is_seq_of +from mmcv.utils.parrots_wrapper import _get_dataloader +from torch.utils.data.dataset import ConcatDataset + +from .samplers import DistributedSampler + +if platform.system() != 'Windows': + # https://github.com/pytorch/pytorch/issues/973 + import resource + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + base_soft_limit = rlimit[0] + hard_limit = rlimit[1] + soft_limit = min(max(4096, base_soft_limit), hard_limit) + resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) + +DATASETS = Registry('dataset') +PIPELINES = Registry('pipeline') + + +def _concat_dataset(cfg, default_args=None): + types = cfg['type'] + ann_files = cfg['ann_file'] + img_prefixes = cfg.get('img_prefix', None) + dataset_infos = cfg.get('dataset_info', None) + + num_joints = cfg['data_cfg'].get('num_joints', None) + dataset_channel = cfg['data_cfg'].get('dataset_channel', None) + + datasets = [] + num_dset = len(ann_files) + for i in range(num_dset): + cfg_copy = copy.deepcopy(cfg) + cfg_copy['ann_file'] = ann_files[i] + + if isinstance(types, (list, tuple)): + cfg_copy['type'] = types[i] + if isinstance(img_prefixes, (list, tuple)): + cfg_copy['img_prefix'] = img_prefixes[i] + if isinstance(dataset_infos, (list, tuple)): + cfg_copy['dataset_info'] = dataset_infos[i] + + if isinstance(num_joints, (list, tuple)): + cfg_copy['data_cfg']['num_joints'] = num_joints[i] + + if is_seq_of(dataset_channel, list): + cfg_copy['data_cfg']['dataset_channel'] = dataset_channel[i] + + datasets.append(build_dataset(cfg_copy, default_args)) + + return ConcatDataset(datasets) + + +def build_dataset(cfg, default_args=None): + """Build a dataset from config dict. + + Args: + cfg (dict): Config dict. It should at least contain the key "type". + default_args (dict, optional): Default initialization arguments. + Default: None. + + Returns: + Dataset: The constructed dataset. + """ + from .dataset_wrappers import RepeatDataset + + if isinstance(cfg, (list, tuple)): + dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) + elif cfg['type'] == 'ConcatDataset': + dataset = ConcatDataset( + [build_dataset(c, default_args) for c in cfg['datasets']]) + elif cfg['type'] == 'RepeatDataset': + dataset = RepeatDataset( + build_dataset(cfg['dataset'], default_args), cfg['times']) + elif isinstance(cfg.get('ann_file'), (list, tuple)): + dataset = _concat_dataset(cfg, default_args) + else: + dataset = build_from_cfg(cfg, DATASETS, default_args) + return dataset + + +def build_dataloader(dataset, + samples_per_gpu, + workers_per_gpu, + num_gpus=1, + dist=True, + shuffle=True, + seed=None, + drop_last=True, + pin_memory=True, + **kwargs): + """Build PyTorch DataLoader. + + In distributed training, each GPU/process has a dataloader. + In non-distributed training, there is only one dataloader for all GPUs. + + Args: + dataset (Dataset): A PyTorch dataset. + samples_per_gpu (int): Number of training samples on each GPU, i.e., + batch size of each GPU. + workers_per_gpu (int): How many subprocesses to use for data loading + for each GPU. + num_gpus (int): Number of GPUs. Only used in non-distributed training. + dist (bool): Distributed training/test or not. Default: True. + shuffle (bool): Whether to shuffle the data at every epoch. + Default: True. 
+ drop_last (bool): Whether to drop the last incomplete batch in epoch. + Default: True + pin_memory (bool): Whether to use pin_memory in DataLoader. + Default: True + kwargs: any keyword argument to be used to initialize DataLoader + + Returns: + DataLoader: A PyTorch dataloader. + """ + rank, world_size = get_dist_info() + if dist: + sampler = DistributedSampler( + dataset, world_size, rank, shuffle=shuffle, seed=seed) + shuffle = False + batch_size = samples_per_gpu + num_workers = workers_per_gpu + else: + sampler = None + batch_size = num_gpus * samples_per_gpu + num_workers = num_gpus * workers_per_gpu + + init_fn = partial( + worker_init_fn, num_workers=num_workers, rank=rank, + seed=seed) if seed is not None else None + + _, DataLoader = _get_dataloader() + data_loader = DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + num_workers=num_workers, + collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), + pin_memory=pin_memory, + shuffle=shuffle, + worker_init_fn=init_fn, + drop_last=drop_last, + **kwargs) + + return data_loader + + +def worker_init_fn(worker_id, num_workers, rank, seed): + """Init the random seed for various workers.""" + # The seed of each worker equals to + # num_worker * rank + worker_id + user_seed + worker_seed = num_workers * rank + worker_id + seed + np.random.seed(worker_seed) + random.seed(worker_seed) diff --git a/mmpose/datasets/dataset_info.py b/mmpose/datasets/dataset_info.py new file mode 100644 index 0000000..ef0d62e --- /dev/null +++ b/mmpose/datasets/dataset_info.py @@ -0,0 +1,104 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + + +class DatasetInfo: + + def __init__(self, dataset_info): + self._dataset_info = dataset_info + self.dataset_name = self._dataset_info['dataset_name'] + self.paper_info = self._dataset_info['paper_info'] + self.keypoint_info = self._dataset_info['keypoint_info'] + self.skeleton_info = self._dataset_info['skeleton_info'] + self.joint_weights = np.array( + self._dataset_info['joint_weights'], dtype=np.float32)[:, None] + + self.sigmas = np.array(self._dataset_info['sigmas']) + + self._parse_keypoint_info() + self._parse_skeleton_info() + + def _parse_skeleton_info(self): + """Parse skeleton information. + + - link_num (int): number of links. + - skeleton (list((2,))): list of links (id). + - skeleton_name (list((2,))): list of links (name). + - pose_link_color (np.ndarray): the color of the link for + visualization. + """ + self.link_num = len(self.skeleton_info.keys()) + self.pose_link_color = [] + + self.skeleton_name = [] + self.skeleton = [] + for skid in self.skeleton_info.keys(): + link = self.skeleton_info[skid]['link'] + self.skeleton_name.append(link) + self.skeleton.append([ + self.keypoint_name2id[link[0]], self.keypoint_name2id[link[1]] + ]) + self.pose_link_color.append(self.skeleton_info[skid].get( + 'color', [255, 128, 0])) + self.pose_link_color = np.array(self.pose_link_color) + + def _parse_keypoint_info(self): + """Parse keypoint information. + + - keypoint_num (int): number of keypoints. + - keypoint_id2name (dict): mapping keypoint id to keypoint name. + - keypoint_name2id (dict): mapping keypoint name to keypoint id. + - upper_body_ids (list): a list of keypoints that belong to the + upper body. + - lower_body_ids (list): a list of keypoints that belong to the + lower body. 
+ - flip_index (list): list of flip index (id) + - flip_pairs (list((2,))): list of flip pairs (id) + - flip_index_name (list): list of flip index (name) + - flip_pairs_name (list((2,))): list of flip pairs (name) + - pose_kpt_color (np.ndarray): the color of the keypoint for + visualization. + """ + + self.keypoint_num = len(self.keypoint_info.keys()) + self.keypoint_id2name = {} + self.keypoint_name2id = {} + + self.pose_kpt_color = [] + self.upper_body_ids = [] + self.lower_body_ids = [] + + self.flip_index_name = [] + self.flip_pairs_name = [] + + for kid in self.keypoint_info.keys(): + + keypoint_name = self.keypoint_info[kid]['name'] + self.keypoint_id2name[kid] = keypoint_name + self.keypoint_name2id[keypoint_name] = kid + self.pose_kpt_color.append(self.keypoint_info[kid].get( + 'color', [255, 128, 0])) + + type = self.keypoint_info[kid].get('type', '') + if type == 'upper': + self.upper_body_ids.append(kid) + elif type == 'lower': + self.lower_body_ids.append(kid) + else: + pass + + swap_keypoint = self.keypoint_info[kid].get('swap', '') + if swap_keypoint == keypoint_name or swap_keypoint == '': + self.flip_index_name.append(keypoint_name) + else: + self.flip_index_name.append(swap_keypoint) + if [swap_keypoint, keypoint_name] not in self.flip_pairs_name: + self.flip_pairs_name.append([keypoint_name, swap_keypoint]) + + self.flip_pairs = [[ + self.keypoint_name2id[pair[0]], self.keypoint_name2id[pair[1]] + ] for pair in self.flip_pairs_name] + self.flip_index = [ + self.keypoint_name2id[name] for name in self.flip_index_name + ] + self.pose_kpt_color = np.array(self.pose_kpt_color) diff --git a/mmpose/datasets/dataset_wrappers.py b/mmpose/datasets/dataset_wrappers.py new file mode 100644 index 0000000..aaaa173 --- /dev/null +++ b/mmpose/datasets/dataset_wrappers.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .builder import DATASETS + + +@DATASETS.register_module() +class RepeatDataset: + """A wrapper of repeated dataset. + + The length of repeated dataset will be `times` larger than the original + dataset. This is useful when the data loading time is long but the dataset + is small. Using RepeatDataset can reduce the data loading time between + epochs. + + Args: + dataset (:obj:`Dataset`): The dataset to be repeated. + times (int): Repeat times. + """ + + def __init__(self, dataset, times): + self.dataset = dataset + self.times = times + + self._ori_len = len(self.dataset) + + def __getitem__(self, idx): + """Get data.""" + return self.dataset[idx % self._ori_len] + + def __len__(self): + """Length after repetition.""" + return self.times * self._ori_len diff --git a/mmpose/datasets/datasets/__init__.py b/mmpose/datasets/datasets/__init__.py new file mode 100644 index 0000000..f3839e5 --- /dev/null +++ b/mmpose/datasets/datasets/__init__.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from ...deprecated import (TopDownFreiHandDataset, TopDownOneHand10KDataset, + TopDownPanopticDataset) +from .animal import (AnimalATRWDataset, AnimalFlyDataset, AnimalHorse10Dataset, + AnimalLocustDataset, AnimalMacaqueDataset, + AnimalPoseDataset, AnimalZebraDataset) +from .body3d import Body3DH36MDataset, Body3DMviewDirectPanopticDataset +from .bottom_up import (BottomUpAicDataset, BottomUpCocoDataset, + BottomUpCocoWholeBodyDataset, BottomUpCrowdPoseDataset, + BottomUpMhpDataset) +from .face import (Face300WDataset, FaceAFLWDataset, FaceCocoWholeBodyDataset, + FaceCOFWDataset, FaceWFLWDataset) +from .fashion import DeepFashionDataset +from .hand import (FreiHandDataset, HandCocoWholeBodyDataset, + InterHand2DDataset, InterHand3DDataset, OneHand10KDataset, + PanopticDataset) +from .mesh import (MeshAdversarialDataset, MeshH36MDataset, MeshMixDataset, + MoshDataset) +from .top_down import (TopDownAicDataset, TopDownCocoDataset, + TopDownCocoWholeBodyDataset, TopDownCrowdPoseDataset, + TopDownH36MDataset, TopDownHalpeDataset, + TopDownJhmdbDataset, TopDownMhpDataset, + TopDownMpiiDataset, TopDownMpiiTrbDataset, + TopDownOCHumanDataset, TopDownPoseTrack18Dataset, + TopDownPoseTrack18VideoDataset) + +__all__ = [ + 'TopDownCocoDataset', 'BottomUpCocoDataset', 'BottomUpMhpDataset', + 'BottomUpAicDataset', 'BottomUpCocoWholeBodyDataset', 'TopDownMpiiDataset', + 'TopDownMpiiTrbDataset', 'OneHand10KDataset', 'PanopticDataset', + 'HandCocoWholeBodyDataset', 'FreiHandDataset', 'InterHand2DDataset', + 'InterHand3DDataset', 'TopDownOCHumanDataset', 'TopDownAicDataset', + 'TopDownCocoWholeBodyDataset', 'MeshH36MDataset', 'MeshMixDataset', + 'MoshDataset', 'MeshAdversarialDataset', 'TopDownCrowdPoseDataset', + 'BottomUpCrowdPoseDataset', 'TopDownFreiHandDataset', + 'TopDownOneHand10KDataset', 'TopDownPanopticDataset', + 'TopDownPoseTrack18Dataset', 'TopDownJhmdbDataset', 'TopDownMhpDataset', + 'DeepFashionDataset', 'Face300WDataset', 'FaceAFLWDataset', + 'FaceWFLWDataset', 'FaceCOFWDataset', 'FaceCocoWholeBodyDataset', + 'Body3DH36MDataset', 'AnimalHorse10Dataset', 'AnimalMacaqueDataset', + 'AnimalFlyDataset', 'AnimalLocustDataset', 'AnimalZebraDataset', + 'AnimalATRWDataset', 'AnimalPoseDataset', 'TopDownH36MDataset', + 'TopDownHalpeDataset', 'TopDownPoseTrack18VideoDataset', + 'Body3DMviewDirectPanopticDataset' +] diff --git a/mmpose/datasets/datasets/animal/__init__.py b/mmpose/datasets/datasets/animal/__init__.py new file mode 100644 index 0000000..185b935 --- /dev/null +++ b/mmpose/datasets/datasets/animal/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .animal_ap10k_dataset import AnimalAP10KDataset +from .animal_atrw_dataset import AnimalATRWDataset +from .animal_fly_dataset import AnimalFlyDataset +from .animal_horse10_dataset import AnimalHorse10Dataset +from .animal_locust_dataset import AnimalLocustDataset +from .animal_macaque_dataset import AnimalMacaqueDataset +from .animal_pose_dataset import AnimalPoseDataset +from .animal_zebra_dataset import AnimalZebraDataset + +__all__ = [ + 'AnimalHorse10Dataset', 'AnimalMacaqueDataset', 'AnimalFlyDataset', + 'AnimalLocustDataset', 'AnimalZebraDataset', 'AnimalATRWDataset', + 'AnimalPoseDataset', 'AnimalAP10KDataset' +] diff --git a/mmpose/datasets/datasets/animal/animal_ap10k_dataset.py b/mmpose/datasets/datasets/animal/animal_ap10k_dataset.py new file mode 100644 index 0000000..11a1e73 --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_ap10k_dataset.py @@ -0,0 +1,367 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning +from xtcocotools.cocoeval import COCOeval + +from ....core.post_processing import oks_nms, soft_oks_nms +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class AnimalAP10KDataset(Kpt2dSviewRgbImgTopDownDataset): + """AP-10K dataset for animal pose estimation. + + "AP-10K: A Benchmark for Animal Pose Estimation in the Wild" + Neurips Dataset Track'2021. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + AP-10K keypoint indexes:: + + 0: 'L_Eye', + 1: 'R_Eye', + 2: 'Nose', + 3: 'Neck', + 4: 'root of tail', + 5: 'L_Shoulder', + 6: 'L_Elbow', + 7: 'L_F_Paw', + 8: 'R_Shoulder', + 9: 'R_Elbow', + 10: 'R_F_Paw, + 11: 'L_Hip', + 12: 'L_Knee', + 13: 'L_B_Paw', + 14: 'R_Hip', + 15: 'R_Knee', + 16: 'R_B_Paw' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/ap10k.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.ann_info['use_different_joint_weights'] = False + self.db, self.id2Cat = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db, id2Cat = self._load_coco_keypoint_annotations() + return gt_db, id2Cat + + def _load_coco_keypoint_annotations(self): + """Ground truth bbox and keypoints.""" + gt_db, id2Cat = [], dict() + for img_id in self.img_ids: + db_tmp, id2Cat_tmp = self._load_coco_keypoint_annotation_kernel( + img_id) + gt_db.extend(db_tmp) + id2Cat.update({img_id: id2Cat_tmp}) + return gt_db, id2Cat + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. 
+ + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = self.ann_info['num_joints'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + id2Cat = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'bbox': obj['clean_bbox'][:4], + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + category = obj['category_id'] + id2Cat.append({ + 'image_file': image_file, + 'bbox_id': bbox_id, + 'category': category, + }) + bbox_id = bbox_id + 1 + + return rec, id2Cat + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mAP', **kwargs): + """Evaluate coco keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['data/coco/val2017\ + /000000393226.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap + - bbox_id (list(int)). + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'mAP'. + + Returns: + dict: Evaluation results for evaluation metric. 
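+
+        Example (illustrative sketch; assumes ``dataset`` is an already
+        constructed ``AnimalAP10KDataset`` and ``results`` is a list of
+        prediction dicts in the format described above)::
+
+            >>> name_value = dataset.evaluate(results, metric='mAP')
+            >>> print(name_value['AP'])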
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['mAP'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = defaultdict(list) + + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + cat = self.id2Cat[image_id][bbox_ids[i]]['category'] + kpts[image_id].append({ + 'keypoints': preds[i], + 'center': boxes[i][0:2], + 'scale': boxes[i][2:4], + 'area': boxes[i][4], + 'score': boxes[i][5], + 'image_id': image_id, + 'bbox_id': bbox_ids[i], + 'category': cat + }) + kpts = self._sort_and_unique_bboxes(kpts) + + # rescoring and oks nms + num_joints = self.ann_info['num_joints'] + vis_thr = self.vis_thr + oks_thr = self.oks_thr + valid_kpts = [] + for image_id in kpts.keys(): + img_kpts = kpts[image_id] + for n_p in img_kpts: + box_score = n_p['score'] + kpt_score = 0 + valid_num = 0 + for n_jt in range(0, num_joints): + t_s = n_p['keypoints'][n_jt][2] + if t_s > vis_thr: + kpt_score = kpt_score + t_s + valid_num = valid_num + 1 + if valid_num != 0: + kpt_score = kpt_score / valid_num + # rescoring + n_p['score'] = kpt_score * box_score + + if self.use_nms: + nms = soft_oks_nms if self.soft_nms else oks_nms + keep = nms(list(img_kpts), oks_thr, sigmas=self.sigmas) + valid_kpts.append([img_kpts[_keep] for _keep in keep]) + else: + valid_kpts.append(img_kpts) + + self._write_coco_keypoint_results(valid_kpts, res_file) + + info_str = self._do_python_keypoint_eval(res_file) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + def _write_coco_keypoint_results(self, keypoints, res_file): + """Write results into a json file.""" + data_pack = [{ + 'cat_id': self._class_to_coco_ind[cls], + 'cls_ind': cls_ind, + 'cls': cls, + 'ann_type': 'keypoints', + 'keypoints': keypoints + } for cls_ind, cls in enumerate(self.classes) + if not cls == '__background__'] + + results = self._coco_keypoint_results_one_category_kernel(data_pack[0]) + + with open(res_file, 'w') as f: + json.dump(results, f, sort_keys=True, indent=4) + + def _coco_keypoint_results_one_category_kernel(self, data_pack): + """Get coco keypoint results.""" + keypoints = data_pack['keypoints'] + cat_results = [] + + for img_kpts in keypoints: + if len(img_kpts) == 0: + continue + + _key_points = np.array( + [img_kpt['keypoints'] for img_kpt in img_kpts]) + key_points = _key_points.reshape(-1, + self.ann_info['num_joints'] * 3) + + result = [{ + 'image_id': img_kpt['image_id'], + 'category_id': img_kpt['category'], + 'keypoints': key_point.tolist(), + 'score': float(img_kpt['score']), + 'center': img_kpt['center'].tolist(), + 'scale': img_kpt['scale'].tolist() + } for img_kpt, key_point in zip(img_kpts, key_points)] + + cat_results.extend(result) + + return cat_results + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval(self.coco, coco_det, 'keypoints', self.sigmas) + coco_eval.params.useSegm = None + coco_eval.evaluate() + 
coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + for img_id, persons in kpts.items(): + num = len(persons) + kpts[img_id] = sorted(kpts[img_id], key=lambda x: x[key]) + for i in range(num - 1, 0, -1): + if kpts[img_id][i][key] == kpts[img_id][i - 1][key]: + del kpts[img_id][i] + + return kpts diff --git a/mmpose/datasets/datasets/animal/animal_atrw_dataset.py b/mmpose/datasets/datasets/animal/animal_atrw_dataset.py new file mode 100644 index 0000000..edfd3f9 --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_atrw_dataset.py @@ -0,0 +1,353 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning +from xtcocotools.cocoeval import COCOeval + +from ....core.post_processing import oks_nms, soft_oks_nms +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class AnimalATRWDataset(Kpt2dSviewRgbImgTopDownDataset): + """ATRW dataset for animal pose estimation. + + "ATRW: A Benchmark for Amur Tiger Re-identification in the Wild" + ACM MM'2020. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + ATRW keypoint indexes:: + + 0: "left_ear", + 1: "right_ear", + 2: "nose", + 3: "right_shoulder", + 4: "right_front_paw", + 5: "left_shoulder", + 6: "left_front_paw", + 7: "right_hip", + 8: "right_knee", + 9: "right_back_paw", + 10: "left_hip", + 11: "left_knee", + 12: "left_back_paw", + 13: "tail", + 14: "center" + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/atrw.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db = self._load_coco_keypoint_annotations() + return gt_db + + def _load_coco_keypoint_annotations(self): + """Ground truth bbox and keypoints.""" + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. + + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = self.ann_info['num_joints'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4], padding=1.0) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'bbox': obj['clean_bbox'][:4], + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + + return rec + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mAP', **kwargs): + """Evaluate coco keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. 
+ - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['data/coco/val2017\ + /000000393226.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap + - bbox_id (list(int)). + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'mAP'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['mAP'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = defaultdict(list) + + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + kpts[image_id].append({ + 'keypoints': preds[i], + 'center': boxes[i][0:2], + 'scale': boxes[i][2:4], + 'area': boxes[i][4], + 'score': boxes[i][5], + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + # rescoring and oks nms + num_joints = self.ann_info['num_joints'] + vis_thr = self.vis_thr + oks_thr = self.oks_thr + valid_kpts = [] + for image_id in kpts.keys(): + img_kpts = kpts[image_id] + for n_p in img_kpts: + box_score = n_p['score'] + kpt_score = 0 + valid_num = 0 + for n_jt in range(0, num_joints): + t_s = n_p['keypoints'][n_jt][2] + if t_s > vis_thr: + kpt_score = kpt_score + t_s + valid_num = valid_num + 1 + if valid_num != 0: + kpt_score = kpt_score / valid_num + # rescoring + n_p['score'] = kpt_score * box_score + + if self.use_nms: + nms = soft_oks_nms if self.soft_nms else oks_nms + keep = nms(list(img_kpts), oks_thr, sigmas=self.sigmas) + valid_kpts.append([img_kpts[_keep] for _keep in keep]) + else: + valid_kpts.append(img_kpts) + + self._write_coco_keypoint_results(valid_kpts, res_file) + + info_str = self._do_python_keypoint_eval(res_file) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + def _write_coco_keypoint_results(self, keypoints, res_file): + """Write results into a json file.""" + data_pack = [{ + 'cat_id': self._class_to_coco_ind[cls], + 'cls_ind': cls_ind, + 'cls': cls, + 'ann_type': 'keypoints', + 'keypoints': keypoints + } for cls_ind, cls in enumerate(self.classes) + if not cls == '__background__'] + + results = self._coco_keypoint_results_one_category_kernel(data_pack[0]) + + with open(res_file, 'w') as f: + json.dump(results, f, sort_keys=True, indent=4) + + def _coco_keypoint_results_one_category_kernel(self, data_pack): + """Get coco keypoint results.""" + cat_id = data_pack['cat_id'] + keypoints = data_pack['keypoints'] + cat_results = [] + + for img_kpts in keypoints: + if len(img_kpts) == 0: + continue + + _key_points = np.array( + [img_kpt['keypoints'] for img_kpt in img_kpts]) + key_points = _key_points.reshape(-1, + self.ann_info['num_joints'] * 3) + + result = [{ + 'image_id': img_kpt['image_id'], + 'category_id': cat_id, + 'keypoints': key_point.tolist(), + 'score': 
float(img_kpt['score']), + 'center': img_kpt['center'].tolist(), + 'scale': img_kpt['scale'].tolist() + } for img_kpt, key_point in zip(img_kpts, key_points)] + + cat_results.extend(result) + + return cat_results + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval(self.coco, coco_det, 'keypoints', self.sigmas) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + for img_id, persons in kpts.items(): + num = len(persons) + kpts[img_id] = sorted(kpts[img_id], key=lambda x: x[key]) + for i in range(num - 1, 0, -1): + if kpts[img_id][i][key] == kpts[img_id][i - 1][key]: + del kpts[img_id][i] + + return kpts diff --git a/mmpose/datasets/datasets/animal/animal_base_dataset.py b/mmpose/datasets/datasets/animal/animal_base_dataset.py new file mode 100644 index 0000000..e191882 --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_base_dataset.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + +from torch.utils.data import Dataset + + +class AnimalBaseDataset(Dataset, metaclass=ABCMeta): + """This class has been deprecated and replaced by + Kpt2dSviewRgbImgTopDownDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'AnimalBaseDataset has been replaced by ' + 'Kpt2dSviewRgbImgTopDownDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/663 for details.') + ) diff --git a/mmpose/datasets/datasets/animal/animal_fly_dataset.py b/mmpose/datasets/datasets/animal/animal_fly_dataset.py new file mode 100644 index 0000000..f414117 --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_fly_dataset.py @@ -0,0 +1,215 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class AnimalFlyDataset(Kpt2dSviewRgbImgTopDownDataset): + """AnimalFlyDataset for animal pose estimation. + + "Fast animal pose estimation using deep neural networks" + Nature methods'2019. More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Vinegar Fly keypoint indexes:: + + 0: "head", + 1: "eyeL", + 2: "eyeR", + 3: "neck", + 4: "thorax", + 5: "abdomen", + 6: "forelegR1", + 7: "forelegR2", + 8: "forelegR3", + 9: "forelegR4", + 10: "midlegR1", + 11: "midlegR2", + 12: "midlegR3", + 13: "midlegR4", + 14: "hindlegR1", + 15: "hindlegR2", + 16: "hindlegR3", + 17: "hindlegR4", + 18: "forelegL1", + 19: "forelegL2", + 20: "forelegL3", + 21: "forelegL4", + 22: "midlegL1", + 23: "midlegL2", + 24: "midlegL3", + 25: "midlegL4", + 26: "hindlegL1", + 27: "hindlegL2", + 28: "hindlegL3", + 29: "hindlegL4", + 30: "wingL", + 31: "wingR" + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. 
+ data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/fly.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # the ori image is 192x192 + center, scale = self._xywh2cs(0, 0, 192, 192, 0.8) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate Fly keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['Test/source/0.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + + res_folder (str): Path of directory to save the results. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
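+
+        Example (illustrative sketch; ``dataset`` is assumed to be an
+        already built ``AnimalFlyDataset`` and ``results`` a list of
+        prediction dicts as described above)::
+
+            >>> name_value = dataset.evaluate(results,
+            ...                               metric=['PCK', 'EPE'])
+            >>> print(name_value['PCK'])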
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/animal/animal_horse10_dataset.py b/mmpose/datasets/datasets/animal/animal_horse10_dataset.py new file mode 100644 index 0000000..d2bf198 --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_horse10_dataset.py @@ -0,0 +1,220 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class AnimalHorse10Dataset(Kpt2dSviewRgbImgTopDownDataset): + """AnimalHorse10Dataset for animal pose estimation. + + "Pretraining boosts out-of-domain robustness for pose estimation" + WACV'2021. More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Horse-10 keypoint indexes:: + + 0: 'Nose', + 1: 'Eye', + 2: 'Nearknee', + 3: 'Nearfrontfetlock', + 4: 'Nearfrontfoot', + 5: 'Offknee', + 6: 'Offfrontfetlock', + 7: 'Offfrontfoot', + 8: 'Shoulder', + 9: 'Midshoulder', + 10: 'Elbow', + 11: 'Girth', + 12: 'Wither', + 13: 'Nearhindhock', + 14: 'Nearhindfetlock', + 15: 'Nearhindfoot', + 16: 'Hip', + 17: 'Stifle', + 18: 'Offhindhock', + 19: 'Offhindfetlock', + 20: 'Offhindfoot', + 21: 'Ischium' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/horse10.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # use 1.25 padded bbox as input + center, scale = self._xywh2cs(*obj['bbox'][:4], 1.25) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + def _get_normalize_factor(self, gts): + """Get inter-ocular distance as the normalize factor, measured as the + Euclidean distance between the outer corners of the eyes. + + Args: + gts (np.ndarray[N, K, 2]): Groundtruth keypoint location. + + Returns: + np.ndarray[N, 2]: normalized factor + """ + + interocular = np.linalg.norm( + gts[:, 0, :] - gts[:, 1, :], axis=1, keepdims=True) + return np.tile(interocular, [1, 2]) + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate horse-10 keypoint results. The pose prediction results will + be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['Test/source/0.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'NME'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/animal/animal_locust_dataset.py b/mmpose/datasets/datasets/animal/animal_locust_dataset.py new file mode 100644 index 0000000..95fb6ac --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_locust_dataset.py @@ -0,0 +1,218 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class AnimalLocustDataset(Kpt2dSviewRgbImgTopDownDataset): + """AnimalLocustDataset for animal pose estimation. + + "DeepPoseKit, a software toolkit for fast and robust animal + pose estimation using deep learning" Elife'2019. + More details can be found in the paper. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Desert Locust keypoint indexes:: + + 0: "head", + 1: "neck", + 2: "thorax", + 3: "abdomen1", + 4: "abdomen2", + 5: "anttipL", + 6: "antbaseL", + 7: "eyeL", + 8: "forelegL1", + 9: "forelegL2", + 10: "forelegL3", + 11: "forelegL4", + 12: "midlegL1", + 13: "midlegL2", + 14: "midlegL3", + 15: "midlegL4", + 16: "hindlegL1", + 17: "hindlegL2", + 18: "hindlegL3", + 19: "hindlegL4", + 20: "anttipR", + 21: "antbaseR", + 22: "eyeR", + 23: "forelegR1", + 24: "forelegR2", + 25: "forelegR3", + 26: "forelegR4", + 27: "midlegR1", + 28: "midlegR2", + 29: "midlegR3", + 30: "midlegR4", + 31: "hindlegR1", + 32: "hindlegR2", + 33: "hindlegR3", + 34: "hindlegR4" + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/locust.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # the ori image is 160x160 + center, scale = self._xywh2cs(0, 0, 160, 160, 0.8) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate Fly keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['Test/source/0.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/animal/animal_macaque_dataset.py b/mmpose/datasets/datasets/animal/animal_macaque_dataset.py new file mode 100644 index 0000000..359feca --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_macaque_dataset.py @@ -0,0 +1,355 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning +from xtcocotools.cocoeval import COCOeval + +from ....core.post_processing import oks_nms, soft_oks_nms +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class AnimalMacaqueDataset(Kpt2dSviewRgbImgTopDownDataset): + """MacaquePose dataset for animal pose estimation. + + "MacaquePose: A novel ‘in the wild’ macaque monkey pose dataset + for markerless motion capture" bioRxiv'2020. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Macaque keypoint indexes:: + + 0: 'nose', + 1: 'left_eye', + 2: 'right_eye', + 3: 'left_ear', + 4: 'right_ear', + 5: 'left_shoulder', + 6: 'right_shoulder', + 7: 'left_elbow', + 8: 'right_elbow', + 9: 'left_wrist', + 10: 'right_wrist', + 11: 'left_hip', + 12: 'right_hip', + 13: 'left_knee', + 14: 'right_knee', + 15: 'left_ankle', + 16: 'right_ankle' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/macaque.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db = self._load_coco_keypoint_annotations() + return gt_db + + def _load_coco_keypoint_annotations(self): + """Ground truth bbox and keypoints.""" + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. + + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = self.ann_info['num_joints'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'bbox': obj['clean_bbox'][:4], + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + + return rec + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mAP', **kwargs): + """Evaluate coco keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + batch_size: N + num_keypoints: K + heatmap height: H + heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. 
+ - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['data/coco/val2017\ + /000000393226.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap + - bbox_id (list(int)). + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'mAP'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['mAP'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = defaultdict(list) + + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + kpts[image_id].append({ + 'keypoints': preds[i], + 'center': boxes[i][0:2], + 'scale': boxes[i][2:4], + 'area': boxes[i][4], + 'score': boxes[i][5], + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + # rescoring and oks nms + num_joints = self.ann_info['num_joints'] + vis_thr = self.vis_thr + oks_thr = self.oks_thr + valid_kpts = [] + for image_id in kpts.keys(): + img_kpts = kpts[image_id] + for n_p in img_kpts: + box_score = n_p['score'] + kpt_score = 0 + valid_num = 0 + for n_jt in range(0, num_joints): + t_s = n_p['keypoints'][n_jt][2] + if t_s > vis_thr: + kpt_score = kpt_score + t_s + valid_num = valid_num + 1 + if valid_num != 0: + kpt_score = kpt_score / valid_num + # rescoring + n_p['score'] = kpt_score * box_score + + if self.use_nms: + nms = soft_oks_nms if self.soft_nms else oks_nms + keep = nms(list(img_kpts), oks_thr, sigmas=self.sigmas) + valid_kpts.append([img_kpts[_keep] for _keep in keep]) + else: + valid_kpts.append(img_kpts) + + self._write_coco_keypoint_results(valid_kpts, res_file) + + info_str = self._do_python_keypoint_eval(res_file) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + def _write_coco_keypoint_results(self, keypoints, res_file): + """Write results into a json file.""" + data_pack = [{ + 'cat_id': self._class_to_coco_ind[cls], + 'cls_ind': cls_ind, + 'cls': cls, + 'ann_type': 'keypoints', + 'keypoints': keypoints + } for cls_ind, cls in enumerate(self.classes) + if not cls == '__background__'] + + results = self._coco_keypoint_results_one_category_kernel(data_pack[0]) + + with open(res_file, 'w') as f: + json.dump(results, f, sort_keys=True, indent=4) + + def _coco_keypoint_results_one_category_kernel(self, data_pack): + """Get coco keypoint results.""" + cat_id = data_pack['cat_id'] + keypoints = data_pack['keypoints'] + cat_results = [] + + for img_kpts in keypoints: + if len(img_kpts) == 0: + continue + + _key_points = np.array( + [img_kpt['keypoints'] for img_kpt in img_kpts]) + key_points = _key_points.reshape(-1, + self.ann_info['num_joints'] * 3) + + result = [{ + 'image_id': img_kpt['image_id'], + 'category_id': cat_id, + 'keypoints': key_point.tolist(), + 'score': 
float(img_kpt['score']), + 'center': img_kpt['center'].tolist(), + 'scale': img_kpt['scale'].tolist() + } for img_kpt, key_point in zip(img_kpts, key_points)] + + cat_results.extend(result) + + return cat_results + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval(self.coco, coco_det, 'keypoints', self.sigmas) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + for img_id, persons in kpts.items(): + num = len(persons) + kpts[img_id] = sorted(kpts[img_id], key=lambda x: x[key]) + for i in range(num - 1, 0, -1): + if kpts[img_id][i][key] == kpts[img_id][i - 1][key]: + del kpts[img_id][i] + + return kpts diff --git a/mmpose/datasets/datasets/animal/animal_pose_dataset.py b/mmpose/datasets/datasets/animal/animal_pose_dataset.py new file mode 100644 index 0000000..4ced570 --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_pose_dataset.py @@ -0,0 +1,359 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning +from xtcocotools.cocoeval import COCOeval + +from ....core.post_processing import oks_nms, soft_oks_nms +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class AnimalPoseDataset(Kpt2dSviewRgbImgTopDownDataset): + """Animal-Pose dataset for animal pose estimation. + + "Cross-domain Adaptation For Animal Pose Estimation" ICCV'2019 + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Animal-Pose keypoint indexes:: + + 0: 'L_Eye', + 1: 'R_Eye', + 2: 'L_EarBase', + 3: 'R_EarBase', + 4: 'Nose', + 5: 'Throat', + 6: 'TailBase', + 7: 'Withers', + 8: 'L_F_Elbow', + 9: 'R_F_Elbow', + 10: 'L_B_Elbow', + 11: 'R_B_Elbow', + 12: 'L_F_Knee', + 13: 'R_F_Knee', + 14: 'L_B_Knee', + 15: 'R_B_Knee', + 16: 'L_F_Paw', + 17: 'R_F_Paw', + 18: 'L_B_Paw', + 19: 'R_B_Paw' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/animalpose.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db = self._load_coco_keypoint_annotations() + return gt_db + + def _load_coco_keypoint_annotations(self): + """Ground truth bbox and keypoints.""" + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. + + Note: + bbox:[x1, y1, w, h] + + Args: + img_id: coco image id + + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = self.ann_info['num_joints'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'bbox': obj['clean_bbox'][:4], + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + + return rec + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mAP', **kwargs): + """Evaluate coco keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. 
+ - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['data/coco/val2017\ + /000000393226.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap + - bbox_id (list(int)). + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'mAP'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['mAP'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = defaultdict(list) + + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + kpts[image_id].append({ + 'keypoints': preds[i], + 'center': boxes[i][0:2], + 'scale': boxes[i][2:4], + 'area': boxes[i][4], + 'score': boxes[i][5], + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + # rescoring and oks nms + num_joints = self.ann_info['num_joints'] + vis_thr = self.vis_thr + oks_thr = self.oks_thr + valid_kpts = [] + for image_id in kpts.keys(): + img_kpts = kpts[image_id] + for n_p in img_kpts: + box_score = n_p['score'] + kpt_score = 0 + valid_num = 0 + for n_jt in range(0, num_joints): + t_s = n_p['keypoints'][n_jt][2] + if t_s > vis_thr: + kpt_score = kpt_score + t_s + valid_num = valid_num + 1 + if valid_num != 0: + kpt_score = kpt_score / valid_num + # rescoring + n_p['score'] = kpt_score * box_score + + if self.use_nms: + nms = soft_oks_nms if self.soft_nms else oks_nms + keep = nms(list(img_kpts), oks_thr, sigmas=self.sigmas) + valid_kpts.append([img_kpts[_keep] for _keep in keep]) + else: + valid_kpts.append(img_kpts) + + self._write_coco_keypoint_results(valid_kpts, res_file) + + info_str = self._do_python_keypoint_eval(res_file) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + def _write_coco_keypoint_results(self, keypoints, res_file): + """Write results into a json file.""" + data_pack = [{ + 'cat_id': self._class_to_coco_ind[cls], + 'cls_ind': cls_ind, + 'cls': cls, + 'ann_type': 'keypoints', + 'keypoints': keypoints + } for cls_ind, cls in enumerate(self.classes) + if not cls == '__background__'] + + results = self._coco_keypoint_results_one_category_kernel(data_pack[0]) + + with open(res_file, 'w') as f: + json.dump(results, f, sort_keys=True, indent=4) + + def _coco_keypoint_results_one_category_kernel(self, data_pack): + """Get coco keypoint results.""" + cat_id = data_pack['cat_id'] + keypoints = data_pack['keypoints'] + cat_results = [] + + for img_kpts in keypoints: + if len(img_kpts) == 0: + continue + + _key_points = np.array( + [img_kpt['keypoints'] for img_kpt in img_kpts]) + key_points = _key_points.reshape(-1, + self.ann_info['num_joints'] * 3) + + result = [{ + 'image_id': img_kpt['image_id'], + 'category_id': cat_id, + 'keypoints': key_point.tolist(), + 'score': 
float(img_kpt['score']), + 'center': img_kpt['center'].tolist(), + 'scale': img_kpt['scale'].tolist() + } for img_kpt, key_point in zip(img_kpts, key_points)] + + cat_results.extend(result) + + return cat_results + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval(self.coco, coco_det, 'keypoints', self.sigmas) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + for img_id, persons in kpts.items(): + num = len(persons) + kpts[img_id] = sorted(kpts[img_id], key=lambda x: x[key]) + for i in range(num - 1, 0, -1): + if kpts[img_id][i][key] == kpts[img_id][i - 1][key]: + del kpts[img_id][i] + + return kpts diff --git a/mmpose/datasets/datasets/animal/animal_zebra_dataset.py b/mmpose/datasets/datasets/animal/animal_zebra_dataset.py new file mode 100644 index 0000000..9c5e3b7 --- /dev/null +++ b/mmpose/datasets/datasets/animal/animal_zebra_dataset.py @@ -0,0 +1,193 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class AnimalZebraDataset(Kpt2dSviewRgbImgTopDownDataset): + """AnimalZebraDataset for animal pose estimation. + + "DeepPoseKit, a software toolkit for fast and robust animal + pose estimation using deep learning" Elife'2019. + More details can be found in the paper. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Desert Locust keypoint indexes:: + + 0: "snout", + 1: "head", + 2: "neck", + 3: "forelegL1", + 4: "forelegR1", + 5: "hindlegL1", + 6: "hindlegR1", + 7: "tailbase", + 8: "tailtip" + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/zebra.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # the ori image is 160x160 + center, scale = self._xywh2cs(0, 0, 160, 160, 0.8) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate Fly keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['Test/source/0.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/base/__init__.py b/mmpose/datasets/datasets/base/__init__.py new file mode 100644 index 0000000..e5f9a08 --- /dev/null +++ b/mmpose/datasets/datasets/base/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .kpt_2d_sview_rgb_img_bottom_up_dataset import \ + Kpt2dSviewRgbImgBottomUpDataset +from .kpt_2d_sview_rgb_img_top_down_dataset import \ + Kpt2dSviewRgbImgTopDownDataset +from .kpt_2d_sview_rgb_vid_top_down_dataset import \ + Kpt2dSviewRgbVidTopDownDataset +from .kpt_3d_mview_rgb_img_direct_dataset import Kpt3dMviewRgbImgDirectDataset +from .kpt_3d_sview_kpt_2d_dataset import Kpt3dSviewKpt2dDataset +from .kpt_3d_sview_rgb_img_top_down_dataset import \ + Kpt3dSviewRgbImgTopDownDataset + +__all__ = [ + 'Kpt3dMviewRgbImgDirectDataset', 'Kpt2dSviewRgbImgTopDownDataset', + 'Kpt3dSviewRgbImgTopDownDataset', 'Kpt2dSviewRgbImgBottomUpDataset', + 'Kpt3dSviewKpt2dDataset', 'Kpt2dSviewRgbVidTopDownDataset' +] diff --git a/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_bottom_up_dataset.py b/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_bottom_up_dataset.py new file mode 100644 index 0000000..9930621 --- /dev/null +++ b/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_bottom_up_dataset.py @@ -0,0 +1,188 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from abc import ABCMeta, abstractmethod + +import numpy as np +import xtcocotools +from torch.utils.data import Dataset +from xtcocotools.coco import COCO + +from mmpose.datasets import DatasetInfo +from mmpose.datasets.pipelines import Compose + + +class Kpt2dSviewRgbImgBottomUpDataset(Dataset, metaclass=ABCMeta): + """Base class for bottom-up datasets. + + All datasets should subclass it. + All subclasses should overwrite: + Methods:`_get_single` + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + coco_style (bool): Whether the annotation json is coco-style. + Default: True + test_mode (bool): Store True when building test or + validation dataset. Default: False. 
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + coco_style=True, + test_mode=False): + + self.image_info = {} + self.ann_info = {} + + self.ann_file = ann_file + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + + # bottom-up + self.base_size = data_cfg['base_size'] + self.base_sigma = data_cfg['base_sigma'] + self.int_sigma = False + + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + self.ann_info['num_scales'] = data_cfg['num_scales'] + self.ann_info['scale_aware_sigma'] = data_cfg['scale_aware_sigma'] + + self.ann_info['inference_channel'] = data_cfg['inference_channel'] + self.ann_info['dataset_channel'] = data_cfg['dataset_channel'] + + self.use_nms = data_cfg.get('use_nms', False) + self.soft_nms = data_cfg.get('soft_nms', True) + self.oks_thr = data_cfg.get('oks_thr', 0.9) + + if dataset_info is None: + raise ValueError( + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.') + + dataset_info = DatasetInfo(dataset_info) + + assert self.ann_info['num_joints'] == dataset_info.keypoint_num + self.ann_info['flip_pairs'] = dataset_info.flip_pairs + self.ann_info['flip_index'] = dataset_info.flip_index + self.ann_info['upper_body_ids'] = dataset_info.upper_body_ids + self.ann_info['lower_body_ids'] = dataset_info.lower_body_ids + self.ann_info['joint_weights'] = dataset_info.joint_weights + self.ann_info['skeleton'] = dataset_info.skeleton + self.sigmas = dataset_info.sigmas + self.dataset_name = dataset_info.dataset_name + + if coco_style: + self.coco = COCO(ann_file) + if 'categories' in self.coco.dataset: + cats = [ + cat['name'] + for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + self.classes = ['__background__'] + cats + self.num_classes = len(self.classes) + self._class_to_ind = dict( + zip(self.classes, range(self.num_classes))) + self._class_to_coco_ind = dict( + zip(cats, self.coco.getCatIds())) + self._coco_ind_to_class_ind = dict( + (self._class_to_coco_ind[cls], self._class_to_ind[cls]) + for cls in self.classes[1:]) + self.img_ids = self.coco.getImgIds() + if not test_mode: + self.img_ids = [ + img_id for img_id in self.img_ids if + len(self.coco.getAnnIds(imgIds=img_id, iscrowd=None)) > 0 + ] + self.num_images = len(self.img_ids) + self.id2name, self.name2id = self._get_mapping_id_name( + self.coco.imgs) + + self.pipeline = Compose(self.pipeline) + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. 
+ """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + def _get_mask(self, anno, idx): + """Get ignore masks to mask out losses.""" + coco = self.coco + img_info = coco.loadImgs(self.img_ids[idx])[0] + + m = np.zeros((img_info['height'], img_info['width']), dtype=np.float32) + + for obj in anno: + if 'segmentation' in obj: + if obj['iscrowd']: + rle = xtcocotools.mask.frPyObjects(obj['segmentation'], + img_info['height'], + img_info['width']) + m += xtcocotools.mask.decode(rle) + elif obj['num_keypoints'] == 0: + rles = xtcocotools.mask.frPyObjects( + obj['segmentation'], img_info['height'], + img_info['width']) + for rle in rles: + m += xtcocotools.mask.decode(rle) + + return m < 0.5 + + @abstractmethod + def _get_single(self, idx): + """Get anno for a single image.""" + raise NotImplementedError + + @abstractmethod + def evaluate(self, results, *args, **kwargs): + """Evaluate keypoint results.""" + + def prepare_train_img(self, idx): + """Prepare image for training given the index.""" + results = copy.deepcopy(self._get_single(idx)) + results['ann_info'] = self.ann_info + return self.pipeline(results) + + def prepare_test_img(self, idx): + """Prepare image for testing given the index.""" + results = copy.deepcopy(self._get_single(idx)) + results['ann_info'] = self.ann_info + return self.pipeline(results) + + def __len__(self): + """Get dataset length.""" + return len(self.img_ids) + + def __getitem__(self, idx): + """Get the sample for either training or testing given index.""" + if self.test_mode: + return self.prepare_test_img(idx) + + return self.prepare_train_img(idx) diff --git a/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py b/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py new file mode 100644 index 0000000..fb281f1 --- /dev/null +++ b/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py @@ -0,0 +1,287 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from abc import ABCMeta, abstractmethod + +import json_tricks as json +import numpy as np +from torch.utils.data import Dataset +from xtcocotools.coco import COCO + +from mmpose.core.evaluation.top_down_eval import (keypoint_auc, keypoint_epe, + keypoint_nme, + keypoint_pck_accuracy) +from mmpose.datasets import DatasetInfo +from mmpose.datasets.pipelines import Compose + + +class Kpt2dSviewRgbImgTopDownDataset(Dataset, metaclass=ABCMeta): + """Base class for keypoint 2D top-down pose estimation with single-view RGB + image as the input. + + All fashion datasets should subclass it. + All subclasses should overwrite: + Methods:`_get_db`, 'evaluate' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + coco_style (bool): Whether the annotation json is coco-style. + Default: True + test_mode (bool): Store True when building test or + validation dataset. Default: False. 
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + coco_style=True, + test_mode=False): + + self.image_info = {} + self.ann_info = {} + + self.ann_file = ann_file + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + + self.ann_info['inference_channel'] = data_cfg['inference_channel'] + self.ann_info['num_output_channels'] = data_cfg['num_output_channels'] + self.ann_info['dataset_channel'] = data_cfg['dataset_channel'] + + self.ann_info['max_num_joints'] = data_cfg.get('max_num_joints', None) + self.ann_info['dataset_idx'] = data_cfg.get('dataset_idx', 0) + + self.ann_info['use_different_joint_weights'] = data_cfg.get( + 'use_different_joint_weights', False) + + if dataset_info is None: + raise ValueError( + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.') + + dataset_info = DatasetInfo(dataset_info) + + assert self.ann_info['num_joints'] == dataset_info.keypoint_num + self.ann_info['flip_pairs'] = dataset_info.flip_pairs + self.ann_info['flip_index'] = dataset_info.flip_index + self.ann_info['upper_body_ids'] = dataset_info.upper_body_ids + self.ann_info['lower_body_ids'] = dataset_info.lower_body_ids + self.ann_info['joint_weights'] = dataset_info.joint_weights + self.ann_info['skeleton'] = dataset_info.skeleton + self.sigmas = dataset_info.sigmas + self.dataset_name = dataset_info.dataset_name + + if coco_style: + self.coco = COCO(ann_file) + if 'categories' in self.coco.dataset: + cats = [ + cat['name'] + for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + self.classes = ['__background__'] + cats + self.num_classes = len(self.classes) + self._class_to_ind = dict( + zip(self.classes, range(self.num_classes))) + self._class_to_coco_ind = dict( + zip(cats, self.coco.getCatIds())) + self._coco_ind_to_class_ind = dict( + (self._class_to_coco_ind[cls], self._class_to_ind[cls]) + for cls in self.classes[1:]) + self.img_ids = self.coco.getImgIds() + self.num_images = len(self.img_ids) + self.id2name, self.name2id = self._get_mapping_id_name( + self.coco.imgs) + + self.db = [] + + self.pipeline = Compose(self.pipeline) + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + def _xywh2cs(self, x, y, w, h, padding=1.25): + """This encodes bbox(x,y,w,h) into (center, scale) + + Args: + x, y, w, h (float): left, top, width and height + padding (float): bounding box padding factor + + Returns: + center (np.ndarray[float32](2,)): center of the bbox (x, y). + scale (np.ndarray[float32](2,)): scale of the bbox w & h. 
+ """ + aspect_ratio = self.ann_info['image_size'][0] / self.ann_info[ + 'image_size'][1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + + if (not self.test_mode) and np.random.rand() < 0.3: + center += 0.4 * (np.random.rand(2) - 0.5) * [w, h] + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + # padding to include proper amount of context + scale = scale * padding + + return center, scale + + def _get_normalize_factor(self, gts, *args, **kwargs): + """Get the normalize factor. generally inter-ocular distance measured + as the Euclidean distance between the outer corners of the eyes is + used. This function should be overrode, to measure NME. + + Args: + gts (np.ndarray[N, K, 2]): Groundtruth keypoint location. + + Returns: + np.ndarray[N, 2]: normalized factor + """ + return np.ones([gts.shape[0], 2], dtype=np.float32) + + @abstractmethod + def _get_db(self): + """Load dataset.""" + raise NotImplementedError + + @abstractmethod + def evaluate(self, results, *args, **kwargs): + """Evaluate keypoint results.""" + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def _report_metric(self, + res_file, + metrics, + pck_thr=0.2, + pckh_thr=0.7, + auc_nor=30): + """Keypoint evaluation. + + Args: + res_file (str): Json file stored prediction results. + metrics (str | list[str]): Metric to be performed. + Options: 'PCK', 'PCKh', 'AUC', 'EPE', 'NME'. + pck_thr (float): PCK threshold, default as 0.2. + pckh_thr (float): PCKh threshold, default as 0.7. + auc_nor (float): AUC normalization factor, default as 30 pixel. + + Returns: + List: Evaluation results for evaluation metric. 
+ """ + info_str = [] + + with open(res_file, 'r') as fin: + preds = json.load(fin) + assert len(preds) == len(self.db) + + outputs = [] + gts = [] + masks = [] + box_sizes = [] + threshold_bbox = [] + threshold_head_box = [] + + for pred, item in zip(preds, self.db): + outputs.append(np.array(pred['keypoints'])[:, :-1]) + gts.append(np.array(item['joints_3d'])[:, :-1]) + masks.append((np.array(item['joints_3d_visible'])[:, 0]) > 0) + if 'PCK' in metrics: + bbox = np.array(item['bbox']) + bbox_thr = np.max(bbox[2:]) + threshold_bbox.append(np.array([bbox_thr, bbox_thr])) + if 'PCKh' in metrics: + head_box_thr = item['head_size'] + threshold_head_box.append( + np.array([head_box_thr, head_box_thr])) + box_sizes.append(item.get('box_size', 1)) + + outputs = np.array(outputs) + gts = np.array(gts) + masks = np.array(masks) + threshold_bbox = np.array(threshold_bbox) + threshold_head_box = np.array(threshold_head_box) + box_sizes = np.array(box_sizes).reshape([-1, 1]) + + if 'PCK' in metrics: + _, pck, _ = keypoint_pck_accuracy(outputs, gts, masks, pck_thr, + threshold_bbox) + info_str.append(('PCK', pck)) + + if 'PCKh' in metrics: + _, pckh, _ = keypoint_pck_accuracy(outputs, gts, masks, pckh_thr, + threshold_head_box) + info_str.append(('PCKh', pckh)) + + if 'AUC' in metrics: + info_str.append(('AUC', keypoint_auc(outputs, gts, masks, + auc_nor))) + + if 'EPE' in metrics: + info_str.append(('EPE', keypoint_epe(outputs, gts, masks))) + + if 'NME' in metrics: + normalize_factor = self._get_normalize_factor( + gts=gts, box_sizes=box_sizes) + info_str.append( + ('NME', keypoint_nme(outputs, gts, masks, normalize_factor))) + + return info_str + + def __len__(self): + """Get the size of the dataset.""" + return len(self.db) + + def __getitem__(self, idx): + """Get the sample given index.""" + results = copy.deepcopy(self.db[idx]) + results['ann_info'] = self.ann_info + return self.pipeline(results) + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts diff --git a/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_vid_top_down_dataset.py b/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_vid_top_down_dataset.py new file mode 100644 index 0000000..e529270 --- /dev/null +++ b/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_vid_top_down_dataset.py @@ -0,0 +1,200 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from abc import ABCMeta, abstractmethod + +import numpy as np +from torch.utils.data import Dataset +from xtcocotools.coco import COCO + +from mmpose.datasets import DatasetInfo +from mmpose.datasets.pipelines import Compose + + +class Kpt2dSviewRgbVidTopDownDataset(Dataset, metaclass=ABCMeta): + """Base class for keypoint 2D top-down pose estimation with single-view RGB + video as the input. + + All fashion datasets should subclass it. + All subclasses should overwrite: + Methods:`_get_db`, 'evaluate' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where videos/images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + coco_style (bool): Whether the annotation json is coco-style. + Default: True + test_mode (bool): Store True when building test or + validation dataset. Default: False. 
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + coco_style=True, + test_mode=False): + + self.image_info = {} + self.ann_info = {} + + self.ann_file = ann_file + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + + self.ann_info['inference_channel'] = data_cfg['inference_channel'] + self.ann_info['num_output_channels'] = data_cfg['num_output_channels'] + self.ann_info['dataset_channel'] = data_cfg['dataset_channel'] + + self.ann_info['use_different_joint_weights'] = data_cfg.get( + 'use_different_joint_weights', False) + + if dataset_info is None: + raise ValueError( + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.') + + dataset_info = DatasetInfo(dataset_info) + + assert self.ann_info['num_joints'] == dataset_info.keypoint_num + self.ann_info['flip_pairs'] = dataset_info.flip_pairs + self.ann_info['flip_index'] = dataset_info.flip_index + self.ann_info['upper_body_ids'] = dataset_info.upper_body_ids + self.ann_info['lower_body_ids'] = dataset_info.lower_body_ids + self.ann_info['joint_weights'] = dataset_info.joint_weights + self.ann_info['skeleton'] = dataset_info.skeleton + self.sigmas = dataset_info.sigmas + self.dataset_name = dataset_info.dataset_name + + if coco_style: + self.coco = COCO(ann_file) + if 'categories' in self.coco.dataset: + cats = [ + cat['name'] + for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + self.classes = ['__background__'] + cats + self.num_classes = len(self.classes) + self._class_to_ind = dict( + zip(self.classes, range(self.num_classes))) + self._class_to_coco_ind = dict( + zip(cats, self.coco.getCatIds())) + self._coco_ind_to_class_ind = dict( + (self._class_to_coco_ind[cls], self._class_to_ind[cls]) + for cls in self.classes[1:]) + self.img_ids = self.coco.getImgIds() + self.num_images = len(self.img_ids) + self.id2name, self.name2id = self._get_mapping_id_name( + self.coco.imgs) + + self.db = [] + + self.pipeline = Compose(self.pipeline) + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + def _xywh2cs(self, x, y, w, h, padding=1.25): + """This encodes bbox(x,y,w,h) into (center, scale) + + Args: + x, y, w, h (float): left, top, width and height + padding (float): bounding box padding factor + + Returns: + center (np.ndarray[float32](2,)): center of the bbox (x, y). + scale (np.ndarray[float32](2,)): scale of the bbox w & h. 
+ """ + aspect_ratio = self.ann_info['image_size'][0] / self.ann_info[ + 'image_size'][1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + + if (not self.test_mode) and np.random.rand() < 0.3: + center += 0.4 * (np.random.rand(2) - 0.5) * [w, h] + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + # padding to include proper amount of context + scale = scale * padding + + return center, scale + + @abstractmethod + def _get_db(self): + """Load dataset.""" + + @abstractmethod + def evaluate(self, results, *args, **kwargs): + """Evaluate keypoint results.""" + + @staticmethod + @abstractmethod + def _write_keypoint_results(keypoint_results, gt_folder, pred_folder): + """Write results into a json file.""" + + @abstractmethod + def _do_keypoint_eval(self, gt_folder, pred_folder): + """Keypoint evaluation. + Args: + gt_folder (str): The folder of the json files storing + ground truth keypoint annotations. + pred_folder (str): The folder of the json files storing + prediction results. + + Returns: + List: Evaluation results for evaluation metric. + """ + + def __len__(self): + """Get the size of the dataset.""" + return len(self.db) + + def __getitem__(self, idx): + """Get the sample given index.""" + results = copy.deepcopy(self.db[idx]) + results['ann_info'] = self.ann_info + return self.pipeline(results) + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + for img_id, persons in kpts.items(): + num = len(persons) + kpts[img_id] = sorted(kpts[img_id], key=lambda x: x[key]) + for i in range(num - 1, 0, -1): + if kpts[img_id][i][key] == kpts[img_id][i - 1][key]: + del kpts[img_id][i] + + return kpts diff --git a/mmpose/datasets/datasets/base/kpt_3d_mview_rgb_img_direct_dataset.py b/mmpose/datasets/datasets/base/kpt_3d_mview_rgb_img_direct_dataset.py new file mode 100644 index 0000000..94cc1c2 --- /dev/null +++ b/mmpose/datasets/datasets/base/kpt_3d_mview_rgb_img_direct_dataset.py @@ -0,0 +1,143 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from abc import ABCMeta, abstractmethod + +import json_tricks as json +import numpy as np +from torch.utils.data import Dataset + +from mmpose.datasets import DatasetInfo +from mmpose.datasets.pipelines import Compose + + +class Kpt3dMviewRgbImgDirectDataset(Dataset, metaclass=ABCMeta): + """Base class for keypoint 3D top-down pose estimation with multi-view RGB + images as the input. + + All subclasses should overwrite: + Methods:`_get_db`, 'evaluate' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. 
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + self.image_info = {} + self.ann_info = {} + + self.ann_file = ann_file + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + + self.ann_info['space_size'] = data_cfg['space_size'] + self.ann_info['space_center'] = data_cfg['space_center'] + self.ann_info['cube_size'] = data_cfg['cube_size'] + self.ann_info['scale_aware_sigma'] = data_cfg.get( + 'scale_aware_sigma', False) + + if dataset_info is None: + raise ValueError( + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.') + + dataset_info = DatasetInfo(dataset_info) + + assert self.ann_info['num_joints'] <= dataset_info.keypoint_num + self.ann_info['flip_pairs'] = dataset_info.flip_pairs + self.ann_info['num_scales'] = 1 + self.ann_info['flip_index'] = dataset_info.flip_index + self.ann_info['upper_body_ids'] = dataset_info.upper_body_ids + self.ann_info['lower_body_ids'] = dataset_info.lower_body_ids + self.ann_info['joint_weights'] = dataset_info.joint_weights + self.ann_info['skeleton'] = dataset_info.skeleton + self.sigmas = dataset_info.sigmas + self.dataset_name = dataset_info.dataset_name + + self.load_config(data_cfg) + + self.db = [] + + self.pipeline = Compose(self.pipeline) + + def load_config(self, data_cfg): + """Initialize dataset attributes according to the config. + + Override this method to set dataset specific attributes. + """ + self.num_joints = data_cfg['num_joints'] + self.num_cameras = data_cfg['num_cameras'] + self.seq_frame_interval = data_cfg.get('seq_frame_interval', 1) + self.subset = data_cfg.get('subset', 'train') + self.need_2d_label = data_cfg.get('need_2d_label', False) + self.need_camera_param = True + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + @abstractmethod + def _get_db(self): + """Load dataset.""" + raise NotImplementedError + + @abstractmethod + def evaluate(self, results, *args, **kwargs): + """Evaluate keypoint results.""" + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def __len__(self): + """Get the size of the dataset.""" + return len(self.db) // self.num_cameras + + def __getitem__(self, idx): + """Get the sample given index.""" + results = {} + # return self.pipeline(results) + for c in range(self.num_cameras): + result = copy.deepcopy(self.db[self.num_cameras * idx + c]) + result['ann_info'] = self.ann_info + results[c] = result + + return self.pipeline(results) diff --git a/mmpose/datasets/datasets/base/kpt_3d_sview_kpt_2d_dataset.py b/mmpose/datasets/datasets/base/kpt_3d_sview_kpt_2d_dataset.py new file mode 100644 index 0000000..dbdb998 --- /dev/null +++ b/mmpose/datasets/datasets/base/kpt_3d_sview_kpt_2d_dataset.py @@ -0,0 +1,226 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import copy +from abc import ABCMeta, abstractmethod + +import numpy as np +from torch.utils.data import Dataset + +from mmpose.datasets import DatasetInfo +from mmpose.datasets.pipelines import Compose + + +class Kpt3dSviewKpt2dDataset(Dataset, metaclass=ABCMeta): + """Base class for 3D human pose datasets. + + Subclasses should consider overwriting following methods: + - load_config + - load_annotations + - build_sample_indices + - evaluate + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + - num_joints: Number of joints. + - seq_len: Number of frames in a sequence. Default: 1. + - seq_frame_interval: Extract frames from the video at certain + intervals. Default: 1. + - causal: If set to True, the rightmost input frame will be the + target frame. Otherwise, the middle input frame will be the + target frame. Default: True. + - temporal_padding: Whether to pad the video so that poses will be + predicted for every frame in the video. Default: False + - subset: Reduce dataset size by fraction. Default: 1. + - need_2d_label: Whether need 2D joint labels or not. + Default: False. + + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + self.ann_file = ann_file + self.img_prefix = img_prefix + self.data_cfg = copy.deepcopy(data_cfg) + self.pipeline = pipeline + self.test_mode = test_mode + self.ann_info = {} + + if dataset_info is None: + raise ValueError( + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.') + + dataset_info = DatasetInfo(dataset_info) + + self.load_config(self.data_cfg) + + self.ann_info['num_joints'] = data_cfg['num_joints'] + assert self.ann_info['num_joints'] == dataset_info.keypoint_num + self.ann_info['flip_pairs'] = dataset_info.flip_pairs + self.ann_info['upper_body_ids'] = dataset_info.upper_body_ids + self.ann_info['lower_body_ids'] = dataset_info.lower_body_ids + self.ann_info['joint_weights'] = dataset_info.joint_weights + self.ann_info['skeleton'] = dataset_info.skeleton + self.sigmas = dataset_info.sigmas + self.dataset_name = dataset_info.dataset_name + + self.data_info = self.load_annotations() + self.sample_indices = self.build_sample_indices() + self.pipeline = Compose(pipeline) + + self.name2id = { + name: i + for i, name in enumerate(self.data_info['imgnames']) + } + + def load_config(self, data_cfg): + """Initialize dataset attributes according to the config. + + Override this method to set dataset specific attributes. 
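+ The default implementation reads ``num_joints``, ``seq_len``, ``seq_frame_interval``, ``causal``, ``temporal_padding``, ``subset`` and ``need_2d_label`` from ``data_cfg`` and sets ``need_camera_param`` to False.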
+ """ + + self.num_joints = data_cfg['num_joints'] + self.seq_len = data_cfg.get('seq_len', 1) + self.seq_frame_interval = data_cfg.get('seq_frame_interval', 1) + self.causal = data_cfg.get('causal', True) + self.temporal_padding = data_cfg.get('temporal_padding', False) + self.subset = data_cfg.get('subset', 1) + self.need_2d_label = data_cfg.get('need_2d_label', False) + self.need_camera_param = False + + def load_annotations(self): + """Load data annotation.""" + data = np.load(self.ann_file) + + # get image info + _imgnames = data['imgname'] + num_imgs = len(_imgnames) + num_joints = self.ann_info['num_joints'] + + if 'scale' in data: + _scales = data['scale'].astype(np.float32) + else: + _scales = np.zeros(num_imgs, dtype=np.float32) + + if 'center' in data: + _centers = data['center'].astype(np.float32) + else: + _centers = np.zeros((num_imgs, 2), dtype=np.float32) + + # get 3D pose + if 'S' in data.keys(): + _joints_3d = data['S'].astype(np.float32) + else: + _joints_3d = np.zeros((num_imgs, num_joints, 4), dtype=np.float32) + + # get 2D pose + if 'part' in data.keys(): + _joints_2d = data['part'].astype(np.float32) + else: + _joints_2d = np.zeros((num_imgs, num_joints, 3), dtype=np.float32) + + data_info = { + 'imgnames': _imgnames, + 'joints_3d': _joints_3d, + 'joints_2d': _joints_2d, + 'scales': _scales, + 'centers': _centers, + } + + return data_info + + def build_sample_indices(self): + """Build sample indices. + + The default method creates sample indices that each sample is a single + frame (i.e. seq_len=1). Override this method in the subclass to define + how frames are sampled to form data samples. + + Outputs: + sample_indices [list(tuple)]: the frame indices of each sample. + For a sample, all frames will be treated as an input sequence, + and the ground-truth pose of the last frame will be the target. 
+ """ + sample_indices = [] + if self.seq_len == 1: + num_imgs = len(self.ann_info['imgnames']) + sample_indices = [(idx, ) for idx in range(num_imgs)] + else: + raise NotImplementedError('Multi-frame data sample unsupported!') + return sample_indices + + @abstractmethod + def evaluate(self, results, *args, **kwargs): + """Evaluate keypoint results.""" + + def prepare_data(self, idx): + """Get data sample.""" + data = self.data_info + + frame_ids = self.sample_indices[idx] + assert len(frame_ids) == self.seq_len + + # get the 3D/2D pose sequence + _joints_3d = data['joints_3d'][frame_ids] + _joints_2d = data['joints_2d'][frame_ids] + + # get the image info + _imgnames = data['imgnames'][frame_ids] + _centers = data['centers'][frame_ids] + _scales = data['scales'][frame_ids] + if _scales.ndim == 1: + _scales = np.stack([_scales, _scales], axis=1) + + target_idx = -1 if self.causal else int(self.seq_len) // 2 + + results = { + 'input_2d': _joints_2d[:, :, :2], + 'input_2d_visible': _joints_2d[:, :, -1:], + 'input_3d': _joints_3d[:, :, :3], + 'input_3d_visible': _joints_3d[:, :, -1:], + 'target': _joints_3d[target_idx, :, :3], + 'target_visible': _joints_3d[target_idx, :, -1:], + 'image_paths': _imgnames, + 'target_image_path': _imgnames[target_idx], + 'scales': _scales, + 'centers': _centers, + } + + if self.need_2d_label: + results['target_2d'] = _joints_2d[target_idx, :, :2] + + if self.need_camera_param: + _cam_param = self.get_camera_param(_imgnames[0]) + results['camera_param'] = _cam_param + # get image size from camera parameters + if 'w' in _cam_param and 'h' in _cam_param: + results['image_width'] = _cam_param['w'] + results['image_height'] = _cam_param['h'] + + return results + + def __len__(self): + """Get the size of the dataset.""" + return len(self.sample_indices) + + def __getitem__(self, idx): + """Get a sample with given index.""" + results = copy.deepcopy(self.prepare_data(idx)) + results['ann_info'] = self.ann_info + return self.pipeline(results) + + def get_camera_param(self, imgname): + """Get camera parameters of a frame by its image name.""" + raise NotImplementedError diff --git a/mmpose/datasets/datasets/base/kpt_3d_sview_rgb_img_top_down_dataset.py b/mmpose/datasets/datasets/base/kpt_3d_sview_rgb_img_top_down_dataset.py new file mode 100644 index 0000000..af01e81 --- /dev/null +++ b/mmpose/datasets/datasets/base/kpt_3d_sview_rgb_img_top_down_dataset.py @@ -0,0 +1,256 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from abc import ABCMeta, abstractmethod + +import json_tricks as json +import numpy as np +from torch.utils.data import Dataset +from xtcocotools.coco import COCO + +from mmpose.datasets import DatasetInfo +from mmpose.datasets.pipelines import Compose + + +class Kpt3dSviewRgbImgTopDownDataset(Dataset, metaclass=ABCMeta): + """Base class for keypoint 3D top-down pose estimation with single-view RGB + image as the input. + + All fashion datasets should subclass it. + All subclasses should overwrite: + Methods:`_get_db`, 'evaluate' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + coco_style (bool): Whether the annotation json is coco-style. + Default: True + test_mode (bool): Store True when building test or + validation dataset. Default: False. 
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + coco_style=True, + test_mode=False): + + self.image_info = {} + self.ann_info = {} + + self.ann_file = ann_file + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + + self.ann_info['inference_channel'] = data_cfg['inference_channel'] + self.ann_info['num_output_channels'] = data_cfg['num_output_channels'] + self.ann_info['dataset_channel'] = data_cfg['dataset_channel'] + + if dataset_info is None: + raise ValueError( + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.') + + dataset_info = DatasetInfo(dataset_info) + + assert self.ann_info['num_joints'] == dataset_info.keypoint_num + self.ann_info['flip_pairs'] = dataset_info.flip_pairs + self.ann_info['flip_index'] = dataset_info.flip_index + self.ann_info['upper_body_ids'] = dataset_info.upper_body_ids + self.ann_info['lower_body_ids'] = dataset_info.lower_body_ids + self.ann_info['joint_weights'] = dataset_info.joint_weights + self.ann_info['skeleton'] = dataset_info.skeleton + self.sigmas = dataset_info.sigmas + self.dataset_name = dataset_info.dataset_name + + if coco_style: + self.coco = COCO(ann_file) + if 'categories' in self.coco.dataset: + cats = [ + cat['name'] + for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + self.classes = ['__background__'] + cats + self.num_classes = len(self.classes) + self._class_to_ind = dict( + zip(self.classes, range(self.num_classes))) + self._class_to_coco_ind = dict( + zip(cats, self.coco.getCatIds())) + self._coco_ind_to_class_ind = dict( + (self._class_to_coco_ind[cls], self._class_to_ind[cls]) + for cls in self.classes[1:]) + self.img_ids = self.coco.getImgIds() + self.num_images = len(self.img_ids) + self.id2name, self.name2id = self._get_mapping_id_name( + self.coco.imgs) + + self.db = [] + + self.pipeline = Compose(self.pipeline) + + @staticmethod + def _cam2pixel(cam_coord, f, c): + """Transform the joints from their camera coordinates to their pixel + coordinates. + + Note: + N: number of joints + + Args: + cam_coord (ndarray[N, 3]): 3D joints coordinates + in the camera coordinate system + f (ndarray[2]): focal length of x and y axis + c (ndarray[2]): principal point of x and y axis + + Returns: + img_coord (ndarray[N, 3]): the coordinates (x, y, 0) + in the image plane. + """ + x = cam_coord[:, 0] / (cam_coord[:, 2] + 1e-8) * f[0] + c[0] + y = cam_coord[:, 1] / (cam_coord[:, 2] + 1e-8) * f[1] + c[1] + z = np.zeros_like(x) + img_coord = np.concatenate((x[:, None], y[:, None], z[:, None]), 1) + return img_coord + + @staticmethod + def _world2cam(world_coord, R, T): + """Transform the joints from their world coordinates to their camera + coordinates. + + Note: + N: number of joints + + Args: + world_coord (ndarray[3, N]): 3D joints coordinates + in the world coordinate system + R (ndarray[3, 3]): camera rotation matrix + T (ndarray[3, 1]): camera position (x, y, z) + + Returns: + cam_coord (ndarray[3, N]): 3D joints coordinates + in the camera coordinate system + """ + cam_coord = np.dot(R, world_coord - T) + return cam_coord + + @staticmethod + def _pixel2cam(pixel_coord, f, c): + """Transform the joints from their pixel coordinates to their camera + coordinates. 
+ + Note: + N: number of joints + + Args: + pixel_coord (ndarray[N, 3]): 3D joints coordinates + in the pixel coordinate system + f (ndarray[2]): focal length of x and y axis + c (ndarray[2]): principal point of x and y axis + + Returns: + cam_coord (ndarray[N, 3]): 3D joints coordinates + in the camera coordinate system + """ + x = (pixel_coord[:, 0] - c[0]) / f[0] * pixel_coord[:, 2] + y = (pixel_coord[:, 1] - c[1]) / f[1] * pixel_coord[:, 2] + z = pixel_coord[:, 2] + cam_coord = np.concatenate((x[:, None], y[:, None], z[:, None]), 1) + return cam_coord + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + def _xywh2cs(self, x, y, w, h, padding=1.25): + """This encodes bbox(x,y,w,h) into (center, scale) + + Args: + x, y, w, h (float): left, top, width and height + padding (float): bounding box padding factor + + Returns: + center (np.ndarray[float32](2,)): center of the bbox (x, y). + scale (np.ndarray[float32](2,)): scale of the bbox w & h. + """ + aspect_ratio = self.ann_info['image_size'][0] / self.ann_info[ + 'image_size'][1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + + if (not self.test_mode) and np.random.rand() < 0.3: + center += 0.4 * (np.random.rand(2) - 0.5) * [w, h] + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + # padding to include proper amount of context + scale = scale * padding + + return center, scale + + @abstractmethod + def _get_db(self): + """Load dataset.""" + raise NotImplementedError + + @abstractmethod + def evaluate(self, results, *args, **kwargs): + """Evaluate keypoint results.""" + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def __len__(self): + """Get the size of the dataset.""" + return len(self.db) + + def __getitem__(self, idx): + """Get the sample given index.""" + results = copy.deepcopy(self.db[idx]) + results['ann_info'] = self.ann_info + return self.pipeline(results) + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts diff --git a/mmpose/datasets/datasets/body3d/__init__.py b/mmpose/datasets/datasets/body3d/__init__.py new file mode 100644 index 0000000..5bc25a9 --- /dev/null +++ b/mmpose/datasets/datasets/body3d/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .body3d_h36m_dataset import Body3DH36MDataset +from .body3d_mpi_inf_3dhp_dataset import Body3DMpiInf3dhpDataset +from .body3d_mview_direct_panoptic_dataset import \ + Body3DMviewDirectPanopticDataset +from .body3d_semi_supervision_dataset import Body3DSemiSupervisionDataset + +__all__ = [ + 'Body3DH36MDataset', 'Body3DSemiSupervisionDataset', + 'Body3DMpiInf3dhpDataset', 'Body3DMviewDirectPanopticDataset' +] diff --git a/mmpose/datasets/datasets/body3d/body3d_base_dataset.py b/mmpose/datasets/datasets/body3d/body3d_base_dataset.py new file mode 100644 index 0000000..10c2923 --- /dev/null +++ b/mmpose/datasets/datasets/body3d/body3d_base_dataset.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + +from torch.utils.data import Dataset + + +class Body3DBaseDataset(Dataset, metaclass=ABCMeta): + """This class has been deprecated and replaced by + Kpt3dSviewKpt2dDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'Body3DBaseDataset has been replaced by ' + 'Kpt3dSviewKpt2dDataset' + 'check https://github.com/open-mmlab/mmpose/pull/663 for details.') + ) diff --git a/mmpose/datasets/datasets/body3d/body3d_h36m_dataset.py b/mmpose/datasets/datasets/body3d/body3d_h36m_dataset.py new file mode 100644 index 0000000..ae4949d --- /dev/null +++ b/mmpose/datasets/datasets/body3d/body3d_h36m_dataset.py @@ -0,0 +1,343 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import mmcv +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.core.evaluation import keypoint_mpjpe +from mmpose.datasets.datasets.base import Kpt3dSviewKpt2dDataset +from ...builder import DATASETS + + +@DATASETS.register_module() +class Body3DH36MDataset(Kpt3dSviewKpt2dDataset): + """Human3.6M dataset for 3D human pose estimation. + + "Human3.6M: Large Scale Datasets and Predictive Methods for 3D Human + Sensing in Natural Environments", TPAMI`2014. + More details can be found in the `paper + `__. + + Human3.6M keypoint indexes:: + + 0: 'root (pelvis)', + 1: 'right_hip', + 2: 'right_knee', + 3: 'right_foot', + 4: 'left_hip', + 5: 'left_knee', + 6: 'left_foot', + 7: 'spine', + 8: 'thorax', + 9: 'neck_base', + 10: 'head', + 11: 'left_shoulder', + 12: 'left_elbow', + 13: 'left_wrist', + 14: 'right_shoulder', + 15: 'right_elbow', + 16: 'right_wrist' + + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + JOINT_NAMES = [ + 'Root', 'RHip', 'RKnee', 'RFoot', 'LHip', 'LKnee', 'LFoot', 'Spine', + 'Thorax', 'NeckBase', 'Head', 'LShoulder', 'LElbow', 'LWrist', + 'RShoulder', 'RElbow', 'RWrist' + ] + + # 2D joint source options: + # "gt": from the annotation file + # "detection": from a detection result file of 2D keypoint + # "pipeline": will be generate by the pipeline + SUPPORTED_JOINT_2D_SRC = {'gt', 'detection', 'pipeline'} + + # metric + ALLOWED_METRICS = {'mpjpe', 'p-mpjpe', 'n-mpjpe'} + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/h36m.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + def load_config(self, data_cfg): + super().load_config(data_cfg) + # h36m specific attributes + self.joint_2d_src = data_cfg.get('joint_2d_src', 'gt') + if self.joint_2d_src not in self.SUPPORTED_JOINT_2D_SRC: + raise ValueError( + f'Unsupported joint_2d_src "{self.joint_2d_src}". ' + f'Supported options are {self.SUPPORTED_JOINT_2D_SRC}') + + self.joint_2d_det_file = data_cfg.get('joint_2d_det_file', None) + + self.need_camera_param = data_cfg.get('need_camera_param', False) + if self.need_camera_param: + assert 'camera_param_file' in data_cfg + self.camera_param = self._load_camera_param( + data_cfg['camera_param_file']) + + # h36m specific annotation info + ann_info = {} + ann_info['use_different_joint_weights'] = False + # action filter + actions = data_cfg.get('actions', '_all_') + self.actions = set( + actions if isinstance(actions, (list, tuple)) else [actions]) + + # subject filter + subjects = data_cfg.get('subjects', '_all_') + self.subjects = set( + subjects if isinstance(subjects, (list, tuple)) else [subjects]) + + self.ann_info.update(ann_info) + + def load_annotations(self): + data_info = super().load_annotations() + + # get 2D joints + if self.joint_2d_src == 'gt': + data_info['joints_2d'] = data_info['joints_2d'] + elif self.joint_2d_src == 'detection': + data_info['joints_2d'] = self._load_joint_2d_detection( + self.joint_2d_det_file) + assert data_info['joints_2d'].shape[0] == data_info[ + 'joints_3d'].shape[0] + assert data_info['joints_2d'].shape[2] == 3 + elif self.joint_2d_src == 'pipeline': + # joint_2d will be generated in the pipeline + pass + else: + raise NotImplementedError( + f'Unhandled joint_2d_src option {self.joint_2d_src}') + + return data_info + + @staticmethod + def _parse_h36m_imgname(imgname): + """Parse imgname to get information of subject, action and camera. + + A typical h36m image filename is like: + S1_Directions_1.54138969_000001.jpg + """ + subj, rest = osp.basename(imgname).split('_', 1) + action, rest = rest.split('.', 1) + camera, rest = rest.split('_', 1) + + return subj, action, camera + + def build_sample_indices(self): + """Split original videos into sequences and build frame indices. + + This method overrides the default one in the base class. + """ + + # Group frames into videos. Assume that self.data_info is + # chronological. + video_frames = defaultdict(list) + for idx, imgname in enumerate(self.data_info['imgnames']): + subj, action, camera = self._parse_h36m_imgname(imgname) + + if '_all_' not in self.actions and action not in self.actions: + continue + + if '_all_' not in self.subjects and subj not in self.subjects: + continue + + video_frames[(subj, action, camera)].append(idx) + + # build sample indices + sample_indices = [] + _len = (self.seq_len - 1) * self.seq_frame_interval + 1 + _step = self.seq_frame_interval + for _, _indices in sorted(video_frames.items()): + n_frame = len(_indices) + + if self.temporal_padding: + # Pad the sequence so that every frame in the sequence will be + # predicted. 
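+ # pad_left/pad_right count how many times the first/last frame index
+ # is repeated so that, after the strided slice below, every sample
+ # still contains exactly seq_len frames near the video borders.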
+ if self.causal: + frames_left = self.seq_len - 1 + frames_right = 0 + else: + frames_left = (self.seq_len - 1) // 2 + frames_right = frames_left + for i in range(n_frame): + pad_left = max(0, frames_left - i // _step) + pad_right = max(0, + frames_right - (n_frame - 1 - i) // _step) + start = max(i % _step, i - frames_left * _step) + end = min(n_frame - (n_frame - 1 - i) % _step, + i + frames_right * _step + 1) + sample_indices.append([_indices[0]] * pad_left + + _indices[start:end:_step] + + [_indices[-1]] * pad_right) + else: + seqs_from_video = [ + _indices[i:(i + _len):_step] + for i in range(0, n_frame - _len + 1) + ] + sample_indices.extend(seqs_from_video) + + # reduce dataset size if self.subset < 1 + assert 0 < self.subset <= 1 + subset_size = int(len(sample_indices) * self.subset) + start = np.random.randint(0, len(sample_indices) - subset_size + 1) + end = start + subset_size + + return sample_indices[start:end] + + def _load_joint_2d_detection(self, det_file): + """"Load 2D joint detection results from file.""" + joints_2d = np.load(det_file).astype(np.float32) + + return joints_2d + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mpjpe', **kwargs): + metrics = metric if isinstance(metric, list) else [metric] + for _metric in metrics: + if _metric not in self.ALLOWED_METRICS: + raise ValueError( + f'Unsupported metric "{_metric}" for human3.6 dataset.' + f'Supported metrics are {self.ALLOWED_METRICS}') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + image_paths = result['target_image_paths'] + batch_size = len(image_paths) + for i in range(batch_size): + target_id = self.name2id[image_paths[i]] + kpts.append({ + 'keypoints': preds[i], + 'target_id': target_id, + }) + + mmcv.dump(kpts, res_file) + + name_value_tuples = [] + for _metric in metrics: + if _metric == 'mpjpe': + _nv_tuples = self._report_mpjpe(kpts) + elif _metric == 'p-mpjpe': + _nv_tuples = self._report_mpjpe(kpts, mode='p-mpjpe') + elif _metric == 'n-mpjpe': + _nv_tuples = self._report_mpjpe(kpts, mode='n-mpjpe') + else: + raise NotImplementedError + name_value_tuples.extend(_nv_tuples) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return OrderedDict(name_value_tuples) + + def _report_mpjpe(self, keypoint_results, mode='mpjpe'): + """Cauculate mean per joint position error (MPJPE) or its variants like + P-MPJPE or N-MPJPE. + + Args: + keypoint_results (list): Keypoint predictions. See + 'Body3DH36MDataset.evaluate' for details. + mode (str): Specify mpjpe variants. Supported options are: + + - ``'mpjpe'``: Standard MPJPE. + - ``'p-mpjpe'``: MPJPE after aligning prediction to groundtruth + via a rigid transformation (scale, rotation and + translation). + - ``'n-mpjpe'``: MPJPE after aligning prediction to groundtruth + in scale only. 
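+
+ For every visible ground-truth joint, the error is the Euclidean distance between the predicted and ground-truth positions after the selected alignment; the reported value is the mean over joints and samples.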
+ """ + + preds = [] + gts = [] + masks = [] + action_category_indices = defaultdict(list) + for idx, result in enumerate(keypoint_results): + pred = result['keypoints'] + target_id = result['target_id'] + gt, gt_visible = np.split( + self.data_info['joints_3d'][target_id], [3], axis=-1) + preds.append(pred) + gts.append(gt) + masks.append(gt_visible) + + action = self._parse_h36m_imgname( + self.data_info['imgnames'][target_id])[1] + action_category = action.split('_')[0] + action_category_indices[action_category].append(idx) + + preds = np.stack(preds) + gts = np.stack(gts) + masks = np.stack(masks).squeeze(-1) > 0 + + err_name = mode.upper() + if mode == 'mpjpe': + alignment = 'none' + elif mode == 'p-mpjpe': + alignment = 'procrustes' + elif mode == 'n-mpjpe': + alignment = 'scale' + else: + raise ValueError(f'Invalid mode: {mode}') + + error = keypoint_mpjpe(preds, gts, masks, alignment) + name_value_tuples = [(err_name, error)] + + for action_category, indices in action_category_indices.items(): + _error = keypoint_mpjpe(preds[indices], gts[indices], + masks[indices]) + name_value_tuples.append((f'{err_name}_{action_category}', _error)) + + return name_value_tuples + + def _load_camera_param(self, camera_param_file): + """Load camera parameters from file.""" + return mmcv.load(camera_param_file) + + def get_camera_param(self, imgname): + """Get camera parameters of a frame by its image name.""" + assert hasattr(self, 'camera_param') + subj, _, camera = self._parse_h36m_imgname(imgname) + return self.camera_param[(subj, camera)] diff --git a/mmpose/datasets/datasets/body3d/body3d_mpi_inf_3dhp_dataset.py b/mmpose/datasets/datasets/body3d/body3d_mpi_inf_3dhp_dataset.py new file mode 100644 index 0000000..4d06fcd --- /dev/null +++ b/mmpose/datasets/datasets/body3d/body3d_mpi_inf_3dhp_dataset.py @@ -0,0 +1,417 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import mmcv +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.core.evaluation import (keypoint_3d_auc, keypoint_3d_pck, + keypoint_mpjpe) +from mmpose.datasets.datasets.base import Kpt3dSviewKpt2dDataset +from ...builder import DATASETS + + +@DATASETS.register_module() +class Body3DMpiInf3dhpDataset(Kpt3dSviewKpt2dDataset): + """MPI-INF-3DHP dataset for 3D human pose estimation. + + "Monocular 3D Human Pose Estimation In The Wild Using Improved CNN + Supervision", 3DV'2017. + More details can be found in the `paper + `__. + + MPI-INF-3DHP keypoint indexes: + + 0: 'head_top', + 1: 'neck', + 2: 'right_shoulder', + 3: 'right_elbow', + 4: 'right_wrist', + 5: 'left_shoulder;, + 6: 'left_elbow', + 7: 'left_wrist', + 8: 'right_hip', + 9: 'right_knee', + 10: 'right_ankle', + 11: 'left_hip', + 12: 'left_knee', + 13: 'left_ankle', + 14: 'root (pelvis)', + 15: 'spine', + 16: 'head' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): Data configurations. Please refer to the docstring of + Body3DBaseDataset for common data attributes. Here are MPI-INF-3DHP + specific attributes. + - joint_2d_src: 2D joint source. Options include: + "gt": from the annotation file + "detection": from a detection result file of 2D keypoint + "pipeline": will be generate by the pipeline + Default: "gt". + - joint_2d_det_file: Path to the detection result file of 2D + keypoint. Only used when joint_2d_src == "detection". 
+ - need_camera_param: Whether need camera parameters or not. + Default: False. + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + JOINT_NAMES = [ + 'HeadTop', 'Neck', 'RShoulder', 'RElbow', 'RWrist', 'LShoulder', + 'LElbow', 'LWrist', 'RHip', 'RKnee', 'RAnkle', 'LHip', 'LKnee', + 'LAnkle', 'Root', 'Spine', 'Head' + ] + + # 2D joint source options: + # "gt": from the annotation file + # "detection": from a detection result file of 2D keypoint + # "pipeline": will be generate by the pipeline + SUPPORTED_JOINT_2D_SRC = {'gt', 'detection', 'pipeline'} + + # metric + ALLOWED_METRICS = { + 'mpjpe', 'p-mpjpe', '3dpck', 'p-3dpck', '3dauc', 'p-3dauc' + } + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/mpi_inf_3dhp.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + def load_config(self, data_cfg): + super().load_config(data_cfg) + # mpi-inf-3dhp specific attributes + self.joint_2d_src = data_cfg.get('joint_2d_src', 'gt') + if self.joint_2d_src not in self.SUPPORTED_JOINT_2D_SRC: + raise ValueError( + f'Unsupported joint_2d_src "{self.joint_2d_src}". ' + f'Supported options are {self.SUPPORTED_JOINT_2D_SRC}') + + self.joint_2d_det_file = data_cfg.get('joint_2d_det_file', None) + + self.need_camera_param = data_cfg.get('need_camera_param', False) + if self.need_camera_param: + assert 'camera_param_file' in data_cfg + self.camera_param = self._load_camera_param( + data_cfg['camera_param_file']) + + # mpi-inf-3dhp specific annotation info + ann_info = {} + ann_info['use_different_joint_weights'] = False + + self.ann_info.update(ann_info) + + def load_annotations(self): + data_info = super().load_annotations() + + # get 2D joints + if self.joint_2d_src == 'gt': + data_info['joints_2d'] = data_info['joints_2d'] + elif self.joint_2d_src == 'detection': + data_info['joints_2d'] = self._load_joint_2d_detection( + self.joint_2d_det_file) + assert data_info['joints_2d'].shape[0] == data_info[ + 'joints_3d'].shape[0] + assert data_info['joints_2d'].shape[2] == 3 + elif self.joint_2d_src == 'pipeline': + # joint_2d will be generated in the pipeline + pass + else: + raise NotImplementedError( + f'Unhandled joint_2d_src option {self.joint_2d_src}') + + return data_info + + @staticmethod + def _parse_mpi_inf_3dhp_imgname(imgname): + """Parse imgname to get information of subject, sequence and camera. + + A typical mpi-inf-3dhp training image filename is like: + S1_Seq1_Cam0_000001.jpg. A typical mpi-inf-3dhp testing image filename + is like: TS1_000001.jpg + """ + if imgname[0] == 'S': + subj, rest = imgname.split('_', 1) + seq, rest = rest.split('_', 1) + camera, rest = rest.split('_', 1) + return subj, seq, camera + else: + subj, rest = imgname.split('_', 1) + return subj, None, None + + def build_sample_indices(self): + """Split original videos into sequences and build frame indices. + + This method overrides the default one in the base class. + """ + + # Group frames into videos. 
Assume that self.data_info is + # chronological. + video_frames = defaultdict(list) + for idx, imgname in enumerate(self.data_info['imgnames']): + subj, seq, camera = self._parse_mpi_inf_3dhp_imgname(imgname) + if seq is not None: + video_frames[(subj, seq, camera)].append(idx) + else: + video_frames[subj].append(idx) + + # build sample indices + sample_indices = [] + _len = (self.seq_len - 1) * self.seq_frame_interval + 1 + _step = self.seq_frame_interval + for _, _indices in sorted(video_frames.items()): + n_frame = len(_indices) + + if self.temporal_padding: + # Pad the sequence so that every frame in the sequence will be + # predicted. + if self.causal: + frames_left = self.seq_len - 1 + frames_right = 0 + else: + frames_left = (self.seq_len - 1) // 2 + frames_right = frames_left + for i in range(n_frame): + pad_left = max(0, frames_left - i // _step) + pad_right = max(0, + frames_right - (n_frame - 1 - i) // _step) + start = max(i % _step, i - frames_left * _step) + end = min(n_frame - (n_frame - 1 - i) % _step, + i + frames_right * _step + 1) + sample_indices.append([_indices[0]] * pad_left + + _indices[start:end:_step] + + [_indices[-1]] * pad_right) + else: + seqs_from_video = [ + _indices[i:(i + _len):_step] + for i in range(0, n_frame - _len + 1) + ] + sample_indices.extend(seqs_from_video) + + # reduce dataset size if self.subset < 1 + assert 0 < self.subset <= 1 + subset_size = int(len(sample_indices) * self.subset) + start = np.random.randint(0, len(sample_indices) - subset_size + 1) + end = start + subset_size + + return sample_indices[start:end] + + def _load_joint_2d_detection(self, det_file): + """"Load 2D joint detection results from file.""" + joints_2d = np.load(det_file).astype(np.float32) + + return joints_2d + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mpjpe', **kwargs): + metrics = metric if isinstance(metric, list) else [metric] + for _metric in metrics: + if _metric not in self.ALLOWED_METRICS: + raise ValueError( + f'Unsupported metric "{_metric}" for mpi-inf-3dhp dataset.' + f'Supported metrics are {self.ALLOWED_METRICS}') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + image_paths = result['target_image_paths'] + batch_size = len(image_paths) + for i in range(batch_size): + target_id = self.name2id[image_paths[i]] + kpts.append({ + 'keypoints': preds[i], + 'target_id': target_id, + }) + + mmcv.dump(kpts, res_file) + + name_value_tuples = [] + for _metric in metrics: + if _metric == 'mpjpe': + _nv_tuples = self._report_mpjpe(kpts) + elif _metric == 'p-mpjpe': + _nv_tuples = self._report_mpjpe(kpts, mode='p-mpjpe') + elif _metric == '3dpck': + _nv_tuples = self._report_3d_pck(kpts) + elif _metric == 'p-3dpck': + _nv_tuples = self._report_3d_pck(kpts, mode='p-3dpck') + elif _metric == '3dauc': + _nv_tuples = self._report_3d_auc(kpts) + elif _metric == 'p-3dauc': + _nv_tuples = self._report_3d_auc(kpts, mode='p-3dauc') + else: + raise NotImplementedError + name_value_tuples.extend(_nv_tuples) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return OrderedDict(name_value_tuples) + + def _report_mpjpe(self, keypoint_results, mode='mpjpe'): + """Cauculate mean per joint position error (MPJPE) or its variants + P-MPJPE. 
+ + Args: + keypoint_results (list): Keypoint predictions. See + 'Body3DMpiInf3dhpDataset.evaluate' for details. + mode (str): Specify mpjpe variants. Supported options are: + - ``'mpjpe'``: Standard MPJPE. + - ``'p-mpjpe'``: MPJPE after aligning prediction to groundtruth + via a rigid transformation (scale, rotation and + translation). + """ + + preds = [] + gts = [] + for idx, result in enumerate(keypoint_results): + pred = result['keypoints'] + target_id = result['target_id'] + gt, gt_visible = np.split( + self.data_info['joints_3d'][target_id], [3], axis=-1) + preds.append(pred) + gts.append(gt) + + preds = np.stack(preds) + gts = np.stack(gts) + masks = np.ones_like(gts[:, :, 0], dtype=bool) + + err_name = mode.upper() + if mode == 'mpjpe': + alignment = 'none' + elif mode == 'p-mpjpe': + alignment = 'procrustes' + else: + raise ValueError(f'Invalid mode: {mode}') + + error = keypoint_mpjpe(preds, gts, masks, alignment) + name_value_tuples = [(err_name, error)] + + return name_value_tuples + + def _report_3d_pck(self, keypoint_results, mode='3dpck'): + """Cauculate Percentage of Correct Keypoints (3DPCK) w. or w/o + Procrustes alignment. + + Args: + keypoint_results (list): Keypoint predictions. See + 'Body3DMpiInf3dhpDataset.evaluate' for details. + mode (str): Specify mpjpe variants. Supported options are: + - ``'3dpck'``: Standard 3DPCK. + - ``'p-3dpck'``: 3DPCK after aligning prediction to groundtruth + via a rigid transformation (scale, rotation and + translation). + """ + + preds = [] + gts = [] + for idx, result in enumerate(keypoint_results): + pred = result['keypoints'] + target_id = result['target_id'] + gt, gt_visible = np.split( + self.data_info['joints_3d'][target_id], [3], axis=-1) + preds.append(pred) + gts.append(gt) + + preds = np.stack(preds) + gts = np.stack(gts) + masks = np.ones_like(gts[:, :, 0], dtype=bool) + + err_name = mode.upper() + if mode == '3dpck': + alignment = 'none' + elif mode == 'p-3dpck': + alignment = 'procrustes' + else: + raise ValueError(f'Invalid mode: {mode}') + + error = keypoint_3d_pck(preds, gts, masks, alignment) + name_value_tuples = [(err_name, error)] + + return name_value_tuples + + def _report_3d_auc(self, keypoint_results, mode='3dauc'): + """Cauculate the Area Under the Curve (AUC) computed for a range of + 3DPCK thresholds. + + Args: + keypoint_results (list): Keypoint predictions. See + 'Body3DMpiInf3dhpDataset.evaluate' for details. + mode (str): Specify mpjpe variants. Supported options are: + + - ``'3dauc'``: Standard 3DAUC. + - ``'p-3dauc'``: 3DAUC after aligning prediction to + groundtruth via a rigid transformation (scale, rotation and + translation). 
+ """ + + preds = [] + gts = [] + for idx, result in enumerate(keypoint_results): + pred = result['keypoints'] + target_id = result['target_id'] + gt, gt_visible = np.split( + self.data_info['joints_3d'][target_id], [3], axis=-1) + preds.append(pred) + gts.append(gt) + + preds = np.stack(preds) + gts = np.stack(gts) + masks = np.ones_like(gts[:, :, 0], dtype=bool) + + err_name = mode.upper() + if mode == '3dauc': + alignment = 'none' + elif mode == 'p-3dauc': + alignment = 'procrustes' + else: + raise ValueError(f'Invalid mode: {mode}') + + error = keypoint_3d_auc(preds, gts, masks, alignment) + name_value_tuples = [(err_name, error)] + + return name_value_tuples + + def _load_camera_param(self, camear_param_file): + """Load camera parameters from file.""" + return mmcv.load(camear_param_file) + + def get_camera_param(self, imgname): + """Get camera parameters of a frame by its image name.""" + assert hasattr(self, 'camera_param') + return self.camera_param[imgname[:-11]] diff --git a/mmpose/datasets/datasets/body3d/body3d_mview_direct_panoptic_dataset.py b/mmpose/datasets/datasets/body3d/body3d_mview_direct_panoptic_dataset.py new file mode 100644 index 0000000..b5bf92d --- /dev/null +++ b/mmpose/datasets/datasets/body3d/body3d_mview_direct_panoptic_dataset.py @@ -0,0 +1,493 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import glob +import json +import os.path as osp +import pickle +import tempfile +import warnings +from collections import OrderedDict + +import mmcv +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.core.camera import SimpleCamera +from mmpose.datasets.builder import DATASETS +from mmpose.datasets.datasets.base import Kpt3dMviewRgbImgDirectDataset + + +@DATASETS.register_module() +class Body3DMviewDirectPanopticDataset(Kpt3dMviewRgbImgDirectDataset): + """Panoptic dataset for direct multi-view human pose estimation. + + `Panoptic Studio: A Massively Multiview System for Social Motion + Capture' ICCV'2015 + More details can be found in the `paper + `__ . + + The dataset loads both 2D and 3D annotations as well as camera parameters. + + Panoptic keypoint indexes:: + + 'neck': 0, + 'nose': 1, + 'mid-hip': 2, + 'l-shoulder': 3, + 'l-elbow': 4, + 'l-wrist': 5, + 'l-hip': 6, + 'l-knee': 7, + 'l-ankle': 8, + 'r-shoulder': 9, + 'r-elbow': 10, + 'r-wrist': 11, + 'r-hip': 12, + 'r-knee': 13, + 'r-ankle': 14, + 'l-eye': 15, + 'l-ear': 16, + 'r-eye': 17, + 'r-ear': 18, + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + ALLOWED_METRICS = {'mpjpe', 'mAP'} + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/panoptic_body3d.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.load_config(data_cfg) + self.ann_info['use_different_joint_weights'] = False + + if ann_file is None: + self.db_file = osp.join( + img_prefix, f'group_{self.subset}_cam{self.num_cameras}.pkl') + else: + self.db_file = ann_file + + if osp.exists(self.db_file): + with open(self.db_file, 'rb') as f: + info = pickle.load(f) + assert info['sequence_list'] == self.seq_list + assert info['interval'] == self.seq_frame_interval + assert info['cam_list'] == self.cam_list + self.db = info['db'] + else: + self.db = self._get_db() + info = { + 'sequence_list': self.seq_list, + 'interval': self.seq_frame_interval, + 'cam_list': self.cam_list, + 'db': self.db + } + with open(self.db_file, 'wb') as f: + pickle.dump(info, f) + + self.db_size = len(self.db) + + print(f'=> load {len(self.db)} samples') + + def load_config(self, data_cfg): + """Initialize dataset attributes according to the config. + + Override this method to set dataset specific attributes. + """ + self.num_joints = data_cfg['num_joints'] + assert self.num_joints <= 19 + self.seq_list = data_cfg['seq_list'] + self.cam_list = data_cfg['cam_list'] + self.num_cameras = data_cfg['num_cameras'] + assert self.num_cameras == len(self.cam_list) + self.seq_frame_interval = data_cfg.get('seq_frame_interval', 1) + self.subset = data_cfg.get('subset', 'train') + self.need_camera_param = True + self.root_id = data_cfg.get('root_id', 0) + self.max_persons = data_cfg.get('max_num', 10) + + def _get_scale(self, raw_image_size): + heatmap_size = self.ann_info['heatmap_size'] + image_size = self.ann_info['image_size'] + assert heatmap_size[0][0] / heatmap_size[0][1] \ + == image_size[0] / image_size[1] + w, h = raw_image_size + w_resized, h_resized = image_size + if w / w_resized < h / h_resized: + w_pad = h / h_resized * w_resized + h_pad = h + else: + w_pad = w + h_pad = w / w_resized * h_resized + + scale = np.array([w_pad, h_pad], dtype=np.float32) + + return scale + + def _get_cam(self, seq): + """Get camera parameters. + + Args: + seq (str): Sequence name. + + Returns: Camera parameters. + """ + cam_file = osp.join(self.img_prefix, seq, + 'calibration_{:s}.json'.format(seq)) + with open(cam_file) as cfile: + calib = json.load(cfile) + + M = np.array([[1.0, 0.0, 0.0], [0.0, 0.0, -1.0], [0.0, 1.0, 0.0]]) + cameras = {} + for cam in calib['cameras']: + if (cam['panel'], cam['node']) in self.cam_list: + sel_cam = {} + R_w2c = np.array(cam['R']).dot(M) + T_w2c = np.array(cam['t']).reshape((3, 1)) * 10.0 # cm to mm + R_c2w = R_w2c.T + T_c2w = -R_w2c.T @ T_w2c + sel_cam['R'] = R_c2w.tolist() + sel_cam['T'] = T_c2w.tolist() + sel_cam['K'] = cam['K'][:2] + distCoef = cam['distCoef'] + sel_cam['k'] = [distCoef[0], distCoef[1], distCoef[4]] + sel_cam['p'] = [distCoef[2], distCoef[3]] + cameras[(cam['panel'], cam['node'])] = sel_cam + + return cameras + + def _get_db(self): + """Get dataset base. 
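+ One entry is created per selected camera view and per sampled frame, with 3D poses read from the per-sequence ``hdPose3d_stage1_coco19`` json files.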
+ + Returns: + dict: the dataset base (2D and 3D information) + """ + width = 1920 + height = 1080 + db = [] + sample_id = 0 + for seq in self.seq_list: + cameras = self._get_cam(seq) + curr_anno = osp.join(self.img_prefix, seq, + 'hdPose3d_stage1_coco19') + anno_files = sorted(glob.iglob('{:s}/*.json'.format(curr_anno))) + print(f'load sequence: {seq}', flush=True) + for i, file in enumerate(anno_files): + if i % self.seq_frame_interval == 0: + with open(file) as dfile: + bodies = json.load(dfile)['bodies'] + if len(bodies) == 0: + continue + + for k, cam_param in cameras.items(): + single_view_camera = SimpleCamera(cam_param) + postfix = osp.basename(file).replace('body3DScene', '') + prefix = '{:02d}_{:02d}'.format(k[0], k[1]) + image_file = osp.join(seq, 'hdImgs', prefix, + prefix + postfix) + image_file = image_file.replace('json', 'jpg') + + all_poses_3d = np.zeros( + (self.max_persons, self.num_joints, 3), + dtype=np.float32) + all_poses_vis_3d = np.zeros( + (self.max_persons, self.num_joints, 3), + dtype=np.float32) + all_roots_3d = np.zeros((self.max_persons, 3), + dtype=np.float32) + all_poses = np.zeros( + (self.max_persons, self.num_joints, 3), + dtype=np.float32) + + cnt = 0 + person_ids = -np.ones(self.max_persons, dtype=int) + for body in bodies: + if cnt >= self.max_persons: + break + pose3d = np.array(body['joints19']).reshape( + (-1, 4)) + pose3d = pose3d[:self.num_joints] + + joints_vis = pose3d[:, -1] > 0.1 + + if not joints_vis[self.root_id]: + continue + + # Coordinate transformation + M = np.array([[1.0, 0.0, 0.0], [0.0, 0.0, -1.0], + [0.0, 1.0, 0.0]]) + pose3d[:, 0:3] = pose3d[:, 0:3].dot(M) * 10.0 + + all_poses_3d[cnt] = pose3d[:, :3] + all_roots_3d[cnt] = pose3d[self.root_id, :3] + all_poses_vis_3d[cnt] = np.repeat( + np.reshape(joints_vis, (-1, 1)), 3, axis=1) + + pose2d = np.zeros((pose3d.shape[0], 3)) + # get pose_2d from pose_3d + pose2d[:, :2] = single_view_camera.world_to_pixel( + pose3d[:, :3]) + x_check = np.bitwise_and(pose2d[:, 0] >= 0, + pose2d[:, 0] <= width - 1) + y_check = np.bitwise_and( + pose2d[:, 1] >= 0, pose2d[:, 1] <= height - 1) + check = np.bitwise_and(x_check, y_check) + joints_vis[np.logical_not(check)] = 0 + pose2d[:, -1] = joints_vis + + all_poses[cnt] = pose2d + person_ids[cnt] = body['id'] + cnt += 1 + + if cnt > 0: + db.append({ + 'image_file': + osp.join(self.img_prefix, image_file), + 'joints_3d': + all_poses_3d, + 'person_ids': + person_ids, + 'joints_3d_visible': + all_poses_vis_3d, + 'joints': [all_poses], + 'roots_3d': + all_roots_3d, + 'camera': + cam_param, + 'num_persons': + cnt, + 'sample_id': + sample_id, + 'center': + np.array((width / 2, height / 2), + dtype=np.float32), + 'scale': + self._get_scale((width, height)) + }) + sample_id += 1 + return db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mpjpe', **kwargs): + """ + + Args: + results (list[dict]): Testing results containing the following + items: + - pose_3d (np.ndarray): predicted 3D human pose + - sample_id (np.ndarray): sample id of a frame. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Defaults: 'mpjpe'. 
+ **kwargs: + + Returns: + + """ + pose_3ds = np.concatenate([result['pose_3d'] for result in results], + axis=0) + sample_ids = [] + for result in results: + sample_ids.extend(result['sample_id']) + + _results = [ + dict(sample_id=sample_id, pose_3d=pose_3d) + for (sample_id, pose_3d) in zip(sample_ids, pose_3ds) + ] + _results = self._sort_and_unique_outputs(_results, key='sample_id') + + metrics = metric if isinstance(metric, list) else [metric] + for _metric in metrics: + if _metric not in self.ALLOWED_METRICS: + raise ValueError( + f'Unsupported metric "{_metric}"' + f'Supported metrics are {self.ALLOWED_METRICS}') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + mmcv.dump(_results, res_file) + + eval_list = [] + gt_num = self.db_size // self.num_cameras + assert len( + _results) == gt_num, f'number mismatch: {len(_results)}, {gt_num}' + + total_gt = 0 + for i in range(gt_num): + index = self.num_cameras * i + db_rec = copy.deepcopy(self.db[index]) + joints_3d = db_rec['joints_3d'] + joints_3d_vis = db_rec['joints_3d_visible'] + + if joints_3d_vis.sum() < 1: + continue + + pred = _results[i]['pose_3d'].copy() + pred = pred[pred[:, 0, 3] >= 0] + for pose in pred: + mpjpes = [] + for (gt, gt_vis) in zip(joints_3d, joints_3d_vis): + vis = gt_vis[:, 0] > 0 + if vis.sum() < 1: + break + mpjpe = np.mean( + np.sqrt( + np.sum((pose[vis, 0:3] - gt[vis])**2, axis=-1))) + mpjpes.append(mpjpe) + min_gt = np.argmin(mpjpes) + min_mpjpe = np.min(mpjpes) + score = pose[0, 4] + eval_list.append({ + 'mpjpe': float(min_mpjpe), + 'score': float(score), + 'gt_id': int(total_gt + min_gt) + }) + + total_gt += (joints_3d_vis[:, :, 0].sum(-1) >= 1).sum() + + mpjpe_threshold = np.arange(25, 155, 25) + aps = [] + ars = [] + for t in mpjpe_threshold: + ap, ar = self._eval_list_to_ap(eval_list, total_gt, t) + aps.append(ap) + ars.append(ar) + + name_value_tuples = [] + for _metric in metrics: + if _metric == 'mpjpe': + stats_names = ['RECALL 500mm', 'MPJPE 500mm'] + info_str = list( + zip(stats_names, [ + self._eval_list_to_recall(eval_list, total_gt), + self._eval_list_to_mpjpe(eval_list) + ])) + elif _metric == 'mAP': + stats_names = [ + 'AP 25', 'AP 50', 'AP 75', 'AP 100', 'AP 125', 'AP 150', + 'mAP', 'AR 25', 'AR 50', 'AR 75', 'AR 100', 'AR 125', + 'AR 150', 'mAR' + ] + mAP = np.array(aps).mean() + mAR = np.array(ars).mean() + info_str = list(zip(stats_names, aps + [mAP] + ars + [mAR])) + else: + raise NotImplementedError + name_value_tuples.extend(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return OrderedDict(name_value_tuples) + + @staticmethod + def _eval_list_to_ap(eval_list, total_gt, threshold): + """Get Average Precision (AP) and Average Recall at a certain + threshold.""" + + eval_list.sort(key=lambda k: k['score'], reverse=True) + total_num = len(eval_list) + + tp = np.zeros(total_num) + fp = np.zeros(total_num) + gt_det = [] + for i, item in enumerate(eval_list): + if item['mpjpe'] < threshold and item['gt_id'] not in gt_det: + tp[i] = 1 + gt_det.append(item['gt_id']) + else: + fp[i] = 1 + tp = np.cumsum(tp) + fp = np.cumsum(fp) + recall = tp / (total_gt + 1e-5) + precise = tp / (tp + fp + 1e-5) + for n in range(total_num - 2, -1, -1): + precise[n] = max(precise[n], precise[n + 1]) + + precise = np.concatenate(([0], precise, [0])) + recall = np.concatenate(([0], recall, [1])) + index = 
np.where(recall[1:] != recall[:-1])[0] + ap = np.sum((recall[index + 1] - recall[index]) * precise[index + 1]) + + return ap, recall[-2] + + @staticmethod + def _eval_list_to_mpjpe(eval_list, threshold=500): + """Get MPJPE within a certain threshold.""" + eval_list.sort(key=lambda k: k['score'], reverse=True) + gt_det = [] + + mpjpes = [] + for i, item in enumerate(eval_list): + if item['mpjpe'] < threshold and item['gt_id'] not in gt_det: + mpjpes.append(item['mpjpe']) + gt_det.append(item['gt_id']) + + return np.mean(mpjpes) if len(mpjpes) > 0 else np.inf + + @staticmethod + def _eval_list_to_recall(eval_list, total_gt, threshold=500): + """Get Recall at a certain threshold.""" + gt_ids = [e['gt_id'] for e in eval_list if e['mpjpe'] < threshold] + + return len(np.unique(gt_ids)) / total_gt + + def __getitem__(self, idx): + """Get the sample given index.""" + results = {} + for c in range(self.num_cameras): + result = copy.deepcopy(self.db[self.num_cameras * idx + c]) + result['ann_info'] = self.ann_info + width = 1920 + height = 1080 + result['mask'] = [np.ones((height, width), dtype=np.float32)] + results[c] = result + + return self.pipeline(results) + + @staticmethod + def _sort_and_unique_outputs(outputs, key='sample_id'): + """sort outputs and remove the repeated ones.""" + outputs = sorted(outputs, key=lambda x: x[key]) + num_outputs = len(outputs) + for i in range(num_outputs - 1, 0, -1): + if outputs[i][key] == outputs[i - 1][key]: + del outputs[i] + + return outputs diff --git a/mmpose/datasets/datasets/body3d/body3d_semi_supervision_dataset.py b/mmpose/datasets/datasets/body3d/body3d_semi_supervision_dataset.py new file mode 100644 index 0000000..491d549 --- /dev/null +++ b/mmpose/datasets/datasets/body3d/body3d_semi_supervision_dataset.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from torch.utils.data import Dataset + +from mmpose.datasets.builder import DATASETS, build_dataset + + +@DATASETS.register_module() +class Body3DSemiSupervisionDataset(Dataset): + """Mix Dataset for semi-supervised training in 3D human pose estimation + task. + + The dataset combines data from two datasets (a labeled one and an unlabeled + one) and return a dict containing data from two datasets. + + Args: + labeled_dataset (Dataset): Dataset with 3D keypoint annotations. + unlabeled_dataset (Dataset): Dataset without 3D keypoint annotations. + """ + + def __init__(self, labeled_dataset, unlabeled_dataset): + super().__init__() + self.labeled_dataset = build_dataset(labeled_dataset) + self.unlabeled_dataset = build_dataset(unlabeled_dataset) + self.length = len(self.unlabeled_dataset) + + def __len__(self): + """Get the size of the dataset.""" + return self.length + + def __getitem__(self, i): + """Given index, get the data from unlabeled dataset and randomly sample + an item from labeled dataset. + + Return a dict containing data from labeled and unlabeled dataset. + """ + data = self.unlabeled_dataset[i] + rand_ind = np.random.randint(0, len(self.labeled_dataset)) + labeled_data = self.labeled_dataset[rand_ind] + data.update(labeled_data) + return data diff --git a/mmpose/datasets/datasets/bottom_up/__init__.py b/mmpose/datasets/datasets/bottom_up/__init__.py new file mode 100644 index 0000000..2ac7937 --- /dev/null +++ b/mmpose/datasets/datasets/bottom_up/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .bottom_up_aic import BottomUpAicDataset +from .bottom_up_coco import BottomUpCocoDataset +from .bottom_up_coco_wholebody import BottomUpCocoWholeBodyDataset +from .bottom_up_crowdpose import BottomUpCrowdPoseDataset +from .bottom_up_mhp import BottomUpMhpDataset + +__all__ = [ + 'BottomUpCocoDataset', 'BottomUpCrowdPoseDataset', 'BottomUpMhpDataset', + 'BottomUpAicDataset', 'BottomUpCocoWholeBodyDataset' +] diff --git a/mmpose/datasets/datasets/bottom_up/bottom_up_aic.py b/mmpose/datasets/datasets/bottom_up/bottom_up_aic.py new file mode 100644 index 0000000..e56b725 --- /dev/null +++ b/mmpose/datasets/datasets/bottom_up/bottom_up_aic.py @@ -0,0 +1,105 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import json_tricks as json +from mmcv import Config +from xtcocotools.cocoeval import COCOeval + +from mmpose.datasets.builder import DATASETS +from .bottom_up_coco import BottomUpCocoDataset + + +@DATASETS.register_module() +class BottomUpAicDataset(BottomUpCocoDataset): + """Aic dataset for bottom-up pose estimation. + + "AI Challenger : A Large-scale Dataset for Going Deeper + in Image Understanding", arXiv'2017. + More details can be found in the `paper + `__ + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + AIC keypoint indexes:: + + 0: "right_shoulder", + 1: "right_elbow", + 2: "right_wrist", + 3: "left_shoulder", + 4: "left_elbow", + 5: "left_wrist", + 6: "right_hip", + 7: "right_knee", + 8: "right_ankle", + 9: "left_hip", + 10: "left_knee", + 11: "left_ankle", + 12: "head_top", + 13: "neck" + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/aic.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(BottomUpCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + print(f'=> num_images: {self.num_images}') + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + with open(res_file, 'r') as file: + res_json = json.load(file) + if not res_json: + info_str = list(zip(stats_names, [ + 0, + ] * len(stats_names))) + return info_str + + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval( + self.coco, coco_det, 'keypoints', self.sigmas, use_area=False) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/bottom_up/bottom_up_base_dataset.py b/mmpose/datasets/datasets/bottom_up/bottom_up_base_dataset.py new file mode 100644 index 0000000..6a2fea5 --- /dev/null +++ b/mmpose/datasets/datasets/bottom_up/bottom_up_base_dataset.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from torch.utils.data import Dataset + + +class BottomUpBaseDataset(Dataset): + """This class has been deprecated and replaced by + Kpt2dSviewRgbImgBottomUpDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'BottomUpBaseDataset has been replaced by ' + 'Kpt2dSviewRgbImgBottomUpDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/663 for details.') + ) diff --git a/mmpose/datasets/datasets/bottom_up/bottom_up_coco.py b/mmpose/datasets/datasets/bottom_up/bottom_up_coco.py new file mode 100644 index 0000000..fa2967f --- /dev/null +++ b/mmpose/datasets/datasets/bottom_up/bottom_up_coco.py @@ -0,0 +1,305 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning +from xtcocotools.cocoeval import COCOeval + +from mmpose.core.post_processing import oks_nms, soft_oks_nms +from mmpose.datasets.builder import DATASETS +from mmpose.datasets.datasets.base import Kpt2dSviewRgbImgBottomUpDataset + + +@DATASETS.register_module() +class BottomUpCocoDataset(Kpt2dSviewRgbImgBottomUpDataset): + """COCO dataset for bottom-up pose estimation. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + COCO keypoint indexes:: + + 0: 'nose', + 1: 'left_eye', + 2: 'right_eye', + 3: 'left_ear', + 4: 'right_ear', + 5: 'left_shoulder', + 6: 'right_shoulder', + 7: 'left_elbow', + 8: 'right_elbow', + 9: 'left_wrist', + 10: 'right_wrist', + 11: 'left_hip', + 12: 'right_hip', + 13: 'left_knee', + 14: 'right_knee', + 15: 'left_ankle', + 16: 'right_ankle' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. 
+ test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/coco.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + print(f'=> num_images: {self.num_images}') + + def _get_single(self, idx): + """Get anno for a single image. + + Args: + idx (int): image idx + + Returns: + dict: info for model training + """ + coco = self.coco + img_id = self.img_ids[idx] + ann_ids = coco.getAnnIds(imgIds=img_id) + anno = coco.loadAnns(ann_ids) + + mask = self._get_mask(anno, idx) + anno = [ + obj.copy() for obj in anno + if obj['iscrowd'] == 0 or obj['num_keypoints'] > 0 + ] + + joints = self._get_joints(anno) + mask_list = [mask.copy() for _ in range(self.ann_info['num_scales'])] + joints_list = [ + joints.copy() for _ in range(self.ann_info['num_scales']) + ] + + db_rec = {} + db_rec['dataset'] = self.dataset_name + db_rec['image_file'] = osp.join(self.img_prefix, self.id2name[img_id]) + db_rec['mask'] = mask_list + db_rec['joints'] = joints_list + + return db_rec + + def _get_joints(self, anno): + """Get joints for all people in an image.""" + num_people = len(anno) + + if self.ann_info['scale_aware_sigma']: + joints = np.zeros((num_people, self.ann_info['num_joints'], 4), + dtype=np.float32) + else: + joints = np.zeros((num_people, self.ann_info['num_joints'], 3), + dtype=np.float32) + + for i, obj in enumerate(anno): + joints[i, :, :3] = \ + np.array(obj['keypoints']).reshape([-1, 3]) + if self.ann_info['scale_aware_sigma']: + # get person box + box = obj['bbox'] + size = max(box[2], box[3]) + sigma = size / self.base_size * self.base_sigma + if self.int_sigma: + sigma = int(np.ceil(sigma)) + assert sigma > 0, sigma + joints[i, :, 3] = sigma + + return joints + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mAP', **kwargs): + """Evaluate coco keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - num_people: P + - num_keypoints: K + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (list[np.ndarray(P, K, 3+tag_num)]): \ + Pose predictions for all people in images. + - scores (list[P]): List of person scores. + - image_path (list[str]): For example, ['coco/images/\ + val2017/000000397133.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model outputs. + + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'mAP'. + + Returns: + dict: Evaluation results for evaluation metric. 
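+
+        Example (a minimal, hypothetical sketch of the ``results`` list this
+        method expects; the zero-valued arrays, the tag dimension and the
+        ``dataset`` instance are placeholders, not real model outputs)::
+
+            import numpy as np
+
+            num_joints, tag_dim = 17, 1
+            # one image containing a single detected person
+            results = [dict(
+                preds=np.zeros((1, num_joints, 3 + tag_dim), dtype=np.float32),
+                scores=[1.0],
+                image_paths=['coco/images/val2017/000000397133.jpg'],
+                heatmap=None,  # listed above; not read by this method
+            )]
+            # with a constructed dataset instance one would then call:
+            # name_value = dataset.evaluate(results, metric='mAP')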
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['mAP'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + preds = [] + scores = [] + image_paths = [] + + for result in results: + preds.append(result['preds']) + scores.append(result['scores']) + image_paths.append(result['image_paths'][0]) + + kpts = defaultdict(list) + # iterate over images + for idx, _preds in enumerate(preds): + str_image_path = image_paths[idx] + image_id = self.name2id[osp.basename(str_image_path)] + # iterate over people + for idx_person, kpt in enumerate(_preds): + # use bbox area + area = (np.max(kpt[:, 0]) - np.min(kpt[:, 0])) * ( + np.max(kpt[:, 1]) - np.min(kpt[:, 1])) + + kpts[image_id].append({ + 'keypoints': kpt[:, 0:3], + 'score': scores[idx][idx_person], + 'tags': kpt[:, 3], + 'image_id': image_id, + 'area': area, + }) + + valid_kpts = [] + for img in kpts.keys(): + img_kpts = kpts[img] + if self.use_nms: + nms = soft_oks_nms if self.soft_nms else oks_nms + keep = nms(img_kpts, self.oks_thr, sigmas=self.sigmas) + valid_kpts.append([img_kpts[_keep] for _keep in keep]) + else: + valid_kpts.append(img_kpts) + + self._write_coco_keypoint_results(valid_kpts, res_file) + + info_str = self._do_python_keypoint_eval(res_file) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + def _write_coco_keypoint_results(self, keypoints, res_file): + """Write results into a json file.""" + data_pack = [{ + 'cat_id': self._class_to_coco_ind[cls], + 'cls_ind': cls_ind, + 'cls': cls, + 'ann_type': 'keypoints', + 'keypoints': keypoints + } for cls_ind, cls in enumerate(self.classes) + if not cls == '__background__'] + + results = self._coco_keypoint_results_one_category_kernel(data_pack[0]) + + with open(res_file, 'w') as f: + json.dump(results, f, sort_keys=True, indent=4) + + def _coco_keypoint_results_one_category_kernel(self, data_pack): + """Get coco keypoint results.""" + cat_id = data_pack['cat_id'] + keypoints = data_pack['keypoints'] + cat_results = [] + + for img_kpts in keypoints: + if len(img_kpts) == 0: + continue + + _key_points = np.array( + [img_kpt['keypoints'] for img_kpt in img_kpts]) + key_points = _key_points.reshape(-1, + self.ann_info['num_joints'] * 3) + + for img_kpt, key_point in zip(img_kpts, key_points): + kpt = key_point.reshape((self.ann_info['num_joints'], 3)) + left_top = np.amin(kpt, axis=0) + right_bottom = np.amax(kpt, axis=0) + + w = right_bottom[0] - left_top[0] + h = right_bottom[1] - left_top[1] + + cat_results.append({ + 'image_id': img_kpt['image_id'], + 'category_id': cat_id, + 'keypoints': key_point.tolist(), + 'score': img_kpt['score'], + 'bbox': [left_top[0], left_top[1], w, h] + }) + + return cat_results + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + with open(res_file, 'r') as file: + res_json = json.load(file) + if not res_json: + info_str = list(zip(stats_names, [ + 0, + ] * len(stats_names))) + return info_str + + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval(self.coco, coco_det, 'keypoints', self.sigmas) + 
coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/bottom_up/bottom_up_coco_wholebody.py b/mmpose/datasets/datasets/bottom_up/bottom_up_coco_wholebody.py new file mode 100644 index 0000000..363d2ef --- /dev/null +++ b/mmpose/datasets/datasets/bottom_up/bottom_up_coco_wholebody.py @@ -0,0 +1,238 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np +from mmcv import Config +from xtcocotools.cocoeval import COCOeval + +from mmpose.datasets.builder import DATASETS +from .bottom_up_coco import BottomUpCocoDataset + + +@DATASETS.register_module() +class BottomUpCocoWholeBodyDataset(BottomUpCocoDataset): + """CocoWholeBodyDataset dataset for bottom-up pose estimation. + + `Whole-Body Human Pose Estimation in the Wild', ECCV'2020. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + In total, we have 133 keypoints for wholebody pose estimation. + + COCO-WholeBody keypoint indexes:: + + 0-16: 17 body keypoints, + 17-22: 6 foot keypoints, + 23-90: 68 face keypoints, + 91-132: 42 hand keypoints + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/coco_wholebody.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(BottomUpCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + + self.body_num = 17 + self.foot_num = 6 + self.face_num = 68 + self.left_hand_num = 21 + self.right_hand_num = 21 + + print(f'=> num_images: {self.num_images}') + + def _get_joints(self, anno): + """Get joints for all people in an image.""" + num_people = len(anno) + + if self.ann_info['scale_aware_sigma']: + joints = np.zeros((num_people, self.ann_info['num_joints'], 4), + dtype=np.float32) + else: + joints = np.zeros((num_people, self.ann_info['num_joints'], 3), + dtype=np.float32) + + for i, obj in enumerate(anno): + keypoints = np.array(obj['keypoints'] + obj['foot_kpts'] + + obj['face_kpts'] + obj['lefthand_kpts'] + + obj['righthand_kpts']).reshape(-1, 3) + + joints[i, :self.ann_info['num_joints'], :3] = keypoints + if self.ann_info['scale_aware_sigma']: + # get person box + box = obj['bbox'] + size = max(box[2], box[3]) + sigma = size / self.base_size * self.base_sigma + if self.int_sigma: + sigma = int(np.ceil(sigma)) + assert sigma > 0, sigma + joints[i, :, 3] = sigma + + return joints + + def _coco_keypoint_results_one_category_kernel(self, data_pack): + """Get coco keypoint results.""" + cat_id = data_pack['cat_id'] + keypoints = data_pack['keypoints'] + cat_results = [] + + for img_kpts in keypoints: + if len(img_kpts) == 0: + continue + + _key_points = np.array( + [img_kpt['keypoints'] for img_kpt in img_kpts]) + key_points = _key_points.reshape(-1, + self.ann_info['num_joints'] * 3) + + cuts = np.cumsum([ + 0, self.body_num, self.foot_num, self.face_num, + self.left_hand_num, self.right_hand_num + ]) * 3 + + for img_kpt, key_point in zip(img_kpts, key_points): + kpt = key_point.reshape((self.ann_info['num_joints'], 3)) + left_top = np.amin(kpt, axis=0) + right_bottom = np.amax(kpt, axis=0) + + w = right_bottom[0] - left_top[0] + h = right_bottom[1] - left_top[1] + + cat_results.append({ + 'image_id': + img_kpt['image_id'], + 'category_id': + cat_id, + 'keypoints': + key_point[cuts[0]:cuts[1]].tolist(), + 'foot_kpts': + key_point[cuts[1]:cuts[2]].tolist(), + 'face_kpts': + key_point[cuts[2]:cuts[3]].tolist(), + 'lefthand_kpts': + key_point[cuts[3]:cuts[4]].tolist(), + 'righthand_kpts': + key_point[cuts[4]:cuts[5]].tolist(), + 'score': + img_kpt['score'], + 'bbox': [left_top[0], left_top[1], w, h] + }) + + return cat_results + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + + cuts = np.cumsum([ + 0, self.body_num, self.foot_num, self.face_num, self.left_hand_num, + self.right_hand_num + ]) + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_body', + self.sigmas[cuts[0]:cuts[1]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_foot', + self.sigmas[cuts[1]:cuts[2]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_face', + self.sigmas[cuts[2]:cuts[3]], + use_area=True) + coco_eval.params.useSegm = 
None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_lefthand', + self.sigmas[cuts[3]:cuts[4]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_righthand', + self.sigmas[cuts[4]:cuts[5]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_wholebody', + self.sigmas, + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/bottom_up/bottom_up_crowdpose.py b/mmpose/datasets/datasets/bottom_up/bottom_up_crowdpose.py new file mode 100644 index 0000000..ebabf3e --- /dev/null +++ b/mmpose/datasets/datasets/bottom_up/bottom_up_crowdpose.py @@ -0,0 +1,109 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import json_tricks as json +from mmcv import Config +from xtcocotools.cocoeval import COCOeval + +from mmpose.datasets.builder import DATASETS +from .bottom_up_coco import BottomUpCocoDataset + + +@DATASETS.register_module() +class BottomUpCrowdPoseDataset(BottomUpCocoDataset): + """CrowdPose dataset for bottom-up pose estimation. + + "CrowdPose: Efficient Crowded Scenes Pose Estimation and + A New Benchmark", CVPR'2019. + More details can be found in the `paper + `__. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + CrowdPose keypoint indexes:: + + 0: 'left_shoulder', + 1: 'right_shoulder', + 2: 'left_elbow', + 3: 'right_elbow', + 4: 'left_wrist', + 5: 'right_wrist', + 6: 'left_hip', + 7: 'right_hip', + 8: 'left_knee', + 9: 'right_knee', + 10: 'left_ankle', + 11: 'right_ankle', + 12: 'top_head', + 13: 'neck' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/crowdpose.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(BottomUpCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + print(f'=> num_images: {self.num_images}') + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AR', 'AR .5', 'AR .75', 'AP(E)', 'AP(M)', + 'AP(H)' + ] + + with open(res_file, 'r') as file: + res_json = json.load(file) + if not res_json: + info_str = list(zip(stats_names, [ + 0, + ] * len(stats_names))) + return info_str + + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_crowd', + self.sigmas, + use_area=False) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/bottom_up/bottom_up_mhp.py b/mmpose/datasets/datasets/bottom_up/bottom_up_mhp.py new file mode 100644 index 0000000..1438123 --- /dev/null +++ b/mmpose/datasets/datasets/bottom_up/bottom_up_mhp.py @@ -0,0 +1,108 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import json_tricks as json +from mmcv import Config +from xtcocotools.cocoeval import COCOeval + +from mmpose.datasets.builder import DATASETS +from .bottom_up_coco import BottomUpCocoDataset + + +@DATASETS.register_module() +class BottomUpMhpDataset(BottomUpCocoDataset): + """MHPv2.0 dataset for top-down pose estimation. + + "Understanding Humans in Crowded Scenes: Deep Nested Adversarial + Learning and A New Benchmark for Multi-Human Parsing", ACM MM'2018. + More details can be found in the `paper + `__ + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + MHP keypoint indexes:: + + 0: "right ankle", + 1: "right knee", + 2: "right hip", + 3: "left hip", + 4: "left knee", + 5: "left ankle", + 6: "pelvis", + 7: "thorax", + 8: "upper neck", + 9: "head top", + 10: "right wrist", + 11: "right elbow", + 12: "right shoulder", + 13: "left shoulder", + 14: "left elbow", + 15: "left wrist", + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/mhp.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(BottomUpCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + print(f'=> num_images: {self.num_images}') + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + with open(res_file, 'r') as file: + res_json = json.load(file) + if not res_json: + info_str = list(zip(stats_names, [ + 0, + ] * len(stats_names))) + return info_str + + coco_det = self.coco.loadRes(res_file) + + coco_eval = COCOeval( + self.coco, coco_det, 'keypoints', self.sigmas, use_area=False) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/face/__init__.py b/mmpose/datasets/datasets/face/__init__.py new file mode 100644 index 0000000..1ba42d4 --- /dev/null +++ b/mmpose/datasets/datasets/face/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .face_300w_dataset import Face300WDataset +from .face_aflw_dataset import FaceAFLWDataset +from .face_coco_wholebody_dataset import FaceCocoWholeBodyDataset +from .face_cofw_dataset import FaceCOFWDataset +from .face_wflw_dataset import FaceWFLWDataset + +__all__ = [ + 'Face300WDataset', 'FaceAFLWDataset', 'FaceWFLWDataset', 'FaceCOFWDataset', + 'FaceCocoWholeBodyDataset' +] diff --git a/mmpose/datasets/datasets/face/face_300w_dataset.py b/mmpose/datasets/datasets/face/face_300w_dataset.py new file mode 100644 index 0000000..e5b602e --- /dev/null +++ b/mmpose/datasets/datasets/face/face_300w_dataset.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class Face300WDataset(Kpt2dSviewRgbImgTopDownDataset): + """Face300W dataset for top-down face keypoint localization. + + "300 faces In-the-wild challenge: Database and results", + Image and Vision Computing (IMAVIS) 2019. + + The dataset loads raw images and apply specified transforms + to return a dict containing the image tensors and other information. + + The landmark annotations follow the 68 points mark-up. The definition + can be found in `https://ibug.doc.ic.ac.uk/resources/300-W/`. + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/300w.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + if 'center' in obj and 'scale' in obj: + center = np.array(obj['center']) + scale = np.array([obj['scale'], obj['scale']]) * 1.25 + else: + center, scale = self._xywh2cs(*obj['bbox'][:4], 1.25) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + def _get_normalize_factor(self, gts, *args, **kwargs): + """Get inter-ocular distance as the normalize factor, measured as the + Euclidean distance between the outer corners of the eyes. + + Args: + gts (np.ndarray[N, K, 2]): Groundtruth keypoint location. + + Returns: + np.ndarray[N, 2]: normalized factor + """ + + interocular = np.linalg.norm( + gts[:, 36, :] - gts[:, 45, :], axis=1, keepdims=True) + return np.tile(interocular, [1, 2]) + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='NME', **kwargs): + """Evaluate freihand keypoint results. The pose prediction results will + be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[1,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[1,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_path (list[str]): For example, ['300W/ibug/\ + image_018.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'NME'. + + Returns: + dict: Evaluation results for evaluation metric. 
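+
+        Example (a minimal, hypothetical sketch of the ``results`` list this
+        method expects; the zero-valued arrays and the ``dataset`` instance
+        are placeholders, not real predictions)::
+
+            import numpy as np
+
+            num_joints = 68  # 300W follows the 68-point mark-up
+            results = [dict(
+                preds=np.zeros((1, num_joints, 3), dtype=np.float32),
+                boxes=np.zeros((1, 6), dtype=np.float32),
+                image_paths=['300W/ibug/image_018.jpg'],
+                bbox_ids=[0],
+            )]
+            # with a constructed dataset instance one would then call:
+            # name_value = dataset.evaluate(results, metric='NME')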
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/face/face_aflw_dataset.py b/mmpose/datasets/datasets/face/face_aflw_dataset.py new file mode 100644 index 0000000..292d9ee --- /dev/null +++ b/mmpose/datasets/datasets/face/face_aflw_dataset.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class FaceAFLWDataset(Kpt2dSviewRgbImgTopDownDataset): + """Face AFLW dataset for top-down face keypoint localization. + + "Annotated Facial Landmarks in the Wild: A Large-scale, + Real-world Database for Facial Landmark Localization". + In Proc. First IEEE International Workshop on Benchmarking + Facial Image Analysis Technologies, 2011. + + The dataset loads raw images and apply specified transforms + to return a dict containing the image tensors and other information. + + The landmark annotations follow the 19 points mark-up. The definition + can be found in `https://www.tugraz.at/institute/icg/research` + `/team-bischof/lrs/downloads/aflw/` + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/aflw.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if self.test_mode: + # 'box_size' is used as normalization factor + assert 'box_size' in obj + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + if 'center' in obj and 'scale' in obj: + center = np.array(obj['center']) + scale = np.array([obj['scale'], obj['scale']]) * 1.25 + else: + center, scale = self._xywh2cs(*obj['bbox'][:4], 1.25) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'box_size': obj['box_size'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + def _get_normalize_factor(self, box_sizes, *args, **kwargs): + """Get normalize factor for evaluation. + + Args: + box_sizes (np.ndarray[N, 1]): box size + + Returns: + np.ndarray[N, 2]: normalized factor + """ + + return np.tile(box_sizes, [1, 2]) + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='NME', **kwargs): + """Evaluate freihand keypoint results. The pose prediction results will + be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[1,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[1,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_path (list[str]): For example, ['aflw/images/flickr/ \ + 0/image00002.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'NME'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/face/face_base_dataset.py b/mmpose/datasets/datasets/face/face_base_dataset.py new file mode 100644 index 0000000..466fabb --- /dev/null +++ b/mmpose/datasets/datasets/face/face_base_dataset.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + +from torch.utils.data import Dataset + + +class FaceBaseDataset(Dataset, metaclass=ABCMeta): + """This class has been deprecated and replaced by + Kpt2dSviewRgbImgTopDownDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'FaceBaseDataset has been replaced by ' + 'Kpt2dSviewRgbImgTopDownDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/663 for details.') + ) diff --git a/mmpose/datasets/datasets/face/face_coco_wholebody_dataset.py b/mmpose/datasets/datasets/face/face_coco_wholebody_dataset.py new file mode 100644 index 0000000..ef5117a --- /dev/null +++ b/mmpose/datasets/datasets/face/face_coco_wholebody_dataset.py @@ -0,0 +1,198 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class FaceCocoWholeBodyDataset(Kpt2dSviewRgbImgTopDownDataset): + """CocoWholeBodyDataset for face keypoint localization. + + `Whole-Body Human Pose Estimation in the Wild', ECCV'2020. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + The face landmark annotations follow the 68 points mark-up. + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. 
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/' + 'coco_wholebody_face.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if obj['face_valid'] and max(obj['face_kpts']) > 0: + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), + dtype=np.float32) + + keypoints = np.array(obj['face_kpts']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['face_box'][:4], 1.25) + + image_file = osp.join(self.img_prefix, + self.id2name[img_id]) + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['face_box'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + def _get_normalize_factor(self, gts, *args, **kwargs): + """Get inter-ocular distance as the normalize factor, measured as the + Euclidean distance between the outer corners of the eyes. + + Args: + gts (np.ndarray[N, K, 2]): Groundtruth keypoint location. + + Returns: + np.ndarray[N, 2]: normalized factor + """ + + interocular = np.linalg.norm( + gts[:, 36, :] - gts[:, 45, :], axis=1, keepdims=True) + return np.tile(interocular, [1, 2]) + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='NME', **kwargs): + """Evaluate COCO-WholeBody Face keypoint results. The pose prediction + results will be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[1,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[1,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_path (list[str]): For example, ['coco/train2017/\ + 000000000009.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'NME'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/face/face_cofw_dataset.py b/mmpose/datasets/datasets/face/face_cofw_dataset.py new file mode 100644 index 0000000..456ea0e --- /dev/null +++ b/mmpose/datasets/datasets/face/face_cofw_dataset.py @@ -0,0 +1,198 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class FaceCOFWDataset(Kpt2dSviewRgbImgTopDownDataset): + """Face COFW dataset for top-down face keypoint localization. + + "Robust face landmark estimation under occlusion", ICCV'2013. + + The dataset loads raw images and apply specified transforms + to return a dict containing the image tensors and other information. + + The landmark annotations follow the 29 points mark-up. The definition + can be found in `http://www.vision.caltech.edu/xpburgos/ICCV13/`. + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/cofw.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + if 'center' in obj and 'scale' in obj: + center = np.array(obj['center']) + scale = np.array([obj['scale'], obj['scale']]) * 1.25 + else: + center, scale = self._xywh2cs(*obj['bbox'][:4], 1.25) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + def _get_normalize_factor(self, gts, *args, **kwargs): + """Get normalize factor for evaluation. + + Args: + gts (np.ndarray[N, K, 2]): Groundtruth keypoint location. + + Returns: + np.ndarray[N, 2]: normalized factor + """ + + interocular = np.linalg.norm( + gts[:, 8, :] - gts[:, 9, :], axis=1, keepdims=True) + return np.tile(interocular, [1, 2]) + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='NME', **kwargs): + """Evaluate freihand keypoint results. The pose prediction results will + be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[1,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[1,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_path (list[str]): For example, ['cofw/images/\ + 000001.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'NME'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/face/face_wflw_dataset.py b/mmpose/datasets/datasets/face/face_wflw_dataset.py new file mode 100644 index 0000000..e4611e1 --- /dev/null +++ b/mmpose/datasets/datasets/face/face_wflw_dataset.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class FaceWFLWDataset(Kpt2dSviewRgbImgTopDownDataset): + """Face WFLW dataset for top-down face keypoint localization. + + "Look at Boundary: A Boundary-Aware Face Alignment Algorithm", + CVPR'2018. + + The dataset loads raw images and apply specified transforms + to return a dict containing the image tensors and other information. + + The landmark annotations follow the 98 points mark-up. The definition + can be found in `https://wywu.github.io/projects/LAB/WFLW.html`. + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/wflw.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + if 'center' in obj and 'scale' in obj: + center = np.array(obj['center']) + scale = np.array([obj['scale'], obj['scale']]) * 1.25 + else: + center, scale = self._xywh2cs(*obj['bbox'][:4], 1.25) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + def _get_normalize_factor(self, gts, *args, **kwargs): + """Get normalize factor for evaluation. + + Args: + gts (np.ndarray[N, K, 2]): Groundtruth keypoint location. + + Returns: + np.ndarray[N, 2]: normalized factor + """ + + interocular = np.linalg.norm( + gts[:, 60, :] - gts[:, 72, :], axis=1, keepdims=True) + return np.tile(interocular, [1, 2]) + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='NME', **kwargs): + """Evaluate freihand keypoint results. The pose prediction results will + be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[1,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[1,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_path (list[str]): For example, ['wflw/images/\ + 0--Parade/0_Parade_marchingband_1_1015.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'NME'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/fashion/__init__.py b/mmpose/datasets/datasets/fashion/__init__.py new file mode 100644 index 0000000..575d6ed --- /dev/null +++ b/mmpose/datasets/datasets/fashion/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .deepfashion_dataset import DeepFashionDataset + +__all__ = ['DeepFashionDataset'] diff --git a/mmpose/datasets/datasets/fashion/deepfashion_dataset.py b/mmpose/datasets/datasets/fashion/deepfashion_dataset.py new file mode 100644 index 0000000..0fef655 --- /dev/null +++ b/mmpose/datasets/datasets/fashion/deepfashion_dataset.py @@ -0,0 +1,225 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class DeepFashionDataset(Kpt2dSviewRgbImgTopDownDataset): + """DeepFashion dataset (full-body clothes) for fashion landmark detection. + + "DeepFashion: Powering Robust Clothes Recognition + and Retrieval with Rich Annotations", CVPR'2016. + "Fashion Landmark Detection in the Wild", ECCV'2016. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + The dataset contains 3 categories for full-body, upper-body and lower-body. + + Fashion landmark indexes for upper-body clothes:: + + 0: 'left collar', + 1: 'right collar', + 2: 'left sleeve', + 3: 'right sleeve', + 4: 'left hem', + 5: 'right hem' + + Fashion landmark indexes for lower-body clothes:: + + 0: 'left waistline', + 1: 'right waistline', + 2: 'left hem', + 3: 'right hem' + + Fashion landmark indexes for full-body clothes:: + + 0: 'left collar', + 1: 'right collar', + 2: 'left sleeve', + 3: 'right sleeve', + 4: 'left waistline', + 5: 'right waistline', + 6: 'left hem', + 7: 'right hem' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. 
+ test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + subset='', + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + if subset != '': + warnings.warn( + 'subset is deprecated.' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + if subset == 'upper': + cfg = Config.fromfile( + 'configs/_base_/datasets/deepfashion_upper.py') + dataset_info = cfg._cfg_dict['dataset_info'] + elif subset == 'lower': + cfg = Config.fromfile( + 'configs/_base_/datasets/deepfashion_lower.py') + dataset_info = cfg._cfg_dict['dataset_info'] + elif subset == 'full': + cfg = Config.fromfile( + 'configs/_base_/datasets/deepfashion_full.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # use 1.25bbox as input + center, scale = self._xywh2cs(*obj['bbox'][:4], 1.25) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate freihand keypoint results. The pose prediction results will + be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['img_00000001.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
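# Minimal sketch of the `_xywh2cs(*obj['bbox'][:4], 1.25)` call used in
# `_get_db` above: a COCO-style (x, y, w, h) box becomes a center plus a scale
# expressed in units of pixel_std=200, padded by 1.25 and snapped to the
# network input aspect ratio.  This is an approximation under the usual
# top-down conventions; the real helper lives in the shared top-down base
# class and also jitters the center during training.
import numpy as np


def xywh_to_center_scale(x, y, w, h, padding=1.25,
                         aspect_ratio=192 / 256, pixel_std=200.0):
    center = np.array([x + 0.5 * w, y + 0.5 * h], dtype=np.float32)
    # keep the aspect ratio of the model input (e.g. 192x256)
    if w > aspect_ratio * h:
        h = w / aspect_ratio
    else:
        w = h * aspect_ratio
    scale = np.array([w, h], dtype=np.float32) / pixel_std * padding
    return center, scale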
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/fashion/fashion_base_dataset.py b/mmpose/datasets/datasets/fashion/fashion_base_dataset.py new file mode 100644 index 0000000..d4e5860 --- /dev/null +++ b/mmpose/datasets/datasets/fashion/fashion_base_dataset.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + +from torch.utils.data import Dataset + + +class FashionBaseDataset(Dataset, metaclass=ABCMeta): + """This class has been deprecated and replaced by + Kpt2dSviewRgbImgTopDownDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'FashionBaseDataset has been replaced by ' + 'Kpt2dSviewRgbImgTopDownDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/663 for details.') + ) diff --git a/mmpose/datasets/datasets/hand/__init__.py b/mmpose/datasets/datasets/hand/__init__.py new file mode 100644 index 0000000..49159af --- /dev/null +++ b/mmpose/datasets/datasets/hand/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .freihand_dataset import FreiHandDataset +from .hand_coco_wholebody_dataset import HandCocoWholeBodyDataset +from .interhand2d_dataset import InterHand2DDataset +from .interhand3d_dataset import InterHand3DDataset +from .onehand10k_dataset import OneHand10KDataset +from .panoptic_hand2d_dataset import PanopticDataset +from .rhd2d_dataset import Rhd2DDataset + +__all__ = [ + 'FreiHandDataset', 'InterHand2DDataset', 'InterHand3DDataset', + 'OneHand10KDataset', 'PanopticDataset', 'Rhd2DDataset', + 'HandCocoWholeBodyDataset' +] diff --git a/mmpose/datasets/datasets/hand/freihand_dataset.py b/mmpose/datasets/datasets/hand/freihand_dataset.py new file mode 100644 index 0000000..e9ceeff --- /dev/null +++ b/mmpose/datasets/datasets/hand/freihand_dataset.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class FreiHandDataset(Kpt2dSviewRgbImgTopDownDataset): + """FreiHand dataset for top-down hand pose estimation. 
+ + "FreiHAND: A Dataset for Markerless Capture of Hand Pose + and Shape from Single RGB Images", ICCV'2019. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + FreiHand keypoint indexes:: + + 0: 'wrist', + 1: 'thumb1', + 2: 'thumb2', + 3: 'thumb3', + 4: 'thumb4', + 5: 'forefinger1', + 6: 'forefinger2', + 7: 'forefinger3', + 8: 'forefinger4', + 9: 'middle_finger1', + 10: 'middle_finger2', + 11: 'middle_finger3', + 12: 'middle_finger4', + 13: 'ring_finger1', + 14: 'ring_finger2', + 15: 'ring_finger3', + 16: 'ring_finger4', + 17: 'pinky_finger1', + 18: 'pinky_finger2', + 19: 'pinky_finger3', + 20: 'pinky_finger4' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/freihand2d.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # the ori image is 224x224 + center, scale = self._xywh2cs(0, 0, 224, 224, 0.8) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate freihand keypoint results. The pose prediction results will + be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. 
+ - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['training/rgb/\ + 00031426.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/hand/hand_base_dataset.py b/mmpose/datasets/datasets/hand/hand_base_dataset.py new file mode 100644 index 0000000..fd20846 --- /dev/null +++ b/mmpose/datasets/datasets/hand/hand_base_dataset.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + +from torch.utils.data import Dataset + + +class HandBaseDataset(Dataset, metaclass=ABCMeta): + """This class has been deprecated and replaced by + Kpt2dSviewRgbImgTopDownDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'HandBaseDataset has been replaced by ' + 'Kpt2dSviewRgbImgTopDownDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/663 for details.') + ) diff --git a/mmpose/datasets/datasets/hand/hand_coco_wholebody_dataset.py b/mmpose/datasets/datasets/hand/hand_coco_wholebody_dataset.py new file mode 100644 index 0000000..7c95cc0 --- /dev/null +++ b/mmpose/datasets/datasets/hand/hand_coco_wholebody_dataset.py @@ -0,0 +1,211 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class HandCocoWholeBodyDataset(Kpt2dSviewRgbImgTopDownDataset): + """CocoWholeBodyDataset for top-down hand pose estimation. + + "Whole-Body Human Pose Estimation in the Wild", ECCV'2020. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. 
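# Illustrative sketch of the res_folder / temporary-folder handling shared by
# the evaluate() methods in these datasets: predictions are serialized to
# result_keypoints.json, scored, and any temporary directory is cleaned up.
# `score_fn` stands in for `_report_metric` and is an assumption.
import json
import os.path as osp
import tempfile
from collections import OrderedDict


def evaluate_sketch(kpts, score_fn, res_folder=None):
    if res_folder is not None:
        tmp_folder = None
        res_file = osp.join(res_folder, 'result_keypoints.json')
    else:
        tmp_folder = tempfile.TemporaryDirectory()
        res_file = osp.join(tmp_folder.name, 'result_keypoints.json')

    with open(res_file, 'w') as f:
        json.dump(kpts, f)

    name_value = OrderedDict(score_fn(res_file))

    if tmp_folder is not None:
        tmp_folder.cleanup()
    return name_value


# toy usage: the "metric" is simply the number of predictions written
print(evaluate_sketch([{'keypoints': [[0, 0, 1]]}],
                      lambda path: [('num_preds', 1)]))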
+ + COCO-WholeBody Hand keypoint indexes:: + + 0: 'wrist', + 1: 'thumb1', + 2: 'thumb2', + 3: 'thumb3', + 4: 'thumb4', + 5: 'forefinger1', + 6: 'forefinger2', + 7: 'forefinger3', + 8: 'forefinger4', + 9: 'middle_finger1', + 10: 'middle_finger2', + 11: 'middle_finger3', + 12: 'middle_finger4', + 13: 'ring_finger1', + 14: 'ring_finger2', + 15: 'ring_finger3', + 16: 'ring_finger4', + 17: 'pinky_finger1', + 18: 'pinky_finger2', + 19: 'pinky_finger3', + 20: 'pinky_finger4' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile( + 'configs/_base_/datasets/coco_wholebody_hand.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + for type in ['left', 'right']: + if obj[f'{type}hand_valid'] and max( + obj[f'{type}hand_kpts']) > 0: + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), + dtype=np.float32) + + keypoints = np.array(obj[f'{type}hand_kpts']).reshape( + -1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum( + 1, keypoints[:, 2:3]) + + # use 1.25 padded bbox as input + center, scale = self._xywh2cs( + *obj[f'{type}hand_box'][:4], 1.25) + + image_file = osp.join(self.img_prefix, + self.id2name[img_id]) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj[f'{type}hand_box'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate COCO-WholeBody Hand keypoint results. The pose prediction + results will be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. 
+ - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['Test/source/0.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/hand/interhand2d_dataset.py b/mmpose/datasets/datasets/hand/interhand2d_dataset.py new file mode 100644 index 0000000..fea17fa --- /dev/null +++ b/mmpose/datasets/datasets/hand/interhand2d_dataset.py @@ -0,0 +1,306 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class InterHand2DDataset(Kpt2dSviewRgbImgTopDownDataset): + """InterHand2.6M 2D dataset for top-down hand pose estimation. + + "InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose + Estimation from a Single RGB Image", ECCV'2020. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + InterHand2.6M keypoint indexes:: + + 0: 'thumb4', + 1: 'thumb3', + 2: 'thumb2', + 3: 'thumb1', + 4: 'forefinger4', + 5: 'forefinger3', + 6: 'forefinger2', + 7: 'forefinger1', + 8: 'middle_finger4', + 9: 'middle_finger3', + 10: 'middle_finger2', + 11: 'middle_finger1', + 12: 'ring_finger4', + 13: 'ring_finger3', + 14: 'ring_finger2', + 15: 'ring_finger1', + 16: 'pinky_finger4', + 17: 'pinky_finger3', + 18: 'pinky_finger2', + 19: 'pinky_finger1', + 20: 'wrist' + + Args: + ann_file (str): Path to the annotation file. + camera_file (str): Path to the camera file. + joint_file (str): Path to the joint file. + img_prefix (str): Path to a directory where images are held. + Default: None. 
+ data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (str): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + camera_file, + joint_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/interhand2d.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.camera_file = camera_file + self.joint_file = joint_file + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + @staticmethod + def _cam2pixel(cam_coord, f, c): + """Transform the joints from their camera coordinates to their pixel + coordinates. + + Note: + - N: number of joints + + Args: + cam_coord (ndarray[N, 3]): 3D joints coordinates + in the camera coordinate system + f (ndarray[2]): focal length of x and y axis + c (ndarray[2]): principal point of x and y axis + + Returns: + img_coord (ndarray[N, 3]): the coordinates (x, y, 0) + in the image plane. + """ + x = cam_coord[:, 0] / (cam_coord[:, 2] + 1e-8) * f[0] + c[0] + y = cam_coord[:, 1] / (cam_coord[:, 2] + 1e-8) * f[1] + c[1] + z = np.zeros_like(x) + img_coord = np.concatenate((x[:, None], y[:, None], z[:, None]), 1) + return img_coord + + @staticmethod + def _world2cam(world_coord, R, T): + """Transform the joints from their world coordinates to their camera + coordinates. + + Note: + - N: number of joints + + Args: + world_coord (ndarray[3, N]): 3D joints coordinates + in the world coordinate system + R (ndarray[3, 3]): camera rotation matrix + T (ndarray[3]): camera position (x, y, z) + + Returns: + cam_coord (ndarray[3, N]): 3D joints coordinates + in the camera coordinate system + """ + cam_coord = np.dot(R, world_coord - T) + return cam_coord + + def _get_db(self): + """Load dataset. + + Adapted from 'https://github.com/facebookresearch/InterHand2.6M/' + 'blob/master/data/InterHand2.6M/dataset.py' + Copyright (c) FaceBook Research, under CC-BY-NC 4.0 license. 
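# Worked sketch of the two camera transforms defined above: a world-space
# joint is moved into the camera frame with rotation R and position T, then
# projected with the pinhole model using focal lengths f and principal
# point c.  All numbers are made up for illustration.
import numpy as np

R = np.eye(3, dtype=np.float32)                 # camera rotation matrix
T = np.zeros((3, 1), dtype=np.float32)          # camera position (x, y, z)
f = np.array([1500., 1500.], dtype=np.float32)  # focal lengths (fx, fy)
c = np.array([256., 256.], dtype=np.float32)    # principal point (cx, cy)

joint_world = np.array([[100., -50., 900.]], dtype=np.float32)  # (N, 3)

# world -> camera, matching _world2cam: cam = R @ (world - T) on (3, N)
joint_cam = (R @ (joint_world.T - T)).T                          # (N, 3)

# camera -> pixel, matching _cam2pixel: x = X/Z * fx + cx, y = Y/Z * fy + cy
x = joint_cam[:, 0] / (joint_cam[:, 2] + 1e-8) * f[0] + c[0]
y = joint_cam[:, 1] / (joint_cam[:, 2] + 1e-8) * f[1] + c[1]
joint_img = np.stack([x, y], axis=1)                             # (N, 2)
print(joint_img)   # approx. [[422.7, 172.7]]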
+ """ + with open(self.camera_file, 'r') as f: + cameras = json.load(f) + with open(self.joint_file, 'r') as f: + joints = json.load(f) + gt_db = [] + bbox_id = 0 + for img_id in self.img_ids: + num_joints = self.ann_info['num_joints'] + + ann_id = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + ann = self.coco.loadAnns(ann_id)[0] + img = self.coco.loadImgs(img_id)[0] + + capture_id = str(img['capture']) + camera_name = img['camera'] + frame_idx = str(img['frame_idx']) + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + camera_pos, camera_rot = np.array( + cameras[capture_id]['campos'][camera_name], + dtype=np.float32), np.array( + cameras[capture_id]['camrot'][camera_name], + dtype=np.float32) + focal, principal_pt = np.array( + cameras[capture_id]['focal'][camera_name], + dtype=np.float32), np.array( + cameras[capture_id]['princpt'][camera_name], + dtype=np.float32) + joint_world = np.array( + joints[capture_id][frame_idx]['world_coord'], dtype=np.float32) + joint_cam = self._world2cam( + joint_world.transpose(1, 0), camera_rot, + camera_pos.reshape(3, 1)).transpose(1, 0) + joint_img = self._cam2pixel(joint_cam, focal, principal_pt)[:, :2] + joint_img = joint_img.reshape(2, -1, 2) + + joint_valid = np.array( + ann['joint_valid'], dtype=np.float32).reshape(2, -1) + # if root is not valid -> root-relative 3D pose is also not valid. + # Therefore, mark all joints as invalid + for hand in range(2): + joint_valid[hand, :] *= joint_valid[hand][-1] + + if np.sum(joint_valid[hand, :]) > 11: + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), + dtype=np.float32) + joints_3d[:, :2] = joint_img[hand, :, :] + joints_3d_visible[:, :2] = np.minimum( + 1, joint_valid[hand, :].reshape(-1, 1)) + + # use the tightest bbox enclosing all keypoints as bbox + bbox = [img['width'], img['height'], 0, 0] + for i in range(num_joints): + if joints_3d_visible[i][0]: + bbox[0] = min(bbox[0], joints_3d[i][0]) + bbox[1] = min(bbox[1], joints_3d[i][1]) + bbox[2] = max(bbox[2], joints_3d[i][0]) + bbox[3] = max(bbox[3], joints_3d[i][1]) + + bbox[2] -= bbox[0] + bbox[3] -= bbox[1] + + # use 1.5bbox as input + center, scale = self._xywh2cs(*bbox, 1.5) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': bbox, + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate interhand2d keypoint results. The pose prediction results + will be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['Capture12/\ + 0390_dh_touchROM/cam410209/image62434.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. 
+ metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/hand/interhand3d_dataset.py b/mmpose/datasets/datasets/hand/interhand3d_dataset.py new file mode 100644 index 0000000..318d73f --- /dev/null +++ b/mmpose/datasets/datasets/hand/interhand3d_dataset.py @@ -0,0 +1,505 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.core.evaluation.top_down_eval import keypoint_epe +from mmpose.datasets.builder import DATASETS +from ..base import Kpt3dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class InterHand3DDataset(Kpt3dSviewRgbImgTopDownDataset): + """InterHand2.6M 3D dataset for top-down hand pose estimation. + + "InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose + Estimation from a Single RGB Image", ECCV'2020. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + InterHand2.6M keypoint indexes:: + + 0: 'r_thumb4', + 1: 'r_thumb3', + 2: 'r_thumb2', + 3: 'r_thumb1', + 4: 'r_index4', + 5: 'r_index3', + 6: 'r_index2', + 7: 'r_index1', + 8: 'r_middle4', + 9: 'r_middle3', + 10: 'r_middle2', + 11: 'r_middle1', + 12: 'r_ring4', + 13: 'r_ring3', + 14: 'r_ring2', + 15: 'r_ring1', + 16: 'r_pinky4', + 17: 'r_pinky3', + 18: 'r_pinky2', + 19: 'r_pinky1', + 20: 'r_wrist', + 21: 'l_thumb4', + 22: 'l_thumb3', + 23: 'l_thumb2', + 24: 'l_thumb1', + 25: 'l_index4', + 26: 'l_index3', + 27: 'l_index2', + 28: 'l_index1', + 29: 'l_middle4', + 30: 'l_middle3', + 31: 'l_middle2', + 32: 'l_middle1', + 33: 'l_ring4', + 34: 'l_ring3', + 35: 'l_ring2', + 36: 'l_ring1', + 37: 'l_pinky4', + 38: 'l_pinky3', + 39: 'l_pinky2', + 40: 'l_pinky1', + 41: 'l_wrist' + + Args: + ann_file (str): Path to the annotation file. + camera_file (str): Path to the camera file. + joint_file (str): Path to the joint file. + img_prefix (str): Path to a directory where images are held. + Default: None. 
+ data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + use_gt_root_depth (bool): Using the ground truth depth of the wrist + or given depth from rootnet_result_file. + rootnet_result_file (str): Path to the wrist depth file. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (str): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + camera_file, + joint_file, + img_prefix, + data_cfg, + pipeline, + use_gt_root_depth=True, + rootnet_result_file=None, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/interhand3d.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['heatmap3d_depth_bound'] = data_cfg[ + 'heatmap3d_depth_bound'] + self.ann_info['heatmap_size_root'] = data_cfg['heatmap_size_root'] + self.ann_info['root_depth_bound'] = data_cfg['root_depth_bound'] + self.ann_info['use_different_joint_weights'] = False + + self.camera_file = camera_file + self.joint_file = joint_file + + self.use_gt_root_depth = use_gt_root_depth + if not self.use_gt_root_depth: + assert rootnet_result_file is not None + self.rootnet_result_file = rootnet_result_file + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + @staticmethod + def _encode_handtype(hand_type): + if hand_type == 'right': + return np.array([1, 0], dtype=np.float32) + elif hand_type == 'left': + return np.array([0, 1], dtype=np.float32) + elif hand_type == 'interacting': + return np.array([1, 1], dtype=np.float32) + else: + assert 0, f'Not support hand type: {hand_type}' + + def _get_db(self): + """Load dataset. + + Adapted from 'https://github.com/facebookresearch/InterHand2.6M/' + 'blob/master/data/InterHand2.6M/dataset.py' + Copyright (c) FaceBook Research, under CC-BY-NC 4.0 license. 
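# Illustrative sketch of the two-bit hand-type encoding implemented by
# `_encode_handtype` above ('right' -> [1, 0], 'left' -> [0, 1],
# 'interacting' -> [1, 1]).  Decoding a predicted score pair with a 0.5
# threshold is an assumption added here for illustration.
import numpy as np


def decode_handtype(scores, thr=0.5):
    bits = tuple(int(s > thr) for s in np.asarray(scores))
    return {(1, 0): 'right', (0, 1): 'left', (1, 1): 'interacting'}.get(
        bits, 'none')


print(decode_handtype([0.9, 0.2]))   # 'right'
print(decode_handtype([0.8, 0.7]))   # 'interacting'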
+ """ + with open(self.camera_file, 'r') as f: + cameras = json.load(f) + with open(self.joint_file, 'r') as f: + joints = json.load(f) + + if not self.use_gt_root_depth: + rootnet_result = {} + with open(self.rootnet_result_file, 'r') as f: + rootnet_annot = json.load(f) + for i in range(len(rootnet_annot)): + rootnet_result[str( + rootnet_annot[i]['annot_id'])] = rootnet_annot[i] + + gt_db = [] + bbox_id = 0 + for img_id in self.img_ids: + num_joints = self.ann_info['num_joints'] + + ann_id = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + ann = self.coco.loadAnns(ann_id)[0] + img = self.coco.loadImgs(img_id)[0] + + capture_id = str(img['capture']) + camera_name = img['camera'] + frame_idx = str(img['frame_idx']) + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + camera_pos = np.array( + cameras[capture_id]['campos'][camera_name], dtype=np.float32) + camera_rot = np.array( + cameras[capture_id]['camrot'][camera_name], dtype=np.float32) + focal = np.array( + cameras[capture_id]['focal'][camera_name], dtype=np.float32) + principal_pt = np.array( + cameras[capture_id]['princpt'][camera_name], dtype=np.float32) + joint_world = np.array( + joints[capture_id][frame_idx]['world_coord'], dtype=np.float32) + joint_cam = self._world2cam( + joint_world.transpose(1, 0), camera_rot, + camera_pos.reshape(3, 1)).transpose(1, 0) + joint_img = self._cam2pixel(joint_cam, focal, principal_pt)[:, :2] + + joint_valid = np.array( + ann['joint_valid'], dtype=np.float32).flatten() + hand_type = self._encode_handtype(ann['hand_type']) + hand_type_valid = ann['hand_type_valid'] + + if self.use_gt_root_depth: + bbox = np.array(ann['bbox'], dtype=np.float32) + # extend the bbox to include some context + center, scale = self._xywh2cs(*bbox, 1.25) + abs_depth = [joint_cam[20, 2], joint_cam[41, 2]] + else: + rootnet_ann_data = rootnet_result[str(ann_id[0])] + bbox = np.array(rootnet_ann_data['bbox'], dtype=np.float32) + # the bboxes have been extended + center, scale = self._xywh2cs(*bbox, 1.0) + abs_depth = rootnet_ann_data['abs_depth'] + # 41: 'l_wrist', left hand root + # 20: 'r_wrist', right hand root + rel_root_depth = joint_cam[41, 2] - joint_cam[20, 2] + # if root is not valid, root-relative 3D depth is also invalid. + rel_root_valid = joint_valid[20] * joint_valid[41] + + # if root is not valid -> root-relative 3D pose is also not valid. + # Therefore, mark all joints as invalid + joint_valid[:20] *= joint_valid[20] + joint_valid[21:] *= joint_valid[41] + + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d[:, :2] = joint_img + joints_3d[:21, 2] = joint_cam[:21, 2] - joint_cam[20, 2] + joints_3d[21:, 2] = joint_cam[21:, 2] - joint_cam[41, 2] + joints_3d_visible[...] 
= np.minimum(1, joint_valid.reshape(-1, 1)) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'hand_type': hand_type, + 'hand_type_valid': hand_type_valid, + 'rel_root_depth': rel_root_depth, + 'rel_root_valid': rel_root_valid, + 'abs_depth': abs_depth, + 'joints_cam': joint_cam, + 'focal': focal, + 'princpt': principal_pt, + 'dataset': self.dataset_name, + 'bbox': bbox, + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='MPJPE', **kwargs): + """Evaluate interhand2d keypoint results. The pose prediction results + will be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - hand_type (np.ndarray[N, 4]): The first two dimensions are \ + hand type, scores is the last two dimensions. + - rel_root_depth (np.ndarray[N]): The relative depth of left \ + wrist and right wrist. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['Capture6/\ + 0012_aokay_upright/cam410061/image4996.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'MRRPE', 'MPJPE', 'Handedness_acc'. + + Returns: + dict: Evaluation results for evaluation metric. 
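# Illustrative sketch of the root-relative MPJPE computed in _report_metric
# further below: camera-space joints of each hand are centred on their wrist
# (index 20 for the right hand, 41 for the left) before taking the mean
# per-joint Euclidean error over visible joints.  Shapes are assumptions.
import numpy as np


def root_relative_mpjpe(pred_cam, gt_cam, visible):
    """pred_cam, gt_cam: (N, 42, 3); visible: (N, 42) boolean mask."""
    def center_on_wrists(joints):
        out = joints.copy()
        out[:, :21] -= joints[:, 20:21]    # right hand, root at index 20
        out[:, 21:] -= joints[:, 41:42]    # left hand, root at index 41
        return out

    errors = np.linalg.norm(
        center_on_wrists(pred_cam) - center_on_wrists(gt_cam), axis=-1)
    return errors[visible].mean()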
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['MRRPE', 'MPJPE', 'Handedness_acc'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result.get('preds') + if preds is None and 'MPJPE' in metrics: + raise KeyError('metric MPJPE is not supported') + + hand_type = result.get('hand_type') + if hand_type is None and 'Handedness_acc' in metrics: + raise KeyError('metric Handedness_acc is not supported') + + rel_root_depth = result.get('rel_root_depth') + if rel_root_depth is None and 'MRRPE' in metrics: + raise KeyError('metric MRRPE is not supported') + + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpt = { + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + } + + if preds is not None: + kpt['keypoints'] = preds[i, :, :3].tolist() + if hand_type is not None: + kpt['hand_type'] = hand_type[i][0:2].tolist() + kpt['hand_type_score'] = hand_type[i][2:4].tolist() + if rel_root_depth is not None: + kpt['rel_root_depth'] = float(rel_root_depth[i]) + + kpts.append(kpt) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + @staticmethod + def _get_accuracy(outputs, gts, masks): + """Get accuracy of multi-label classification. + + Note: + - batch_size: N + - label_num: C + + Args: + outputs (np.array[N, C]): predicted multi-label. + gts (np.array[N, C]): Groundtruth muti-label. + masks (np.array[N, ]): masked outputs will be ignored for + accuracy calculation. + + Returns: + float: mean accuracy + """ + acc = (outputs == gts).all(axis=1) + return np.mean(acc[masks]) + + def _report_metric(self, res_file, metrics): + """Keypoint evaluation. + + Args: + res_file (str): Json file stored prediction results. + metrics (str | list[str]): Metric to be performed. + Options: 'MRRPE', 'MPJPE', 'Handedness_acc'. + + Returns: + list: Evaluation results for evaluation metric. 
+ """ + info_str = [] + + with open(res_file, 'r') as fin: + preds = json.load(fin) + assert len(preds) == len(self.db) + + gts_rel_root = [] + preds_rel_root = [] + rel_root_masks = [] + gts_joint_coord_cam = [] + preds_joint_coord_cam = [] + single_masks = [] + interacting_masks = [] + all_masks = [] + gts_hand_type = [] + preds_hand_type = [] + hand_type_masks = [] + + for pred, item in zip(preds, self.db): + # mrrpe + if 'MRRPE' in metrics: + if item['hand_type'].all() and item['joints_3d_visible'][ + 20, 0] and item['joints_3d_visible'][41, 0]: + rel_root_masks.append(True) + + pred_left_root_img = np.array( + pred['keypoints'][41], dtype=np.float32)[None, :] + pred_left_root_img[:, 2] += item['abs_depth'][0] + pred[ + 'rel_root_depth'] + pred_left_root_cam = self._pixel2cam( + pred_left_root_img, item['focal'], item['princpt']) + + pred_right_root_img = np.array( + pred['keypoints'][20], dtype=np.float32)[None, :] + pred_right_root_img[:, 2] += item['abs_depth'][0] + pred_right_root_cam = self._pixel2cam( + pred_right_root_img, item['focal'], item['princpt']) + + preds_rel_root.append(pred_left_root_cam - + pred_right_root_cam) + gts_rel_root.append( + [item['joints_cam'][41] - item['joints_cam'][20]]) + else: + rel_root_masks.append(False) + preds_rel_root.append([[0., 0., 0.]]) + gts_rel_root.append([[0., 0., 0.]]) + + if 'MPJPE' in metrics: + pred_joint_coord_img = np.array( + pred['keypoints'], dtype=np.float32) + gt_joint_coord_cam = item['joints_cam'].copy() + + pred_joint_coord_img[:21, 2] += item['abs_depth'][0] + pred_joint_coord_img[21:, 2] += item['abs_depth'][1] + pred_joint_coord_cam = self._pixel2cam(pred_joint_coord_img, + item['focal'], + item['princpt']) + + pred_joint_coord_cam[:21] -= pred_joint_coord_cam[20] + pred_joint_coord_cam[21:] -= pred_joint_coord_cam[41] + gt_joint_coord_cam[:21] -= gt_joint_coord_cam[20] + gt_joint_coord_cam[21:] -= gt_joint_coord_cam[41] + + preds_joint_coord_cam.append(pred_joint_coord_cam) + gts_joint_coord_cam.append(gt_joint_coord_cam) + + mask = (np.array(item['joints_3d_visible'])[:, 0]) > 0 + + if item['hand_type'].all(): + single_masks.append( + np.zeros(self.ann_info['num_joints'], dtype=bool)) + interacting_masks.append(mask) + all_masks.append(mask) + else: + single_masks.append(mask) + interacting_masks.append( + np.zeros(self.ann_info['num_joints'], dtype=bool)) + all_masks.append(mask) + + if 'Handedness_acc' in metrics: + pred_hand_type = np.array(pred['hand_type'], dtype=int) + preds_hand_type.append(pred_hand_type) + gts_hand_type.append(item['hand_type']) + hand_type_masks.append(item['hand_type_valid'] > 0) + + gts_rel_root = np.array(gts_rel_root, dtype=np.float32) + preds_rel_root = np.array(preds_rel_root, dtype=np.float32) + rel_root_masks = np.array(rel_root_masks, dtype=bool)[:, None] + gts_joint_coord_cam = np.array(gts_joint_coord_cam, dtype=np.float32) + preds_joint_coord_cam = np.array( + preds_joint_coord_cam, dtype=np.float32) + single_masks = np.array(single_masks, dtype=bool) + interacting_masks = np.array(interacting_masks, dtype=bool) + all_masks = np.array(all_masks, dtype=bool) + gts_hand_type = np.array(gts_hand_type, dtype=int) + preds_hand_type = np.array(preds_hand_type, dtype=int) + hand_type_masks = np.array(hand_type_masks, dtype=bool) + + if 'MRRPE' in metrics: + info_str.append(('MRRPE', + keypoint_epe(preds_rel_root, gts_rel_root, + rel_root_masks))) + + if 'MPJPE' in metrics: + info_str.append(('MPJPE_all', + keypoint_epe(preds_joint_coord_cam, + gts_joint_coord_cam, all_masks))) + 
info_str.append(('MPJPE_single', + keypoint_epe(preds_joint_coord_cam, + gts_joint_coord_cam, single_masks))) + info_str.append( + ('MPJPE_interacting', + keypoint_epe(preds_joint_coord_cam, gts_joint_coord_cam, + interacting_masks))) + + if 'Handedness_acc' in metrics: + info_str.append(('Handedness_acc', + self._get_accuracy(preds_hand_type, gts_hand_type, + hand_type_masks))) + + return info_str diff --git a/mmpose/datasets/datasets/hand/onehand10k_dataset.py b/mmpose/datasets/datasets/hand/onehand10k_dataset.py new file mode 100644 index 0000000..9783cab --- /dev/null +++ b/mmpose/datasets/datasets/hand/onehand10k_dataset.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class OneHand10KDataset(Kpt2dSviewRgbImgTopDownDataset): + """OneHand10K dataset for top-down hand pose estimation. + + "Mask-pose Cascaded CNN for 2D Hand Pose Estimation from + Single Color Images", TCSVT'2019. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + OneHand10K keypoint indexes:: + + 0: 'wrist', + 1: 'thumb1', + 2: 'thumb2', + 3: 'thumb3', + 4: 'thumb4', + 5: 'forefinger1', + 6: 'forefinger2', + 7: 'forefinger3', + 8: 'forefinger4', + 9: 'middle_finger1', + 10: 'middle_finger2', + 11: 'middle_finger3', + 12: 'middle_finger4', + 13: 'ring_finger1', + 14: 'ring_finger2', + 15: 'ring_finger3', + 16: 'ring_finger4', + 17: 'pinky_finger1', + 18: 'pinky_finger2', + 19: 'pinky_finger3', + 20: 'pinky_finger4' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/onehand10k.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # use 1.25 padded bbox as input + center, scale = self._xywh2cs(*obj['bbox'][:4], 1.25) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate onehand10k keypoint results. The pose prediction results + will be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['Test/source/0.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
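# Illustrative sketch of the PCK metric these hand datasets report: a
# keypoint counts as correct when its error, normalized by the bounding-box
# size, falls below a threshold.  The 0.2 threshold and max(w, h)
# normalization are common defaults assumed here, not read from this file.
import numpy as np


def pck(preds, gts, visible, bbox_sizes, thr=0.2):
    """preds, gts: (N, K, 2); visible: (N, K); bbox_sizes: (N,)."""
    dists = np.linalg.norm(preds - gts, axis=-1)        # (N, K)
    normed = dists / bbox_sizes[:, None]
    correct = (normed < thr) & visible
    return correct.sum() / max(visible.sum(), 1)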
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/hand/panoptic_hand2d_dataset.py b/mmpose/datasets/datasets/hand/panoptic_hand2d_dataset.py new file mode 100644 index 0000000..c1d7fc6 --- /dev/null +++ b/mmpose/datasets/datasets/hand/panoptic_hand2d_dataset.py @@ -0,0 +1,208 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class PanopticDataset(Kpt2dSviewRgbImgTopDownDataset): + """Panoptic dataset for top-down hand pose estimation. + + "Hand Keypoint Detection in Single Images using Multiview + Bootstrapping", CVPR'2017. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Panoptic keypoint indexes:: + + 0: 'wrist', + 1: 'thumb1', + 2: 'thumb2', + 3: 'thumb3', + 4: 'thumb4', + 5: 'forefinger1', + 6: 'forefinger2', + 7: 'forefinger3', + 8: 'forefinger4', + 9: 'middle_finger1', + 10: 'middle_finger2', + 11: 'middle_finger3', + 12: 'middle_finger4', + 13: 'ring_finger1', + 14: 'ring_finger2', + 15: 'ring_finger3', + 16: 'ring_finger4', + 17: 'pinky_finger1', + 18: 'pinky_finger2', + 19: 'pinky_finger3', + 20: 'pinky_finger4' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/panoptic_hand2d.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # The bbox is the tightest bbox enclosing keypoints. + # The paper uses 2.2 bbox as the input, while + # we use 1.76 (2.2 * 0.8) bbox as the input. + center, scale = self._xywh2cs(*obj['bbox'][:4], 1.76) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'head_size': obj['head_size'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCKh', **kwargs): + """Evaluate panoptic keypoint results. The pose prediction results will + be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['hand_labels/\ + manual_test/000648952_02_l.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCKh', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
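# Illustrative sketch of the PCKh variant used here: the same idea as PCK,
# but the error is normalized by the 'head_size' stored per sample in
# _get_db above.  The 0.7 threshold below is an assumption for illustration.
import numpy as np


def pckh(preds, gts, visible, head_sizes, thr=0.7):
    """preds, gts: (N, K, 2); visible: (N, K); head_sizes: (N,)."""
    dists = np.linalg.norm(preds - gts, axis=-1) / head_sizes[:, None]
    correct = (dists < thr) & visible
    return correct.sum() / max(visible.sum(), 1)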
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCKh', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/hand/rhd2d_dataset.py b/mmpose/datasets/datasets/hand/rhd2d_dataset.py new file mode 100644 index 0000000..3667f5f --- /dev/null +++ b/mmpose/datasets/datasets/hand/rhd2d_dataset.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class Rhd2DDataset(Kpt2dSviewRgbImgTopDownDataset): + """Rendered Handpose Dataset for top-down hand pose estimation. + + "Learning to Estimate 3D Hand Pose from Single RGB Images", + ICCV'2017. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Rhd keypoint indexes:: + + 0: 'wrist', + 1: 'thumb1', + 2: 'thumb2', + 3: 'thumb3', + 4: 'thumb4', + 5: 'forefinger1', + 6: 'forefinger2', + 7: 'forefinger3', + 8: 'forefinger4', + 9: 'middle_finger1', + 10: 'middle_finger2', + 11: 'middle_finger3', + 12: 'middle_finger4', + 13: 'ring_finger1', + 14: 'ring_finger2', + 15: 'ring_finger3', + 16: 'ring_finger4', + 17: 'pinky_finger1', + 18: 'pinky_finger2', + 19: 'pinky_finger3', + 20: 'pinky_finger4' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/rhd2d.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['use_different_joint_weights'] = False + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # the ori image is 224x224 + center, scale = self._xywh2cs(*obj['bbox'][:4], padding=1.25) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate rhd keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1], area, score] + - image_paths (list[str]): For example, + ['training/rgb/00031426.jpg'] + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value diff --git a/mmpose/datasets/datasets/mesh/__init__.py b/mmpose/datasets/datasets/mesh/__init__.py new file mode 100644 index 0000000..14297c7 --- /dev/null +++ b/mmpose/datasets/datasets/mesh/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .mesh_adv_dataset import MeshAdversarialDataset +from .mesh_h36m_dataset import MeshH36MDataset +from .mesh_mix_dataset import MeshMixDataset +from .mosh_dataset import MoshDataset + +__all__ = [ + 'MeshH36MDataset', 'MoshDataset', 'MeshMixDataset', + 'MeshAdversarialDataset' +] diff --git a/mmpose/datasets/datasets/mesh/mesh_adv_dataset.py b/mmpose/datasets/datasets/mesh/mesh_adv_dataset.py new file mode 100644 index 0000000..cd9ba39 --- /dev/null +++ b/mmpose/datasets/datasets/mesh/mesh_adv_dataset.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from torch.utils.data import Dataset + +from mmpose.datasets.builder import DATASETS, build_dataset + + +@DATASETS.register_module() +class MeshAdversarialDataset(Dataset): + """Mix Dataset for the adversarial training in 3D human mesh estimation + task. + + The dataset combines data from two datasets and + return a dict containing data from two datasets. + + Args: + train_dataset (Dataset): Dataset for 3D human mesh estimation. + adversarial_dataset (Dataset): Dataset for adversarial learning, + provides real SMPL parameters. + """ + + def __init__(self, train_dataset, adversarial_dataset): + super().__init__() + self.train_dataset = build_dataset(train_dataset) + self.adversarial_dataset = build_dataset(adversarial_dataset) + self.length = len(self.train_dataset) + + def __len__(self): + """Get the size of the dataset.""" + return self.length + + def __getitem__(self, i): + """Given index, get the data from train dataset and randomly sample an + item from adversarial dataset. + + Return a dict containing data from train and adversarial dataset. 
+ """ + data = self.train_dataset[i] + ind_adv = np.random.randint( + low=0, high=len(self.adversarial_dataset), dtype=int) + data.update(self.adversarial_dataset[ind_adv % + len(self.adversarial_dataset)]) + return data diff --git a/mmpose/datasets/datasets/mesh/mesh_base_dataset.py b/mmpose/datasets/datasets/mesh/mesh_base_dataset.py new file mode 100644 index 0000000..79c8a8a --- /dev/null +++ b/mmpose/datasets/datasets/mesh/mesh_base_dataset.py @@ -0,0 +1,155 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp +import os +from abc import ABCMeta + +import numpy as np +from torch.utils.data import Dataset + +from mmpose.datasets.pipelines import Compose + + +class MeshBaseDataset(Dataset, metaclass=ABCMeta): + """Base dataset for 3D human mesh estimation task. In 3D humamesh + estimation task, all datasets share this BaseDataset for training and have + their own evaluate function. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + This dataset can only be used for training. + For evaluation, subclass should write an extra evaluate function. + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + test_mode=False): + + self.image_info = {} + self.ann_info = {} + + self.ann_file = ann_file + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['iuv_size'] = np.array(data_cfg['iuv_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + self.ann_info['flip_pairs'] = None + self.db = [] + self.pipeline = Compose(self.pipeline) + + # flip_pairs + # For all mesh dataset, we use 24 joints as CMR and SPIN. 
+ self.ann_info['flip_pairs'] = [[0, 5], [1, 4], [2, 3], [6, 11], + [7, 10], [8, 9], [20, 21], [22, 23]] + self.ann_info['use_different_joint_weights'] = False + assert self.ann_info['num_joints'] == 24 + self.ann_info['joint_weights'] = np.ones([24, 1], dtype=np.float32) + + self.ann_info['uv_type'] = data_cfg['uv_type'] + self.ann_info['use_IUV'] = data_cfg['use_IUV'] + uv_type = self.ann_info['uv_type'] + self.iuv_prefix = os.path.join(self.img_prefix, f'{uv_type}_IUV_gt') + self.db = self._get_db(ann_file) + + def _get_db(self, ann_file): + """Load dataset.""" + data = np.load(ann_file) + tmpl = dict( + image_file=None, + center=None, + scale=None, + rotation=0, + joints_2d=None, + joints_2d_visible=None, + joints_3d=None, + joints_3d_visible=None, + gender=None, + pose=None, + beta=None, + has_smpl=0, + iuv_file=None, + has_iuv=0) + gt_db = [] + + _imgnames = data['imgname'] + _scales = data['scale'].astype(np.float32) + _centers = data['center'].astype(np.float32) + dataset_len = len(_imgnames) + + # Get 2D keypoints + if 'part' in data.keys(): + _keypoints = data['part'].astype(np.float32) + else: + _keypoints = np.zeros((dataset_len, 24, 3), dtype=np.float32) + + # Get gt 3D joints, if available + if 'S' in data.keys(): + _joints_3d = data['S'].astype(np.float32) + else: + _joints_3d = np.zeros((dataset_len, 24, 4), dtype=np.float32) + + # Get gt SMPL parameters, if available + if 'pose' in data.keys() and 'shape' in data.keys(): + _poses = data['pose'].astype(np.float32) + _betas = data['shape'].astype(np.float32) + has_smpl = 1 + else: + _poses = np.zeros((dataset_len, 72), dtype=np.float32) + _betas = np.zeros((dataset_len, 10), dtype=np.float32) + has_smpl = 0 + + # Get gender data, if available + if 'gender' in data.keys(): + _genders = data['gender'] + _genders = np.array([str(g) != 'm' for g in _genders]).astype(int) + else: + _genders = -1 * np.ones(dataset_len).astype(int) + + # Get IUV image, if available + if 'iuv_names' in data.keys(): + _iuv_names = data['iuv_names'] + has_iuv = has_smpl + else: + _iuv_names = [''] * dataset_len + has_iuv = 0 + + for i in range(len(_imgnames)): + newitem = cp.deepcopy(tmpl) + newitem['image_file'] = os.path.join(self.img_prefix, _imgnames[i]) + newitem['scale'] = np.array([_scales[i], _scales[i]]) + newitem['center'] = _centers[i] + newitem['joints_2d'] = _keypoints[i, :, :2] + newitem['joints_2d_visible'] = _keypoints[i, :, -1][:, None] + newitem['joints_3d'] = _joints_3d[i, :, :3] + newitem['joints_3d_visible'] = _joints_3d[i, :, -1][:, None] + newitem['pose'] = _poses[i] + newitem['beta'] = _betas[i] + newitem['has_smpl'] = has_smpl + newitem['gender'] = _genders[i] + newitem['iuv_file'] = os.path.join(self.iuv_prefix, _iuv_names[i]) + newitem['has_iuv'] = has_iuv + gt_db.append(newitem) + return gt_db + + def __len__(self, ): + """Get the size of the dataset.""" + return len(self.db) + + def __getitem__(self, idx): + """Get the sample given index.""" + results = cp.deepcopy(self.db[idx]) + results['ann_info'] = self.ann_info + return self.pipeline(results) diff --git a/mmpose/datasets/datasets/mesh/mesh_h36m_dataset.py b/mmpose/datasets/datasets/mesh/mesh_h36m_dataset.py new file mode 100644 index 0000000..9ac9ead --- /dev/null +++ b/mmpose/datasets/datasets/mesh/mesh_h36m_dataset.py @@ -0,0 +1,101 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
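+# Note (illustrative): MeshH36MDataset below inherits its data loading from
+# MeshBaseDataset above, which reads a .npz annotation file. A minimal
+# synthetic file (hypothetical name and values) only needs 'imgname',
+# 'scale' and 'center'; the optional keys ('part', 'S', 'pose', 'shape',
+# 'gender', 'iuv_names') add 2D keypoints, 3D joints, SMPL parameters,
+# gender and IUV supervision. For example:
+#
+#     >>> import numpy as np
+#     >>> np.savez('toy_mesh_ann.npz',
+#     ...          imgname=np.array(['images/000001.jpg']),
+#     ...          scale=np.array([1.0], dtype=np.float32),
+#     ...          center=np.array([[112.0, 112.0]], dtype=np.float32))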
+import os +from collections import OrderedDict + +import json_tricks as json +import numpy as np + +from mmpose.core.evaluation import keypoint_mpjpe +from mmpose.datasets.builder import DATASETS +from .mesh_base_dataset import MeshBaseDataset + + +@DATASETS.register_module() +class MeshH36MDataset(MeshBaseDataset): + """Human3.6M Dataset for 3D human mesh estimation. It inherits all function + from MeshBaseDataset and has its own evaluate function. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def evaluate(self, outputs, res_folder, metric='joint_error', logger=None): + """Evaluate 3D keypoint results.""" + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['joint_error'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + res_file = os.path.join(res_folder, 'result_keypoints.json') + kpts = [] + for out in outputs: + for (keypoints, image_path) in zip(out['keypoints_3d'], + out['image_path']): + kpts.append({ + 'keypoints': keypoints.tolist(), + 'image': image_path, + }) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file) + name_value = OrderedDict(info_str) + return name_value + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def _report_metric(self, res_file): + """Keypoint evaluation. 
+ + Report mean per joint position error (MPJPE) and mean per joint + position error after rigid alignment (MPJPE-PA) + """ + + with open(res_file, 'r') as fin: + preds = json.load(fin) + assert len(preds) == len(self.db) + + pred_joints_3d = [pred['keypoints'] for pred in preds] + gt_joints_3d = [item['joints_3d'] for item in self.db] + gt_joints_visible = [item['joints_3d_visible'] for item in self.db] + + pred_joints_3d = np.array(pred_joints_3d) + gt_joints_3d = np.array(gt_joints_3d) + gt_joints_visible = np.array(gt_joints_visible) + + # we only evaluate on 14 lsp joints + joint_mapper = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18] + pred_joints_3d = pred_joints_3d[:, joint_mapper, :] + pred_pelvis = (pred_joints_3d[:, 2] + pred_joints_3d[:, 3]) / 2 + pred_joints_3d = pred_joints_3d - pred_pelvis[:, None, :] + + gt_joints_3d = gt_joints_3d[:, joint_mapper, :] + gt_pelvis = (gt_joints_3d[:, 2] + gt_joints_3d[:, 3]) / 2 + gt_joints_3d = gt_joints_3d - gt_pelvis[:, None, :] + gt_joints_visible = gt_joints_visible[:, joint_mapper, 0] > 0 + + mpjpe = keypoint_mpjpe(pred_joints_3d, gt_joints_3d, gt_joints_visible) + mpjpe_pa = keypoint_mpjpe( + pred_joints_3d, + gt_joints_3d, + gt_joints_visible, + alignment='procrustes') + + info_str = [] + info_str.append(('MPJPE', mpjpe * 1000)) + info_str.append(('MPJPE-PA', mpjpe_pa * 1000)) + return info_str diff --git a/mmpose/datasets/datasets/mesh/mesh_mix_dataset.py b/mmpose/datasets/datasets/mesh/mesh_mix_dataset.py new file mode 100644 index 0000000..244a7c3 --- /dev/null +++ b/mmpose/datasets/datasets/mesh/mesh_mix_dataset.py @@ -0,0 +1,73 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + +import numpy as np +from torch.utils.data import ConcatDataset, Dataset, WeightedRandomSampler + +from mmpose.datasets.builder import DATASETS +from .mesh_base_dataset import MeshBaseDataset + + +@DATASETS.register_module() +class MeshMixDataset(Dataset, metaclass=ABCMeta): + """Mix Dataset for 3D human mesh estimation. + + The dataset combines data from multiple datasets (MeshBaseDataset) and + sample the data from different datasets with the provided proportions. + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Args: + configs (list): List of configs for multiple datasets. + partition (list): Sample proportion of multiple datasets. The length + of partition should be same with that of configs. The elements + of it should be non-negative and is not necessary summing up to + one. 
+ + Example: + >>> from mmpose.datasets import MeshMixDataset + >>> data_cfg = dict( + >>> image_size=[256, 256], + >>> iuv_size=[64, 64], + >>> num_joints=24, + >>> use_IUV=True, + >>> uv_type='BF') + >>> + >>> mix_dataset = MeshMixDataset( + >>> configs=[ + >>> dict( + >>> ann_file='tests/data/h36m/test_h36m.npz', + >>> img_prefix='tests/data/h36m', + >>> data_cfg=data_cfg, + >>> pipeline=[]), + >>> dict( + >>> ann_file='tests/data/h36m/test_h36m.npz', + >>> img_prefix='tests/data/h36m', + >>> data_cfg=data_cfg, + >>> pipeline=[]), + >>> ], + >>> partition=[0.6, 0.4]) + """ + + def __init__(self, configs, partition): + """Load data from multiple datasets.""" + assert min(partition) >= 0 + datasets = [MeshBaseDataset(**cfg) for cfg in configs] + self.dataset = ConcatDataset(datasets) + self.length = max(len(ds) for ds in datasets) + weights = [ + np.ones(len(ds)) * p / len(ds) + for (p, ds) in zip(partition, datasets) + ] + weights = np.concatenate(weights, axis=0) + self.sampler = WeightedRandomSampler(weights, 1) + + def __len__(self): + """Get the size of the dataset.""" + return self.length + + def __getitem__(self, idx): + """Given index, sample the data from multiple datasets with the given + proportion.""" + idx_new = list(self.sampler)[0] + return self.dataset[idx_new] diff --git a/mmpose/datasets/datasets/mesh/mosh_dataset.py b/mmpose/datasets/datasets/mesh/mosh_dataset.py new file mode 100644 index 0000000..3185265 --- /dev/null +++ b/mmpose/datasets/datasets/mesh/mosh_dataset.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp +from abc import ABCMeta + +import numpy as np +from torch.utils.data import Dataset + +from mmpose.datasets.builder import DATASETS +from mmpose.datasets.pipelines import Compose + + +@DATASETS.register_module() +class MoshDataset(Dataset, metaclass=ABCMeta): + """Mosh Dataset for the adversarial training in 3D human mesh estimation + task. + + The dataset return a dict containing real-world SMPL parameters. + + Args: + ann_file (str): Path to the annotation file. + pipeline (list[dict | callable]): A sequence of data transforms. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, ann_file, pipeline, test_mode=False): + + self.ann_file = ann_file + self.pipeline = pipeline + self.test_mode = test_mode + + self.db = self._get_db(ann_file) + self.pipeline = Compose(self.pipeline) + + @staticmethod + def _get_db(ann_file): + """Load dataset.""" + data = np.load(ann_file) + _betas = data['shape'].astype(np.float32) + _poses = data['pose'].astype(np.float32) + tmpl = dict( + pose=None, + beta=None, + ) + gt_db = [] + dataset_len = len(_betas) + + for i in range(dataset_len): + newitem = cp.deepcopy(tmpl) + newitem['pose'] = _poses[i] + newitem['beta'] = _betas[i] + gt_db.append(newitem) + return gt_db + + def __len__(self, ): + """Get the size of the dataset.""" + return len(self.db) + + def __getitem__(self, idx): + """Get the sample given index.""" + item = cp.deepcopy(self.db[idx]) + trivial, pose, beta = \ + np.zeros(3, dtype=np.float32), item['pose'], item['beta'] + results = { + 'mosh_theta': + np.concatenate((trivial, pose, beta), axis=0).astype(np.float32) + } + return self.pipeline(results) diff --git a/mmpose/datasets/datasets/top_down/__init__.py b/mmpose/datasets/datasets/top_down/__init__.py new file mode 100644 index 0000000..cc5b46a --- /dev/null +++ b/mmpose/datasets/datasets/top_down/__init__.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +from .topdown_aic_dataset import TopDownAicDataset +from .topdown_coco_dataset import TopDownCocoDataset +from .topdown_coco_wholebody_dataset import TopDownCocoWholeBodyDataset +from .topdown_crowdpose_dataset import TopDownCrowdPoseDataset +from .topdown_h36m_dataset import TopDownH36MDataset +from .topdown_halpe_dataset import TopDownHalpeDataset +from .topdown_jhmdb_dataset import TopDownJhmdbDataset +from .topdown_mhp_dataset import TopDownMhpDataset +from .topdown_mpii_dataset import TopDownMpiiDataset +from .topdown_mpii_trb_dataset import TopDownMpiiTrbDataset +from .topdown_ochuman_dataset import TopDownOCHumanDataset +from .topdown_posetrack18_dataset import TopDownPoseTrack18Dataset +from .topdown_posetrack18_video_dataset import TopDownPoseTrack18VideoDataset + +__all__ = [ + 'TopDownAicDataset', + 'TopDownCocoDataset', + 'TopDownCocoWholeBodyDataset', + 'TopDownCrowdPoseDataset', + 'TopDownMpiiDataset', + 'TopDownMpiiTrbDataset', + 'TopDownOCHumanDataset', + 'TopDownPoseTrack18Dataset', + 'TopDownJhmdbDataset', + 'TopDownMhpDataset', + 'TopDownH36MDataset', + 'TopDownHalpeDataset', + 'TopDownPoseTrack18VideoDataset', +] diff --git a/mmpose/datasets/datasets/top_down/topdown_aic_dataset.py b/mmpose/datasets/datasets/top_down/topdown_aic_dataset.py new file mode 100644 index 0000000..13c41df --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_aic_dataset.py @@ -0,0 +1,112 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv import Config +from xtcocotools.cocoeval import COCOeval + +from ...builder import DATASETS +from .topdown_coco_dataset import TopDownCocoDataset + + +@DATASETS.register_module() +class TopDownAicDataset(TopDownCocoDataset): + """AicDataset dataset for top-down pose estimation. + + "AI Challenger : A Large-scale Dataset for Going Deeper + in Image Understanding", arXiv'2017. + More details can be found in the `paper + `__ + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + AIC keypoint indexes:: + + 0: "right_shoulder", + 1: "right_elbow", + 2: "right_wrist", + 3: "left_shoulder", + 4: "left_elbow", + 5: "left_wrist", + 6: "right_hip", + 7: "right_knee", + 8: "right_ankle", + 9: "left_hip", + 10: "left_knee", + 11: "left_ankle", + 12: "head_top", + 13: "neck" + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/aic.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(TopDownCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db = self._load_coco_keypoint_annotations() + return gt_db + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval( + self.coco, coco_det, 'keypoints', self.sigmas, use_area=False) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/top_down/topdown_base_dataset.py b/mmpose/datasets/datasets/top_down/topdown_base_dataset.py new file mode 100644 index 0000000..dc99576 --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_base_dataset.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + +from torch.utils.data import Dataset + + +class TopDownBaseDataset(Dataset, metaclass=ABCMeta): + """This class has been deprecated and replaced by + Kpt2dSviewRgbImgTopDownDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'TopDownBaseDataset has been replaced by ' + 'Kpt2dSviewRgbImgTopDownDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/663 for details.') + ) diff --git a/mmpose/datasets/datasets/top_down/topdown_coco_dataset.py b/mmpose/datasets/datasets/top_down/topdown_coco_dataset.py new file mode 100644 index 0000000..664c881 --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_coco_dataset.py @@ -0,0 +1,405 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning +from xtcocotools.cocoeval import COCOeval + +from ....core.post_processing import oks_nms, soft_oks_nms +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class TopDownCocoDataset(Kpt2dSviewRgbImgTopDownDataset): + """CocoDataset dataset for top-down pose estimation. + + "Microsoft COCO: Common Objects in Context", ECCV'2014. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. 
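+    Example (illustrative): the ``data_cfg`` keys read in this file, with
+    typical placeholder values; the base class additionally expects keys
+    such as ``image_size`` and ``num_joints``, and the paths below are
+    hypothetical::
+
+        >>> data_cfg = dict(
+        ...     use_gt_bbox=True,
+        ...     bbox_file='',
+        ...     det_bbox_thr=0.0,
+        ...     use_nms=True,
+        ...     soft_nms=False,
+        ...     nms_thr=1.0,
+        ...     oks_thr=0.9,
+        ...     vis_thr=0.2)
+        >>> # With annotations in place the dataset could then be built as
+        >>> # TopDownCocoDataset('data/coco/annotations/person_keypoints_val2017.json',
+        >>> #                    'data/coco/val2017/', data_cfg, pipeline=[])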
+ + COCO keypoint indexes:: + + 0: 'nose', + 1: 'left_eye', + 2: 'right_eye', + 3: 'left_ear', + 4: 'right_ear', + 5: 'left_shoulder', + 6: 'right_shoulder', + 7: 'left_elbow', + 8: 'right_elbow', + 9: 'left_wrist', + 10: 'right_wrist', + 11: 'left_hip', + 12: 'right_hip', + 13: 'left_knee', + 14: 'right_knee', + 15: 'left_ankle', + 16: 'right_ankle' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/coco.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + if (not self.test_mode) or self.use_gt_bbox: + # use ground truth bbox + gt_db = self._load_coco_keypoint_annotations() + else: + # use bbox from detection + gt_db = self._load_coco_person_detection_results() + return gt_db + + def _load_coco_keypoint_annotations(self): + """Ground truth bbox and keypoints.""" + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. 
+ + Note: + bbox:[x1, y1, w, h] + + Args: + img_id: coco image id + + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = self.ann_info['num_joints'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'bbox': obj['clean_bbox'][:4], + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + + return rec + + def _load_coco_person_detection_results(self): + """Load coco person detection results.""" + num_joints = self.ann_info['num_joints'] + all_boxes = None + with open(self.bbox_file, 'r') as f: + all_boxes = json.load(f) + + if not all_boxes: + raise ValueError('=> Load %s fail!' % self.bbox_file) + + print(f'=> Total boxes: {len(all_boxes)}') + + kpt_db = [] + bbox_id = 0 + for det_res in all_boxes: + if det_res['category_id'] != 1: + continue + + image_file = osp.join(self.img_prefix, + self.id2name[det_res['image_id']]) + box = det_res['bbox'] + score = det_res['score'] + + if score < self.det_bbox_thr: + continue + + center, scale = self._xywh2cs(*box[:4]) + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.ones((num_joints, 3), dtype=np.float32) + kpt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'bbox': box[:4], + 'bbox_score': score, + 'dataset': self.dataset_name, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + print(f'=> Total boxes after filter ' + f'low score@{self.det_bbox_thr}: {bbox_id}') + return kpt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mAP', **kwargs): + """Evaluate coco keypoint results. The pose prediction results will be + saved in ``${res_folder}/result_keypoints.json``. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. 
+ - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['data/coco/val2017\ + /000000393226.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap + - bbox_id (list(int)). + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'mAP'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['mAP'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = defaultdict(list) + + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + kpts[image_id].append({ + 'keypoints': preds[i], + 'center': boxes[i][0:2], + 'scale': boxes[i][2:4], + 'area': boxes[i][4], + 'score': boxes[i][5], + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + # rescoring and oks nms + num_joints = self.ann_info['num_joints'] + vis_thr = self.vis_thr + oks_thr = self.oks_thr + valid_kpts = [] + for image_id in kpts.keys(): + img_kpts = kpts[image_id] + for n_p in img_kpts: + box_score = n_p['score'] + kpt_score = 0 + valid_num = 0 + for n_jt in range(0, num_joints): + t_s = n_p['keypoints'][n_jt][2] + if t_s > vis_thr: + kpt_score = kpt_score + t_s + valid_num = valid_num + 1 + if valid_num != 0: + kpt_score = kpt_score / valid_num + # rescoring + n_p['score'] = kpt_score * box_score + + if self.use_nms: + nms = soft_oks_nms if self.soft_nms else oks_nms + keep = nms(img_kpts, oks_thr, sigmas=self.sigmas) + valid_kpts.append([img_kpts[_keep] for _keep in keep]) + else: + valid_kpts.append(img_kpts) + + self._write_coco_keypoint_results(valid_kpts, res_file) + + info_str = self._do_python_keypoint_eval(res_file) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + def _write_coco_keypoint_results(self, keypoints, res_file): + """Write results into a json file.""" + data_pack = [{ + 'cat_id': self._class_to_coco_ind[cls], + 'cls_ind': cls_ind, + 'cls': cls, + 'ann_type': 'keypoints', + 'keypoints': keypoints + } for cls_ind, cls in enumerate(self.classes) + if not cls == '__background__'] + + results = self._coco_keypoint_results_one_category_kernel(data_pack[0]) + + with open(res_file, 'w') as f: + json.dump(results, f, sort_keys=True, indent=4) + + def _coco_keypoint_results_one_category_kernel(self, data_pack): + """Get coco keypoint results.""" + cat_id = data_pack['cat_id'] + keypoints = data_pack['keypoints'] + cat_results = [] + + for img_kpts in keypoints: + if len(img_kpts) == 0: + continue + + _key_points = np.array( + [img_kpt['keypoints'] for img_kpt in img_kpts]) + key_points = _key_points.reshape(-1, + self.ann_info['num_joints'] * 3) + + result = [{ + 'image_id': img_kpt['image_id'], + 'category_id': cat_id, + 'keypoints': key_point.tolist(), + 'score': 
float(img_kpt['score']), + 'center': img_kpt['center'].tolist(), + 'scale': img_kpt['scale'].tolist() + } for img_kpt, key_point in zip(img_kpts, key_points)] + + cat_results.extend(result) + + return cat_results + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval(self.coco, coco_det, 'keypoints', self.sigmas) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + for img_id, persons in kpts.items(): + num = len(persons) + kpts[img_id] = sorted(kpts[img_id], key=lambda x: x[key]) + for i in range(num - 1, 0, -1): + if kpts[img_id][i][key] == kpts[img_id][i - 1][key]: + del kpts[img_id][i] + + return kpts diff --git a/mmpose/datasets/datasets/top_down/topdown_coco_wholebody_dataset.py b/mmpose/datasets/datasets/top_down/topdown_coco_wholebody_dataset.py new file mode 100644 index 0000000..791a3c5 --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_coco_wholebody_dataset.py @@ -0,0 +1,274 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import warnings + +import numpy as np +from mmcv import Config +from xtcocotools.cocoeval import COCOeval + +from ...builder import DATASETS +from .topdown_coco_dataset import TopDownCocoDataset + + +@DATASETS.register_module() +class TopDownCocoWholeBodyDataset(TopDownCocoDataset): + """CocoWholeBodyDataset dataset for top-down pose estimation. + + "Whole-Body Human Pose Estimation in the Wild", ECCV'2020. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + COCO-WholeBody keypoint indexes:: + + 0-16: 17 body keypoints, + 17-22: 6 foot keypoints, + 23-90: 68 face keypoints, + 91-132: 42 hand keypoints + + In total, we have 133 keypoints for wholebody pose estimation. + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/coco_wholebody.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(TopDownCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.body_num = 17 + self.foot_num = 6 + self.face_num = 68 + self.left_hand_num = 21 + self.right_hand_num = 21 + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. + + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = self.ann_info['num_joints'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + rec = [] + bbox_id = 0 + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints'] + obj['foot_kpts'] + + obj['face_kpts'] + obj['lefthand_kpts'] + + obj['righthand_kpts']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3] > 0) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = os.path.join(self.img_prefix, self.id2name[img_id]) + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + + return rec + + def _coco_keypoint_results_one_category_kernel(self, data_pack): + """Get coco keypoint results.""" + cat_id = data_pack['cat_id'] + keypoints = data_pack['keypoints'] + cat_results = [] + + for img_kpts in keypoints: + if len(img_kpts) == 0: + continue + + _key_points = np.array( + [img_kpt['keypoints'] for img_kpt in img_kpts]) + key_points = _key_points.reshape(-1, + self.ann_info['num_joints'] * 3) + + cuts = np.cumsum([ + 0, self.body_num, self.foot_num, self.face_num, + self.left_hand_num, self.right_hand_num + ]) * 3 + + result = [{ + 'image_id': img_kpt['image_id'], + 'category_id': cat_id, + 'keypoints': key_point[cuts[0]:cuts[1]].tolist(), + 'foot_kpts': key_point[cuts[1]:cuts[2]].tolist(), + 'face_kpts': key_point[cuts[2]:cuts[3]].tolist(), + 'lefthand_kpts': key_point[cuts[3]:cuts[4]].tolist(), + 'righthand_kpts': 
key_point[cuts[4]:cuts[5]].tolist(), + 'score': float(img_kpt['score']), + 'center': img_kpt['center'].tolist(), + 'scale': img_kpt['scale'].tolist() + } for img_kpt, key_point in zip(img_kpts, key_points)] + + cat_results.extend(result) + + return cat_results + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + + cuts = np.cumsum([ + 0, self.body_num, self.foot_num, self.face_num, self.left_hand_num, + self.right_hand_num + ]) + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_body', + self.sigmas[cuts[0]:cuts[1]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_foot', + self.sigmas[cuts[1]:cuts[2]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_face', + self.sigmas[cuts[2]:cuts[3]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_lefthand', + self.sigmas[cuts[3]:cuts[4]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_righthand', + self.sigmas[cuts[4]:cuts[5]], + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_wholebody', + self.sigmas, + use_area=True) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/top_down/topdown_crowdpose_dataset.py b/mmpose/datasets/datasets/top_down/topdown_crowdpose_dataset.py new file mode 100644 index 0000000..b9b196f --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_crowdpose_dataset.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv import Config +from xtcocotools.cocoeval import COCOeval + +from ...builder import DATASETS +from .topdown_coco_dataset import TopDownCocoDataset + + +@DATASETS.register_module() +class TopDownCrowdPoseDataset(TopDownCocoDataset): + """CrowdPoseDataset dataset for top-down pose estimation. + + "CrowdPose: Efficient Crowded Scenes Pose Estimation and + A New Benchmark", CVPR'2019. + More details can be found in the `paper + `__. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + CrowdPose keypoint indexes:: + + 0: 'left_shoulder', + 1: 'right_shoulder', + 2: 'left_elbow', + 3: 'right_elbow', + 4: 'left_wrist', + 5: 'right_wrist', + 6: 'left_hip', + 7: 'right_hip', + 8: 'left_knee', + 9: 'right_knee', + 10: 'left_ankle', + 11: 'right_ankle', + 12: 'top_head', + 13: 'neck' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. 
+ dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/crowdpose.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(TopDownCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval( + self.coco, + coco_det, + 'keypoints_crowd', + self.sigmas, + use_area=False) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AR', 'AR .5', 'AR .75', 'AP(E)', 'AP(M)', + 'AP(H)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/top_down/topdown_h36m_dataset.py b/mmpose/datasets/datasets/top_down/topdown_h36m_dataset.py new file mode 100644 index 0000000..6bc49e3 --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_h36m_dataset.py @@ -0,0 +1,206 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning + +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class TopDownH36MDataset(Kpt2dSviewRgbImgTopDownDataset): + """Human3.6M dataset for top-down 2D pose estimation. + + "Human3.6M: Large Scale Datasets and Predictive Methods for 3D Human + Sensing in Natural Environments", TPAMI`2014. + More details can be found in the `paper + `__. + + Human3.6M keypoint indexes:: + + 0: 'root (pelvis)', + 1: 'right_hip', + 2: 'right_knee', + 3: 'right_foot', + 4: 'left_hip', + 5: 'left_knee', + 6: 'left_foot', + 7: 'spine', + 8: 'thorax', + 9: 'neck_base', + 10: 'head', + 11: 'left_shoulder', + 12: 'left_elbow', + 13: 'left_wrist', + 14: 'right_shoulder', + 15: 'right_elbow', + 16: 'right_wrist' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/h36m.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + gt_db = [] + bbox_id = 0 + num_joints = self.ann_info['num_joints'] + for img_id in self.img_ids: + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + # use 1.25 padded bbox as input + center, scale = self._xywh2cs(*obj['bbox'][:4]) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + gt_db.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox': obj['bbox'], + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate human3.6m 2d keypoint results. The pose prediction results + will be saved in `${res_folder}/result_keypoints.json`. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], + scale[1],area, score] + - image_paths (list[str]): For example, ['data/coco/val2017 + /000000393226.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap + - bbox_id (list(int)). + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'PCK'. + + Returns: + dict: Evaluation results for evaluation metric. 
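+        Example (illustrative; the prefix and file name are hypothetical) of
+        how a prediction is matched back to an image id via ``name2id``::
+
+            >>> img_prefix = 'data/h36m/images/'
+            >>> image_path = 'data/h36m/images/S11_Walking.54138969_000001.jpg'
+            >>> image_path[len(img_prefix):]   # key used for the name2id lookup
+            'S11_Walking.54138969_000001.jpg'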
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) diff --git a/mmpose/datasets/datasets/top_down/topdown_halpe_dataset.py b/mmpose/datasets/datasets/top_down/topdown_halpe_dataset.py new file mode 100644 index 0000000..7042daa --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_halpe_dataset.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv import Config + +from ...builder import DATASETS +from .topdown_coco_dataset import TopDownCocoDataset + + +@DATASETS.register_module() +class TopDownHalpeDataset(TopDownCocoDataset): + """HalpeDataset for top-down pose estimation. + + 'https://github.com/Fang-Haoshu/Halpe-FullBody' + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + Halpe keypoint indexes:: + + 0-19: 20 body keypoints, + 20-25: 6 foot keypoints, + 26-93: 68 face keypoints, + 94-135: 42 hand keypoints + + In total, we have 136 keypoints for wholebody pose estimation. + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/halpe.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(TopDownCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.ann_info['use_different_joint_weights'] = False + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') diff --git a/mmpose/datasets/datasets/top_down/topdown_jhmdb_dataset.py b/mmpose/datasets/datasets/top_down/topdown_jhmdb_dataset.py new file mode 100644 index 0000000..5204f04 --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_jhmdb_dataset.py @@ -0,0 +1,361 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.core.evaluation.top_down_eval import keypoint_pck_accuracy +from ...builder import DATASETS +from .topdown_coco_dataset import TopDownCocoDataset + + +@DATASETS.register_module() +class TopDownJhmdbDataset(TopDownCocoDataset): + """JhmdbDataset dataset for top-down pose estimation. + + "Towards understanding action recognition", ICCV'2013. + More details can be found in the `paper + `__ + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + sub-JHMDB keypoint indexes:: + + 0: "neck", + 1: "belly", + 2: "head", + 3: "right_shoulder", + 4: "left_shoulder", + 5: "right_hip", + 6: "left_hip", + 7: "right_elbow", + 8: "left_elbow", + 9: "right_knee", + 10: "left_knee", + 11: "right_wrist", + 12: "left_wrist", + 13: "right_ankle", + 14: "left_ankle" + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. 
' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/jhmdb.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(TopDownCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db = self._load_coco_keypoint_annotations() + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. + + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = self.ann_info['num_joints'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + # JHMDB uses matlab format, index is 1-based, + # we should first convert to 0-based index + x -= 1 + y -= 1 + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + rec = [] + bbox_id = 0 + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + + # JHMDB uses matlab format, index is 1-based, + # we should first convert to 0-based index + joints_3d[:, :2] = keypoints[:, :2] - 1 + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'bbox': obj['clean_bbox'][:4], + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': f'{img_id}_{bbox_id:03}' + }) + bbox_id = bbox_id + 1 + + return rec + + def _write_keypoint_results(self, keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def _report_metric(self, res_file, metrics, pck_thr=0.2): + """Keypoint evaluation. + + Args: + res_file (str): Json file stored prediction results. + metrics (str | list[str]): Metric to be performed. + Options: 'PCK', 'PCKh', 'AUC', 'EPE'. + pck_thr (float): PCK threshold, default as 0.2. + pckh_thr (float): PCKh threshold, default as 0.7. + auc_nor (float): AUC normalization factor, default as 30 pixel. + + Returns: + List: Evaluation results for evaluation metric. 
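+        For ``metrics=['PCK']`` the returned list takes the following form
+        (scores are placeholders; with ``'tPCK'`` the same names end in
+        ``tPCK`` instead)::
+
+            [('Head PCK', 0.95), ('Sho PCK', 0.93), ('Elb PCK', 0.91),
+             ('Wri PCK', 0.88), ('Hip PCK', 0.94), ('Knee PCK', 0.92),
+             ('Ank PCK', 0.90), ('Mean PCK', 0.92)]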
+ """ + info_str = [] + + with open(res_file, 'r') as fin: + preds = json.load(fin) + assert len(preds) == len(self.db) + + outputs = [] + gts = [] + masks = [] + threshold_bbox = [] + threshold_torso = [] + + for pred, item in zip(preds, self.db): + outputs.append(np.array(pred['keypoints'])[:, :-1]) + gts.append(np.array(item['joints_3d'])[:, :-1]) + masks.append((np.array(item['joints_3d_visible'])[:, 0]) > 0) + if 'PCK' in metrics: + bbox = np.array(item['bbox']) + bbox_thr = np.max(bbox[2:]) + threshold_bbox.append(np.array([bbox_thr, bbox_thr])) + + if 'tPCK' in metrics: + torso_thr = np.linalg.norm(item['joints_3d'][4, :2] - + item['joints_3d'][5, :2]) + if torso_thr < 1: + torso_thr = np.linalg.norm( + np.array(pred['keypoints'])[4, :2] - + np.array(pred['keypoints'])[5, :2]) + warnings.warn('Torso Size < 1.') + threshold_torso.append(np.array([torso_thr, torso_thr])) + + outputs = np.array(outputs) + gts = np.array(gts) + masks = np.array(masks) + threshold_bbox = np.array(threshold_bbox) + threshold_torso = np.array(threshold_torso) + + if 'PCK' in metrics: + pck_p, pck, _ = keypoint_pck_accuracy(outputs, gts, masks, pck_thr, + threshold_bbox) + + stats_names = [ + 'Head PCK', 'Sho PCK', 'Elb PCK', 'Wri PCK', 'Hip PCK', + 'Knee PCK', 'Ank PCK', 'Mean PCK' + ] + + stats = [ + pck_p[2], 0.5 * pck_p[3] + 0.5 * pck_p[4], + 0.5 * pck_p[7] + 0.5 * pck_p[8], + 0.5 * pck_p[11] + 0.5 * pck_p[12], + 0.5 * pck_p[5] + 0.5 * pck_p[6], + 0.5 * pck_p[9] + 0.5 * pck_p[10], + 0.5 * pck_p[13] + 0.5 * pck_p[14], pck + ] + + info_str.extend(list(zip(stats_names, stats))) + + if 'tPCK' in metrics: + pck_p, pck, _ = keypoint_pck_accuracy(outputs, gts, masks, pck_thr, + threshold_torso) + + stats_names = [ + 'Head tPCK', 'Sho tPCK', 'Elb tPCK', 'Wri tPCK', 'Hip tPCK', + 'Knee tPCK', 'Ank tPCK', 'Mean tPCK' + ] + + stats = [ + pck_p[2], 0.5 * pck_p[3] + 0.5 * pck_p[4], + 0.5 * pck_p[7] + 0.5 * pck_p[8], + 0.5 * pck_p[11] + 0.5 * pck_p[12], + 0.5 * pck_p[5] + 0.5 * pck_p[6], + 0.5 * pck_p[9] + 0.5 * pck_p[10], + 0.5 * pck_p[13] + 0.5 * pck_p[14], pck + ] + + info_str.extend(list(zip(stats_names, stats))) + + return info_str + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCK', **kwargs): + """Evaluate onehand10k keypoint results. The pose prediction results + will be saved in `${res_folder}/result_keypoints.json`. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_path (list[str]) + - output_heatmap (np.ndarray[N, K, H, W]): model outputs. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'tPCK'. + PCK means normalized by the bounding boxes, while tPCK + means normalized by the torso size. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'tPCK'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + # convert 0-based index to 1-based index, + # and get the first two dimensions. + preds[..., :2] += 1.0 + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + kpts.append({ + 'keypoints': preds[i], + 'center': boxes[i][0:2], + 'scale': boxes[i][2:4], + 'area': boxes[i][4], + 'score': boxes[i][5], + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts diff --git a/mmpose/datasets/datasets/top_down/topdown_mhp_dataset.py b/mmpose/datasets/datasets/top_down/topdown_mhp_dataset.py new file mode 100644 index 0000000..050824a --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_mhp_dataset.py @@ -0,0 +1,125 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv import Config +from xtcocotools.cocoeval import COCOeval + +from ...builder import DATASETS +from .topdown_coco_dataset import TopDownCocoDataset + + +@DATASETS.register_module() +class TopDownMhpDataset(TopDownCocoDataset): + """MHPv2.0 dataset for top-down pose estimation. + + "Understanding Humans in Crowded Scenes: Deep Nested Adversarial + Learning and A New Benchmark for Multi-Human Parsing", ACM MM'2018. + More details can be found in the `paper + `__ + + Note that, the evaluation metric used here is mAP (adapted from COCO), + which may be different from the official evaluation codes. + 'https://github.com/ZhaoJ9014/Multi-Human-Parsing/tree/master/' + 'Evaluation/Multi-Human-Pose' + Please be cautious if you use the results in papers. + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + MHP keypoint indexes:: + + 0: "right ankle", + 1: "right knee", + 2: "right hip", + 3: "left hip", + 4: "left knee", + 5: "left ankle", + 6: "pelvis", + 7: "thorax", + 8: "upper neck", + 9: "head top", + 10: "right wrist", + 11: "right elbow", + 12: "right shoulder", + 13: "left shoulder", + 14: "left elbow", + 15: "left wrist", + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. 
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/mhp.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(TopDownCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + if 'image_thr' in data_cfg: + warnings.warn( + 'image_thr is deprecated, ' + 'please use det_bbox_thr instead', DeprecationWarning) + self.det_bbox_thr = data_cfg['image_thr'] + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db = self._load_coco_keypoint_annotations() + return gt_db + + def _do_python_keypoint_eval(self, res_file): + """Keypoint evaluation using COCOAPI.""" + coco_det = self.coco.loadRes(res_file) + coco_eval = COCOeval( + self.coco, coco_det, 'keypoints', self.sigmas, use_area=False) + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + + info_str = list(zip(stats_names, coco_eval.stats)) + + return info_str diff --git a/mmpose/datasets/datasets/top_down/topdown_mpii_dataset.py b/mmpose/datasets/datasets/top_down/topdown_mpii_dataset.py new file mode 100644 index 0000000..751046a --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_mpii_dataset.py @@ -0,0 +1,275 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path as osp +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv import Config, deprecated_api_warning +from scipy.io import loadmat, savemat + +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class TopDownMpiiDataset(Kpt2dSviewRgbImgTopDownDataset): + """MPII Dataset for top-down pose estimation. + + "2D Human Pose Estimation: New Benchmark and State of the Art Analysis" + ,CVPR'2014. More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + MPII keypoint indexes:: + + 0: 'right_ankle' + 1: 'right_knee', + 2: 'right_hip', + 3: 'left_hip', + 4: 'left_knee', + 5: 'left_ankle', + 6: 'pelvis', + 7: 'thorax', + 8: 'upper_neck', + 9: 'head_top', + 10: 'right_wrist', + 11: 'right_elbow', + 12: 'right_shoulder', + 13: 'left_shoulder', + 14: 'left_elbow', + 15: 'left_wrist' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. 
+ test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/mpii.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + coco_style=False, + test_mode=test_mode) + + self.db = self._get_db() + self.image_set = set(x['image_file'] for x in self.db) + self.num_images = len(self.image_set) + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + # create train/val split + with open(self.ann_file) as anno_file: + anno = json.load(anno_file) + + gt_db = [] + bbox_id = 0 + for a in anno: + image_name = a['image'] + + center = np.array(a['center'], dtype=np.float32) + scale = np.array([a['scale'], a['scale']], dtype=np.float32) + + # Adjust center/scale slightly to avoid cropping limbs + if center[0] != -1: + center[1] = center[1] + 15 * scale[1] + # padding to include proper amount of context + scale = scale * 1.25 + + # MPII uses matlab format, index is 1-based, + # we should first convert to 0-based index + center = center - 1 + + joints_3d = np.zeros((self.ann_info['num_joints'], 3), + dtype=np.float32) + joints_3d_visible = np.zeros((self.ann_info['num_joints'], 3), + dtype=np.float32) + if not self.test_mode: + joints = np.array(a['joints']) + joints_vis = np.array(a['joints_vis']) + assert len(joints) == self.ann_info['num_joints'], \ + f'joint num diff: {len(joints)}' + \ + f' vs {self.ann_info["num_joints"]}' + + joints_3d[:, 0:2] = joints[:, 0:2] - 1 + joints_3d_visible[:, :2] = joints_vis[:, None] + image_file = osp.join(self.img_prefix, image_name) + gt_db.append({ + 'image_file': image_file, + 'bbox_id': bbox_id, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1 + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCKh', **kwargs): + """Evaluate PCKh for MPII dataset. Adapted from + https://github.com/leoxiaobin/deep-high-resolution-net.pytorch + Copyright (c) Microsoft, under the MIT License. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['/val2017/000000\ + 397133.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap. + res_folder (str, optional): The folder to save the testing + results. Default: None. + metric (str | list[str]): Metrics to be performed. + Defaults: 'PCKh'. 
+ + Returns: + dict: PCKh for each joint + """ + + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCKh'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + kpts = [] + for result in results: + preds = result['preds'] + bbox_ids = result['bbox_ids'] + batch_size = len(bbox_ids) + for i in range(batch_size): + kpts.append({'keypoints': preds[i], 'bbox_id': bbox_ids[i]}) + kpts = self._sort_and_unique_bboxes(kpts) + + preds = np.stack([kpt['keypoints'] for kpt in kpts]) + + # convert 0-based index to 1-based index, + # and get the first two dimensions. + preds = preds[..., :2] + 1.0 + + if res_folder: + pred_file = osp.join(res_folder, 'pred.mat') + savemat(pred_file, mdict={'preds': preds}) + + SC_BIAS = 0.6 + threshold = 0.5 + + gt_file = osp.join(osp.dirname(self.ann_file), 'mpii_gt_val.mat') + gt_dict = loadmat(gt_file) + dataset_joints = gt_dict['dataset_joints'] + jnt_missing = gt_dict['jnt_missing'] + pos_gt_src = gt_dict['pos_gt_src'] + headboxes_src = gt_dict['headboxes_src'] + + pos_pred_src = np.transpose(preds, [1, 2, 0]) + + head = np.where(dataset_joints == 'head')[1][0] + lsho = np.where(dataset_joints == 'lsho')[1][0] + lelb = np.where(dataset_joints == 'lelb')[1][0] + lwri = np.where(dataset_joints == 'lwri')[1][0] + lhip = np.where(dataset_joints == 'lhip')[1][0] + lkne = np.where(dataset_joints == 'lkne')[1][0] + lank = np.where(dataset_joints == 'lank')[1][0] + + rsho = np.where(dataset_joints == 'rsho')[1][0] + relb = np.where(dataset_joints == 'relb')[1][0] + rwri = np.where(dataset_joints == 'rwri')[1][0] + rkne = np.where(dataset_joints == 'rkne')[1][0] + rank = np.where(dataset_joints == 'rank')[1][0] + rhip = np.where(dataset_joints == 'rhip')[1][0] + + jnt_visible = 1 - jnt_missing + uv_error = pos_pred_src - pos_gt_src + uv_err = np.linalg.norm(uv_error, axis=1) + headsizes = headboxes_src[1, :, :] - headboxes_src[0, :, :] + headsizes = np.linalg.norm(headsizes, axis=0) + headsizes *= SC_BIAS + scale = headsizes * np.ones((len(uv_err), 1), dtype=np.float32) + scaled_uv_err = uv_err / scale + scaled_uv_err = scaled_uv_err * jnt_visible + jnt_count = np.sum(jnt_visible, axis=1) + less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible + PCKh = 100. * np.sum(less_than_threshold, axis=1) / jnt_count + + # save + rng = np.arange(0, 0.5 + 0.01, 0.01) + pckAll = np.zeros((len(rng), 16), dtype=np.float32) + + for r, threshold in enumerate(rng): + less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible + pckAll[r, :] = 100. 
* np.sum( + less_than_threshold, axis=1) / jnt_count + + PCKh = np.ma.array(PCKh, mask=False) + PCKh.mask[6:8] = True + + jnt_count = np.ma.array(jnt_count, mask=False) + jnt_count.mask[6:8] = True + jnt_ratio = jnt_count / np.sum(jnt_count).astype(np.float64) + + name_value = [('Head', PCKh[head]), + ('Shoulder', 0.5 * (PCKh[lsho] + PCKh[rsho])), + ('Elbow', 0.5 * (PCKh[lelb] + PCKh[relb])), + ('Wrist', 0.5 * (PCKh[lwri] + PCKh[rwri])), + ('Hip', 0.5 * (PCKh[lhip] + PCKh[rhip])), + ('Knee', 0.5 * (PCKh[lkne] + PCKh[rkne])), + ('Ankle', 0.5 * (PCKh[lank] + PCKh[rank])), + ('PCKh', np.sum(PCKh * jnt_ratio)), + ('PCKh@0.1', np.sum(pckAll[10, :] * jnt_ratio))] + name_value = OrderedDict(name_value) + + return name_value + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts diff --git a/mmpose/datasets/datasets/top_down/topdown_mpii_trb_dataset.py b/mmpose/datasets/datasets/top_down/topdown_mpii_trb_dataset.py new file mode 100644 index 0000000..a0da65b --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_mpii_trb_dataset.py @@ -0,0 +1,310 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt2dSviewRgbImgTopDownDataset + + +@DATASETS.register_module() +class TopDownMpiiTrbDataset(Kpt2dSviewRgbImgTopDownDataset): + """MPII-TRB Dataset dataset for top-down pose estimation. + + "TRB: A Novel Triplet Representation for Understanding 2D Human Body", + ICCV'2019. More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + MPII-TRB keypoint indexes:: + + 0: 'left_shoulder' + 1: 'right_shoulder' + 2: 'left_elbow' + 3: 'right_elbow' + 4: 'left_wrist' + 5: 'right_wrist' + 6: 'left_hip' + 7: 'right_hip' + 8: 'left_knee' + 9: 'right_knee' + 10: 'left_ankle' + 11: 'right_ankle' + 12: 'head' + 13: 'neck' + + 14: 'right_neck' + 15: 'left_neck' + 16: 'medial_right_shoulder' + 17: 'lateral_right_shoulder' + 18: 'medial_right_bow' + 19: 'lateral_right_bow' + 20: 'medial_right_wrist' + 21: 'lateral_right_wrist' + 22: 'medial_left_shoulder' + 23: 'lateral_left_shoulder' + 24: 'medial_left_bow' + 25: 'lateral_left_bow' + 26: 'medial_left_wrist' + 27: 'lateral_left_wrist' + 28: 'medial_right_hip' + 29: 'lateral_right_hip' + 30: 'medial_right_knee' + 31: 'lateral_right_knee' + 32: 'medial_right_ankle' + 33: 'lateral_right_ankle' + 34: 'medial_left_hip' + 35: 'lateral_left_hip' + 36: 'medial_left_knee' + 37: 'lateral_left_knee' + 38: 'medial_left_ankle' + 39: 'lateral_left_ankle' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. 
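+
+    Example (an illustrative numpy sketch of the matching rule applied by
+        ``_evaluate_kernel`` below; the headbox and keypoint coordinates
+        are made-up values)::
+
+        >>> import numpy as np
+        >>> headbox = np.array([10., 20., 50., 70.])   # x1, y1, x2, y2
+        >>> threshold = np.linalg.norm(headbox[:2] - headbox[2:]) * 0.3
+        >>> pred_pt = np.array([31., 42.])
+        >>> gt_pt = np.array([30., 40.])
+        >>> hit = np.linalg.norm(pred_pt - gt_pt) < threshold   # True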
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/mpii_trb.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.db = self._get_db(ann_file) + self.image_set = set(x['image_file'] for x in self.db) + self.num_images = len(self.image_set) + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self, ann_file): + """Load dataset.""" + with open(ann_file, 'r') as f: + data = json.load(f) + tmpl = dict( + image_file=None, + bbox_id=None, + center=None, + scale=None, + rotation=0, + joints_3d=None, + joints_3d_visible=None, + dataset=self.dataset_name) + + imid2info = { + int(osp.splitext(x['file_name'])[0]): x + for x in data['images'] + } + + num_joints = self.ann_info['num_joints'] + gt_db = [] + + for anno in data['annotations']: + newitem = cp.deepcopy(tmpl) + image_id = anno['image_id'] + newitem['bbox_id'] = anno['id'] + newitem['image_file'] = osp.join(self.img_prefix, + imid2info[image_id]['file_name']) + + if max(anno['keypoints']) == 0: + continue + + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + for ipt in range(num_joints): + joints_3d[ipt, 0] = anno['keypoints'][ipt * 3 + 0] + joints_3d[ipt, 1] = anno['keypoints'][ipt * 3 + 1] + joints_3d[ipt, 2] = 0 + t_vis = min(anno['keypoints'][ipt * 3 + 2], 1) + joints_3d_visible[ipt, :] = (t_vis, t_vis, 0) + + center = np.array(anno['center'], dtype=np.float32) + scale = self.ann_info['image_size'] / anno['scale'] / 200.0 + newitem['center'] = center + newitem['scale'] = scale + newitem['joints_3d'] = joints_3d + newitem['joints_3d_visible'] = joints_3d_visible + if 'headbox' in anno: + newitem['headbox'] = anno['headbox'] + gt_db.append(newitem) + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + def _evaluate_kernel(self, pred, joints_3d, joints_3d_visible, headbox): + """Evaluate one example.""" + num_joints = self.ann_info['num_joints'] + headbox = np.array(headbox) + threshold = np.linalg.norm(headbox[:2] - headbox[2:]) * 0.3 + hit = np.zeros(num_joints, dtype=np.float32) + exist = np.zeros(num_joints, dtype=np.float32) + + for i in range(num_joints): + pred_pt = pred[i] + gt_pt = joints_3d[i] + vis = joints_3d_visible[i][0] + if vis: + exist[i] = 1 + else: + continue + distance = np.linalg.norm(pred_pt[:2] - gt_pt[:2]) + if distance < threshold: + hit[i] = 1 + return hit, exist + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='PCKh', **kwargs): + """Evaluate PCKh for MPII-TRB dataset. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['/val2017/\ + 000000397133.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap. 
+ - bbox_ids (list[str]): For example, ['27407']. + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metrics to be performed. + Defaults: 'PCKh'. + + Returns: + dict: PCKh for each joint + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCKh'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + res_file = osp.join(res_folder, 'result_keypoints.json') + else: + tmp_folder = tempfile.TemporaryDirectory() + res_file = osp.join(tmp_folder.name, 'result_keypoints.json') + + kpts = [] + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + str_image_path = image_paths[i] + image_id = int(osp.basename(osp.splitext(str_image_path)[0])) + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def _report_metric(self, res_file): + """Keypoint evaluation. + + Report Mean Acc of skeleton, contour and all joints. + """ + num_joints = self.ann_info['num_joints'] + hit = np.zeros(num_joints, dtype=np.float32) + exist = np.zeros(num_joints, dtype=np.float32) + + with open(res_file, 'r') as fin: + preds = json.load(fin) + + assert len(preds) == len( + self.db), f'len(preds)={len(preds)}, len(self.db)={len(self.db)}' + for pred, item in zip(preds, self.db): + h, e = self._evaluate_kernel(pred['keypoints'], item['joints_3d'], + item['joints_3d_visible'], + item['headbox']) + hit += h + exist += e + skeleton = np.sum(hit[:14]) / np.sum(exist[:14]) + contour = np.sum(hit[14:]) / np.sum(exist[14:]) + mean = np.sum(hit) / np.sum(exist) + + info_str = [] + info_str.append(('Skeleton_acc', skeleton.item())) + info_str.append(('Contour_acc', contour.item())) + info_str.append(('PCKh', mean.item())) + return info_str + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts diff --git a/mmpose/datasets/datasets/top_down/topdown_ochuman_dataset.py b/mmpose/datasets/datasets/top_down/topdown_ochuman_dataset.py new file mode 100644 index 0000000..0ad6b81 --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_ochuman_dataset.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv import Config + +from ...builder import DATASETS +from .topdown_coco_dataset import TopDownCocoDataset + + +@DATASETS.register_module() +class TopDownOCHumanDataset(TopDownCocoDataset): + """OChuman dataset for top-down pose estimation. 
+ + "Pose2Seg: Detection Free Human Instance Segmentation", CVPR'2019. + More details can be found in the `paper + `__ . + + "Occluded Human (OCHuman)" dataset contains 8110 heavily occluded + human instances within 4731 images. OCHuman dataset is designed for + validation and testing. To evaluate on OCHuman, the model should be + trained on COCO training set, and then test the robustness of the + model to occlusion using OCHuman. + + OCHuman keypoint indexes (same as COCO):: + + 0: 'nose', + 1: 'left_eye', + 2: 'right_eye', + 3: 'left_ear', + 4: 'right_ear', + 5: 'left_shoulder', + 6: 'right_shoulder', + 7: 'left_elbow', + 8: 'right_elbow', + 9: 'left_wrist', + 10: 'right_wrist', + 11: 'left_hip', + 12: 'right_hip', + 13: 'left_knee', + 14: 'right_knee', + 15: 'left_ankle', + 16: 'right_ankle' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/ochuman.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(TopDownCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db = self._load_coco_keypoint_annotations() + return gt_db diff --git a/mmpose/datasets/datasets/top_down/topdown_posetrack18_dataset.py b/mmpose/datasets/datasets/top_down/topdown_posetrack18_dataset.py new file mode 100644 index 0000000..c690860 --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_posetrack18_dataset.py @@ -0,0 +1,312 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning + +from ....core.post_processing import oks_nms, soft_oks_nms +from ...builder import DATASETS +from .topdown_coco_dataset import TopDownCocoDataset + +try: + from poseval import eval_helpers + from poseval.evaluateAP import evaluateAP + has_poseval = True +except (ImportError, ModuleNotFoundError): + has_poseval = False + + +@DATASETS.register_module() +class TopDownPoseTrack18Dataset(TopDownCocoDataset): + """PoseTrack18 dataset for top-down pose estimation. + + "Posetrack: A benchmark for human pose estimation and tracking", CVPR'2018. + More details can be found in the `paper + `__ . 
+ + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + PoseTrack2018 keypoint indexes:: + + 0: 'nose', + 1: 'head_bottom', + 2: 'head_top', + 3: 'left_ear', + 4: 'right_ear', + 5: 'left_shoulder', + 6: 'right_shoulder', + 7: 'left_elbow', + 8: 'right_elbow', + 9: 'left_wrist', + 10: 'right_wrist', + 11: 'left_hip', + 12: 'right_hip', + 13: 'left_knee', + 14: 'right_knee', + 15: 'left_ankle', + 16: 'right_ankle' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/posetrack18.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super(TopDownCocoDataset, self).__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mAP', **kwargs): + """Evaluate posetrack keypoint results. The pose prediction results + will be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - num_keypoints: K + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['val/010016_mpii_test\ + /000024.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap. + - bbox_id (list(int)) + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'mAP'. + + Returns: + dict: Evaluation results for evaluation metric. 
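+
+        Example (an illustrative numpy sketch of the rescoring step
+            performed in this method before OKS-NMS; the confidences are
+            made-up values)::
+
+            >>> import numpy as np
+            >>> kpt_conf = np.array([0.9, 0.8, 0.1, 0.7])  # per keypoint
+            >>> box_score, vis_thr = 0.95, 0.2
+            >>> valid = kpt_conf > vis_thr
+            >>> kpt_score = kpt_conf[valid].mean() if valid.any() else 0.0
+            >>> final_score = kpt_score * box_score        # 0.8 * 0.95 = 0.76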
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['mAP'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + else: + tmp_folder = tempfile.TemporaryDirectory() + res_folder = tmp_folder.name + + gt_folder = osp.join( + osp.dirname(self.ann_file), + osp.splitext(self.ann_file.split('_')[-1])[0]) + + kpts = defaultdict(list) + + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + kpts[image_id].append({ + 'keypoints': preds[i], + 'center': boxes[i][0:2], + 'scale': boxes[i][2:4], + 'area': boxes[i][4], + 'score': boxes[i][5], + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + # rescoring and oks nms + num_joints = self.ann_info['num_joints'] + vis_thr = self.vis_thr + oks_thr = self.oks_thr + valid_kpts = defaultdict(list) + for image_id in kpts.keys(): + img_kpts = kpts[image_id] + for n_p in img_kpts: + box_score = n_p['score'] + kpt_score = 0 + valid_num = 0 + for n_jt in range(0, num_joints): + t_s = n_p['keypoints'][n_jt][2] + if t_s > vis_thr: + kpt_score = kpt_score + t_s + valid_num = valid_num + 1 + if valid_num != 0: + kpt_score = kpt_score / valid_num + # rescoring + n_p['score'] = kpt_score * box_score + + if self.use_nms: + nms = soft_oks_nms if self.soft_nms else oks_nms + keep = nms(img_kpts, oks_thr, sigmas=self.sigmas) + valid_kpts[image_id].append( + [img_kpts[_keep] for _keep in keep]) + else: + valid_kpts[image_id].append(img_kpts) + + self._write_posetrack18_keypoint_results(valid_kpts, gt_folder, + res_folder) + + info_str = self._do_python_keypoint_eval(gt_folder, res_folder) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + @staticmethod + def _write_posetrack18_keypoint_results(keypoint_results, gt_folder, + pred_folder): + """Write results into a json file. + + Args: + keypoint_results (dict): keypoint results organized by image_id. + gt_folder (str): Path of directory for official gt files. + pred_folder (str): Path of directory to save the results. 
+ """ + categories = [] + + cat = {} + cat['supercategory'] = 'person' + cat['id'] = 1 + cat['name'] = 'person' + cat['keypoints'] = [ + 'nose', 'head_bottom', 'head_top', 'left_ear', 'right_ear', + 'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow', + 'left_wrist', 'right_wrist', 'left_hip', 'right_hip', 'left_knee', + 'right_knee', 'left_ankle', 'right_ankle' + ] + cat['skeleton'] = [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], + [6, 12], [7, 13], [6, 7], [6, 8], [7, 9], [8, 10], + [9, 11], [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], + [4, 6], [5, 7]] + categories.append(cat) + + json_files = [ + pos for pos in os.listdir(gt_folder) if pos.endswith('.json') + ] + for json_file in json_files: + + with open(osp.join(gt_folder, json_file), 'r') as f: + gt = json.load(f) + + annotations = [] + images = [] + + for image in gt['images']: + im = {} + im['id'] = image['id'] + im['file_name'] = image['file_name'] + images.append(im) + + img_kpts = keypoint_results[im['id']] + + if len(img_kpts) == 0: + continue + for track_id, img_kpt in enumerate(img_kpts[0]): + ann = {} + ann['image_id'] = img_kpt['image_id'] + ann['keypoints'] = np.array( + img_kpt['keypoints']).reshape(-1).tolist() + ann['scores'] = np.array(ann['keypoints']).reshape( + [-1, 3])[:, 2].tolist() + ann['score'] = float(img_kpt['score']) + ann['track_id'] = track_id + annotations.append(ann) + + info = {} + info['images'] = images + info['categories'] = categories + info['annotations'] = annotations + + with open(osp.join(pred_folder, json_file), 'w') as f: + json.dump(info, f, sort_keys=True, indent=4) + + def _do_python_keypoint_eval(self, gt_folder, pred_folder): + """Keypoint evaluation using poseval.""" + + if not has_poseval: + raise ImportError('Please install poseval package for evaluation' + 'on PoseTrack dataset ' + '(see requirements/optional.txt)') + + argv = ['', gt_folder + '/', pred_folder + '/'] + + print('Loading data') + gtFramesAll, prFramesAll = eval_helpers.load_data_dir(argv) + + print('# gt frames :', len(gtFramesAll)) + print('# pred frames:', len(prFramesAll)) + + # evaluate per-frame multi-person pose estimation (AP) + # compute AP + print('Evaluation of per-frame multi-person pose estimation') + apAll, _, _ = evaluateAP(gtFramesAll, prFramesAll, None, False, False) + + # print AP + print('Average Precision (AP) metric:') + eval_helpers.printTable(apAll) + + stats = eval_helpers.getCum(apAll) + + stats_names = [ + 'Head AP', 'Shou AP', 'Elb AP', 'Wri AP', 'Hip AP', 'Knee AP', + 'Ankl AP', 'Total AP' + ] + + info_str = list(zip(stats_names, stats)) + + return info_str diff --git a/mmpose/datasets/datasets/top_down/topdown_posetrack18_video_dataset.py b/mmpose/datasets/datasets/top_down/topdown_posetrack18_video_dataset.py new file mode 100644 index 0000000..045148d --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_posetrack18_video_dataset.py @@ -0,0 +1,549 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict + +import json_tricks as json +import numpy as np +from mmcv import deprecated_api_warning + +from ....core.post_processing import oks_nms, soft_oks_nms +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbVidTopDownDataset + +try: + from poseval import eval_helpers + from poseval.evaluateAP import evaluateAP + has_poseval = True +except (ImportError, ModuleNotFoundError): + has_poseval = False + + +@DATASETS.register_module() +class TopDownPoseTrack18VideoDataset(Kpt2dSviewRgbVidTopDownDataset): + """PoseTrack18 dataset for top-down pose estimation. + + "Posetrack: A benchmark for human pose estimation and tracking", CVPR'2018. + More details can be found in the `paper + `__ . + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + PoseTrack2018 keypoint indexes:: + + 0: 'nose', + 1: 'head_bottom', + 2: 'head_top', + 3: 'left_ear', + 4: 'right_ear', + 5: 'left_shoulder', + 6: 'right_shoulder', + 7: 'left_elbow', + 8: 'right_elbow', + 9: 'left_wrist', + 10: 'right_wrist', + 11: 'left_hip', + 12: 'right_hip', + 13: 'left_knee', + 14: 'right_knee', + 15: 'left_ankle', + 16: 'right_ankle' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where videos/images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + ph_fill_len (int): The length of the placeholder to fill in the + image filenames, default: 6 in PoseTrack18. 
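+
+    Example (a minimal sketch of the extra temporal keys read in
+        ``__init__`` below, on top of the usual top-down ``data_cfg``
+        entries; the values shown are placeholders)::
+
+        >>> extra_cfg = dict(
+        ...     frame_weight_train=(0.0, 1.0),
+        ...     frame_weight_test=(0.0, 1.0),
+        ...     frame_index_rand=True,
+        ...     frame_index_range=[-2, 2],
+        ...     num_adj_frames=1,
+        ...     frame_indices_train=None,
+        ...     frame_indices_test=[-2, -1, 0, 1, 2])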
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False, + ph_fill_len=6): + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + self.frame_weight_train = data_cfg['frame_weight_train'] + self.frame_weight_test = data_cfg['frame_weight_test'] + self.frame_weight = self.frame_weight_test \ + if self.test_mode else self.frame_weight_train + + self.ph_fill_len = ph_fill_len + + # select the frame indices + self.frame_index_rand = data_cfg.get('frame_index_rand', True) + self.frame_index_range = data_cfg.get('frame_index_range', [-2, 2]) + self.num_adj_frames = data_cfg.get('num_adj_frames', 1) + self.frame_indices_train = data_cfg.get('frame_indices_train', None) + self.frame_indices_test = data_cfg.get('frame_indices_test', + [-2, -1, 0, 1, 2]) + + if self.frame_indices_train is not None: + self.frame_indices_train.sort() + self.frame_indices_test.sort() + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + if (not self.test_mode) or self.use_gt_bbox: + # use ground truth bbox + gt_db = self._load_coco_keypoint_annotations() + else: + # use bbox from detection + gt_db = self._load_posetrack_person_detection_results() + return gt_db + + def _load_coco_keypoint_annotations(self): + """Ground truth bbox and keypoints.""" + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. 
+ + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = self.ann_info['num_joints'] + + file_name = img_ann['file_name'] + nframes = int(img_ann['nframes']) + frame_id = int(img_ann['frame_id']) + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_files = [] + cur_image_file = osp.join(self.img_prefix, self.id2name[img_id]) + image_files.append(cur_image_file) + + # "images/val/012834_mpii_test/000000.jpg" -->> "000000.jpg" + cur_image_name = file_name.split('/')[-1] + ref_idx = int(cur_image_name.replace('.jpg', '')) + + # select the frame indices + if not self.test_mode and self.frame_indices_train is not None: + indices = self.frame_indices_train + elif not self.test_mode and self.frame_index_rand: + low, high = self.frame_index_range + indices = np.random.randint(low, high + 1, self.num_adj_frames) + else: + indices = self.frame_indices_test + + for index in indices: + if self.test_mode and index == 0: + continue + # the supporting frame index + support_idx = ref_idx + index + support_idx = np.clip(support_idx, 0, nframes - 1) + sup_image_file = cur_image_file.replace( + cur_image_name, + str(support_idx).zfill(self.ph_fill_len) + '.jpg') + + if osp.exists(sup_image_file): + image_files.append(sup_image_file) + else: + warnings.warn( + f'{sup_image_file} does not exist, ' + f'use {cur_image_file} instead.', UserWarning) + image_files.append(cur_image_file) + rec.append({ + 'image_file': image_files, + 'center': center, + 'scale': scale, + 'bbox': obj['clean_bbox'][:4], + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': bbox_id, + 'nframes': nframes, + 'frame_id': frame_id, + 'frame_weight': self.frame_weight + }) + bbox_id = bbox_id + 1 + + return rec + + def _load_posetrack_person_detection_results(self): + """Load Posetrack person detection results. + + Only in test mode. + """ + num_joints = self.ann_info['num_joints'] + all_boxes = None + with open(self.bbox_file, 'r') as f: + all_boxes = json.load(f) + + if not all_boxes: + raise ValueError('=> Load %s fail!' 
% self.bbox_file) + + print(f'=> Total boxes: {len(all_boxes)}') + + kpt_db = [] + bbox_id = 0 + for det_res in all_boxes: + if det_res['category_id'] != 1: + continue + + score = det_res['score'] + if score < self.det_bbox_thr: + continue + + box = det_res['bbox'] + + # deal with different bbox file formats + if 'nframes' in det_res and 'frame_id' in det_res: + nframes = int(det_res['nframes']) + frame_id = int(det_res['frame_id']) + elif 'image_name' in det_res: + img_id = self.name2id[det_res['image_name']] + img_ann = self.coco.loadImgs(img_id)[0] + nframes = int(img_ann['nframes']) + frame_id = int(img_ann['frame_id']) + else: + img_id = det_res['image_id'] + img_ann = self.coco.loadImgs(img_id)[0] + nframes = int(img_ann['nframes']) + frame_id = int(img_ann['frame_id']) + + image_files = [] + if 'image_name' in det_res: + file_name = det_res['image_name'] + else: + file_name = self.id2name[det_res['image_id']] + + cur_image_file = osp.join(self.img_prefix, file_name) + image_files.append(cur_image_file) + + # "images/val/012834_mpii_test/000000.jpg" -->> "000000.jpg" + cur_image_name = file_name.split('/')[-1] + ref_idx = int(cur_image_name.replace('.jpg', '')) + + indices = self.frame_indices_test + for index in indices: + if self.test_mode and index == 0: + continue + # the supporting frame index + support_idx = ref_idx + index + support_idx = np.clip(support_idx, 0, nframes - 1) + sup_image_file = cur_image_file.replace( + cur_image_name, + str(support_idx).zfill(self.ph_fill_len) + '.jpg') + + if osp.exists(sup_image_file): + image_files.append(sup_image_file) + else: + warnings.warn(f'{sup_image_file} does not exist, ' + f'use {cur_image_file} instead.') + image_files.append(cur_image_file) + + center, scale = self._xywh2cs(*box[:4]) + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.ones((num_joints, 3), dtype=np.float32) + kpt_db.append({ + 'image_file': image_files, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'bbox': box[:4], + 'bbox_score': score, + 'dataset': self.dataset_name, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'bbox_id': bbox_id, + 'nframes': nframes, + 'frame_id': frame_id, + 'frame_weight': self.frame_weight + }) + bbox_id = bbox_id + 1 + print(f'=> Total boxes after filter ' + f'low score@{self.det_bbox_thr}: {bbox_id}') + return kpt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='mAP', **kwargs): + """Evaluate posetrack keypoint results. The pose prediction results + will be saved in ``${res_folder}/result_keypoints.json``. + + Note: + - num_keypoints: K + + Args: + results (list[dict]): Testing results containing the following + items: + + - preds (np.ndarray[N,K,3]): The first two dimensions are \ + coordinates, score is the third dimension of the array. + - boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \ + scale[1],area, score] + - image_paths (list[str]): For example, ['val/010016_mpii_test\ + /000024.jpg'] + - heatmap (np.ndarray[N, K, H, W]): model output heatmap. + - bbox_id (list(int)) + res_folder (str, optional): The folder to save the testing + results. If not specified, a temp folder will be created. + Default: None. + metric (str | list[str]): Metric to be performed. Defaults: 'mAP'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['mAP'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + if res_folder is not None: + tmp_folder = None + else: + tmp_folder = tempfile.TemporaryDirectory() + res_folder = tmp_folder.name + + gt_folder = osp.join( + osp.dirname(self.ann_file), + osp.splitext(self.ann_file.split('_')[-1])[0]) + + kpts = defaultdict(list) + + for result in results: + preds = result['preds'] + boxes = result['boxes'] + image_paths = result['image_paths'] + bbox_ids = result['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + if not isinstance(image_paths[i], list): + image_id = self.name2id[image_paths[i] + [len(self.img_prefix):]] + else: + image_id = self.name2id[image_paths[i][0] + [len(self.img_prefix):]] + + kpts[image_id].append({ + 'keypoints': preds[i], + 'center': boxes[i][0:2], + 'scale': boxes[i][2:4], + 'area': boxes[i][4], + 'score': boxes[i][5], + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + # rescoring and oks nms + num_joints = self.ann_info['num_joints'] + vis_thr = self.vis_thr + oks_thr = self.oks_thr + valid_kpts = defaultdict(list) + for image_id in kpts.keys(): + img_kpts = kpts[image_id] + for n_p in img_kpts: + box_score = n_p['score'] + kpt_score = 0 + valid_num = 0 + for n_jt in range(0, num_joints): + t_s = n_p['keypoints'][n_jt][2] + if t_s > vis_thr: + kpt_score = kpt_score + t_s + valid_num = valid_num + 1 + if valid_num != 0: + kpt_score = kpt_score / valid_num + # rescoring + n_p['score'] = kpt_score * box_score + + if self.use_nms: + nms = soft_oks_nms if self.soft_nms else oks_nms + keep = nms(img_kpts, oks_thr, sigmas=self.sigmas) + valid_kpts[image_id].append( + [img_kpts[_keep] for _keep in keep]) + else: + valid_kpts[image_id].append(img_kpts) + + self._write_keypoint_results(valid_kpts, gt_folder, res_folder) + + info_str = self._do_keypoint_eval(gt_folder, res_folder) + name_value = OrderedDict(info_str) + + if tmp_folder is not None: + tmp_folder.cleanup() + + return name_value + + @staticmethod + def _write_keypoint_results(keypoint_results, gt_folder, pred_folder): + """Write results into a json file. + + Args: + keypoint_results (dict): keypoint results organized by image_id. + gt_folder (str): Path of directory for official gt files. + pred_folder (str): Path of directory to save the results. 
+ """ + categories = [] + + cat = {} + cat['supercategory'] = 'person' + cat['id'] = 1 + cat['name'] = 'person' + cat['keypoints'] = [ + 'nose', 'head_bottom', 'head_top', 'left_ear', 'right_ear', + 'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow', + 'left_wrist', 'right_wrist', 'left_hip', 'right_hip', 'left_knee', + 'right_knee', 'left_ankle', 'right_ankle' + ] + cat['skeleton'] = [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], + [6, 12], [7, 13], [6, 7], [6, 8], [7, 9], [8, 10], + [9, 11], [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], + [4, 6], [5, 7]] + categories.append(cat) + + json_files = [ + pos for pos in os.listdir(gt_folder) if pos.endswith('.json') + ] + for json_file in json_files: + + with open(osp.join(gt_folder, json_file), 'r') as f: + gt = json.load(f) + + annotations = [] + images = [] + + for image in gt['images']: + im = {} + im['id'] = image['id'] + im['file_name'] = image['file_name'] + images.append(im) + + img_kpts = keypoint_results[im['id']] + + if len(img_kpts) == 0: + continue + for track_id, img_kpt in enumerate(img_kpts[0]): + ann = {} + ann['image_id'] = img_kpt['image_id'] + ann['keypoints'] = np.array( + img_kpt['keypoints']).reshape(-1).tolist() + ann['scores'] = np.array(ann['keypoints']).reshape( + [-1, 3])[:, 2].tolist() + ann['score'] = float(img_kpt['score']) + ann['track_id'] = track_id + annotations.append(ann) + + info = {} + info['images'] = images + info['categories'] = categories + info['annotations'] = annotations + + with open(osp.join(pred_folder, json_file), 'w') as f: + json.dump(info, f, sort_keys=True, indent=4) + + def _do_keypoint_eval(self, gt_folder, pred_folder): + """Keypoint evaluation using poseval.""" + + if not has_poseval: + raise ImportError('Please install poseval package for evaluation' + 'on PoseTrack dataset ' + '(see requirements/optional.txt)') + + argv = ['', gt_folder + '/', pred_folder + '/'] + + print('Loading data') + gtFramesAll, prFramesAll = eval_helpers.load_data_dir(argv) + + print('# gt frames :', len(gtFramesAll)) + print('# pred frames:', len(prFramesAll)) + + # evaluate per-frame multi-person pose estimation (AP) + # compute AP + print('Evaluation of per-frame multi-person pose estimation') + apAll, _, _ = evaluateAP(gtFramesAll, prFramesAll, None, False, False) + + # print AP + print('Average Precision (AP) metric:') + eval_helpers.printTable(apAll) + + stats = eval_helpers.getCum(apAll) + + stats_names = [ + 'Head AP', 'Shou AP', 'Elb AP', 'Wri AP', 'Hip AP', 'Knee AP', + 'Ankl AP', 'Total AP' + ] + + info_str = list(zip(stats_names, stats)) + + return info_str diff --git a/mmpose/datasets/pipelines/__init__.py b/mmpose/datasets/pipelines/__init__.py new file mode 100644 index 0000000..cf06db1 --- /dev/null +++ b/mmpose/datasets/pipelines/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bottom_up_transform import * # noqa +from .hand_transform import * # noqa +from .loading import LoadImageFromFile # noqa +from .mesh_transform import * # noqa +from .pose3d_transform import * # noqa +from .shared_transform import * # noqa +from .top_down_transform import * # noqa diff --git a/mmpose/datasets/pipelines/bottom_up_transform.py b/mmpose/datasets/pipelines/bottom_up_transform.py new file mode 100644 index 0000000..032ce45 --- /dev/null +++ b/mmpose/datasets/pipelines/bottom_up_transform.py @@ -0,0 +1,816 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import cv2 +import numpy as np + +from mmpose.core.post_processing import (get_affine_transform, get_warp_matrix, + warp_affine_joints) +from mmpose.datasets.builder import PIPELINES +from .shared_transform import Compose + + +def _ceil_to_multiples_of(x, base=64): + """Transform x to the integral multiple of the base.""" + return int(np.ceil(x / base)) * base + + +def _get_multi_scale_size(image, + input_size, + current_scale, + min_scale, + use_udp=False): + """Get the size for multi-scale training. + + Args: + image: Input image. + input_size (np.ndarray[2]): Size (w, h) of the image input. + current_scale (float): Scale factor. + min_scale (float): Minimal scale. + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + + Returns: + tuple: A tuple containing multi-scale sizes. + + - (w_resized, h_resized) (tuple(int)): resized width/height + - center (np.ndarray)image center + - scale (np.ndarray): scales wrt width/height + """ + assert len(input_size) == 2 + h, w, _ = image.shape + + # calculate the size for min_scale + min_input_w = _ceil_to_multiples_of(min_scale * input_size[0], 64) + min_input_h = _ceil_to_multiples_of(min_scale * input_size[1], 64) + if w < h: + w_resized = int(min_input_w * current_scale / min_scale) + h_resized = int( + _ceil_to_multiples_of(min_input_w / w * h, 64) * current_scale / + min_scale) + if use_udp: + scale_w = w - 1.0 + scale_h = (h_resized - 1.0) / (w_resized - 1.0) * (w - 1.0) + else: + scale_w = w / 200.0 + scale_h = h_resized / w_resized * w / 200.0 + else: + h_resized = int(min_input_h * current_scale / min_scale) + w_resized = int( + _ceil_to_multiples_of(min_input_h / h * w, 64) * current_scale / + min_scale) + if use_udp: + scale_h = h - 1.0 + scale_w = (w_resized - 1.0) / (h_resized - 1.0) * (h - 1.0) + else: + scale_h = h / 200.0 + scale_w = w_resized / h_resized * h / 200.0 + if use_udp: + center = (scale_w / 2.0, scale_h / 2.0) + else: + center = np.array([round(w / 2.0), round(h / 2.0)]) + return (w_resized, h_resized), center, np.array([scale_w, scale_h]) + + +def _resize_align_multi_scale(image, input_size, current_scale, min_scale): + """Resize the images for multi-scale training. + + Args: + image: Input image + input_size (np.ndarray[2]): Size (w, h) of the image input + current_scale (float): Current scale + min_scale (float): Minimal scale + + Returns: + tuple: A tuple containing image info. + + - image_resized (np.ndarray): resized image + - center (np.ndarray): center of image + - scale (np.ndarray): scale + """ + assert len(input_size) == 2 + size_resized, center, scale = _get_multi_scale_size( + image, input_size, current_scale, min_scale) + + trans = get_affine_transform(center, scale, 0, size_resized) + image_resized = cv2.warpAffine(image, trans, size_resized) + + return image_resized, center, scale + + +def _resize_align_multi_scale_udp(image, input_size, current_scale, min_scale): + """Resize the images for multi-scale training. + + Args: + image: Input image + input_size (np.ndarray[2]): Size (w, h) of the image input + current_scale (float): Current scale + min_scale (float): Minimal scale + + Returns: + tuple: A tuple containing image info. 
+ + - image_resized (np.ndarray): resized image + - center (np.ndarray): center of image + - scale (np.ndarray): scale + """ + assert len(input_size) == 2 + size_resized, _, _ = _get_multi_scale_size(image, input_size, + current_scale, min_scale, True) + + _, center, scale = _get_multi_scale_size(image, input_size, min_scale, + min_scale, True) + + trans = get_warp_matrix( + theta=0, + size_input=np.array(scale, dtype=np.float32), + size_dst=np.array(size_resized, dtype=np.float32) - 1.0, + size_target=np.array(scale, dtype=np.float32)) + image_resized = cv2.warpAffine( + image.copy(), trans, size_resized, flags=cv2.INTER_LINEAR) + + return image_resized, center, scale + + +class HeatmapGenerator: + """Generate heatmaps for bottom-up models. + + Args: + num_joints (int): Number of keypoints + output_size (np.ndarray): Size (w, h) of feature map + sigma (int): Sigma of the heatmaps. + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + """ + + def __init__(self, output_size, num_joints, sigma=-1, use_udp=False): + if not isinstance(output_size, np.ndarray): + output_size = np.array(output_size) + if output_size.size > 1: + assert len(output_size) == 2 + self.output_size = output_size + else: + self.output_size = np.array([output_size, output_size], + dtype=np.int) + self.num_joints = num_joints + if sigma < 0: + sigma = self.output_size.prod()**0.5 / 64 + self.sigma = sigma + size = 6 * sigma + 3 + self.use_udp = use_udp + if use_udp: + self.x = np.arange(0, size, 1, np.float32) + self.y = self.x[:, None] + else: + x = np.arange(0, size, 1, np.float32) + y = x[:, None] + x0, y0 = 3 * sigma + 1, 3 * sigma + 1 + self.g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)) + + def __call__(self, joints): + """Generate heatmaps.""" + hms = np.zeros( + (self.num_joints, self.output_size[1], self.output_size[0]), + dtype=np.float32) + + sigma = self.sigma + for p in joints: + for idx, pt in enumerate(p): + if pt[2] > 0: + x, y = int(pt[0]), int(pt[1]) + if x < 0 or y < 0 or \ + x >= self.output_size[0] or y >= self.output_size[1]: + continue + + if self.use_udp: + x0 = 3 * sigma + 1 + pt[0] - x + y0 = 3 * sigma + 1 + pt[1] - y + g = np.exp(-((self.x - x0)**2 + (self.y - y0)**2) / + (2 * sigma**2)) + else: + g = self.g + + ul = int(np.round(x - 3 * sigma - + 1)), int(np.round(y - 3 * sigma - 1)) + br = int(np.round(x + 3 * sigma + + 2)), int(np.round(y + 3 * sigma + 2)) + + c, d = max(0, + -ul[0]), min(br[0], self.output_size[0]) - ul[0] + a, b = max(0, + -ul[1]), min(br[1], self.output_size[1]) - ul[1] + + cc, dd = max(0, ul[0]), min(br[0], self.output_size[0]) + aa, bb = max(0, ul[1]), min(br[1], self.output_size[1]) + hms[idx, aa:bb, + cc:dd] = np.maximum(hms[idx, aa:bb, cc:dd], g[a:b, + c:d]) + return hms + + +class JointsEncoder: + """Encodes the visible joints into (coordinates, score); The coordinate of + one joint and its score are of `int` type. + + (idx * output_size**2 + y * output_size + x, 1) or (0, 0). + + Args: + max_num_people(int): Max number of people in an image + num_joints(int): Number of keypoints + output_size(np.ndarray): Size (w, h) of feature map + tag_per_joint(bool): Option to use one tag map per joint. 
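+
+ Example (values are illustrative)::
+
+ With ``output_size=(128, 128)`` and ``tag_per_joint=True``, a visible
+ joint with index ``idx=3`` located at feature-map pixel ``(x=10, y=20)``
+ is encoded as ``(3 * 128 * 128 + 20 * 128 + 10, 1) = (51722, 1)``.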
+ """ + + def __init__(self, max_num_people, num_joints, output_size, tag_per_joint): + self.max_num_people = max_num_people + self.num_joints = num_joints + if not isinstance(output_size, np.ndarray): + output_size = np.array(output_size) + if output_size.size > 1: + assert len(output_size) == 2 + self.output_size = output_size + else: + self.output_size = np.array([output_size, output_size], + dtype=np.int) + self.tag_per_joint = tag_per_joint + + def __call__(self, joints): + """ + Note: + - number of people in image: N + - number of keypoints: K + - max number of people in an image: M + + Args: + joints (np.ndarray[N,K,3]) + + Returns: + visible_kpts (np.ndarray[M,K,2]). + """ + visible_kpts = np.zeros((self.max_num_people, self.num_joints, 2), + dtype=np.float32) + for i in range(len(joints)): + tot = 0 + for idx, pt in enumerate(joints[i]): + x, y = int(pt[0]), int(pt[1]) + if (pt[2] > 0 and 0 <= y < self.output_size[1] + and 0 <= x < self.output_size[0]): + if self.tag_per_joint: + visible_kpts[i][tot] = \ + (idx * self.output_size.prod() + + y * self.output_size[0] + x, 1) + else: + visible_kpts[i][tot] = (y * self.output_size[0] + x, 1) + tot += 1 + return visible_kpts + + +class PAFGenerator: + """Generate part affinity fields. + + Args: + output_size (np.ndarray): Size (w, h) of feature map. + limb_width (int): Limb width of part affinity fields. + skeleton (list[list]): connections of joints. + """ + + def __init__(self, output_size, limb_width, skeleton): + if not isinstance(output_size, np.ndarray): + output_size = np.array(output_size) + if output_size.size > 1: + assert len(output_size) == 2 + self.output_size = output_size + else: + self.output_size = np.array([output_size, output_size], + dtype=np.int) + self.limb_width = limb_width + self.skeleton = skeleton + + def _accumulate_paf_map_(self, pafs, src, dst, count): + """Accumulate part affinity fields between two given joints. + + Args: + pafs (np.ndarray[2,H,W]): paf maps (2 dimensions:x axis and + y axis) for a certain limb connection. This argument will + be modified inplace. + src (np.ndarray[2,]): coordinates of the source joint. + dst (np.ndarray[2,]): coordinates of the destination joint. + count (np.ndarray[H,W]): count map that preserves the number + of non-zero vectors at each point. This argument will be + modified inplace. 
+ """ + limb_vec = dst - src + norm = np.linalg.norm(limb_vec) + if norm == 0: + unit_limb_vec = np.zeros(2) + else: + unit_limb_vec = limb_vec / norm + + min_x = max(np.floor(min(src[0], dst[0]) - self.limb_width), 0) + max_x = min( + np.ceil(max(src[0], dst[0]) + self.limb_width), + self.output_size[0] - 1) + min_y = max(np.floor(min(src[1], dst[1]) - self.limb_width), 0) + max_y = min( + np.ceil(max(src[1], dst[1]) + self.limb_width), + self.output_size[1] - 1) + + range_x = list(range(int(min_x), int(max_x + 1), 1)) + range_y = list(range(int(min_y), int(max_y + 1), 1)) + + mask = np.zeros_like(count, dtype=bool) + if len(range_x) > 0 and len(range_y) > 0: + xx, yy = np.meshgrid(range_x, range_y) + delta_x = xx - src[0] + delta_y = yy - src[1] + dist = np.abs(delta_x * unit_limb_vec[1] - + delta_y * unit_limb_vec[0]) + mask_local = (dist < self.limb_width) + mask[yy, xx] = mask_local + + pafs[0, mask] += unit_limb_vec[0] + pafs[1, mask] += unit_limb_vec[1] + count += mask + + return pafs, count + + def __call__(self, joints): + """Generate the target part affinity fields.""" + pafs = np.zeros( + (len(self.skeleton) * 2, self.output_size[1], self.output_size[0]), + dtype=np.float32) + + for idx, sk in enumerate(self.skeleton): + count = np.zeros((self.output_size[1], self.output_size[0]), + dtype=np.float32) + + for p in joints: + src = p[sk[0]] + dst = p[sk[1]] + if src[2] > 0 and dst[2] > 0: + self._accumulate_paf_map_(pafs[2 * idx:2 * idx + 2], + src[:2], dst[:2], count) + + pafs[2 * idx:2 * idx + 2] /= np.maximum(count, 1) + + return pafs + + +@PIPELINES.register_module() +class BottomUpRandomFlip: + """Data augmentation with random image flip for bottom-up. + + Args: + flip_prob (float): Probability of flip. + """ + + def __init__(self, flip_prob=0.5): + self.flip_prob = flip_prob + + def __call__(self, results): + """Perform data augmentation with random image flip.""" + image, mask, joints = results['img'], results['mask'], results[ + 'joints'] + self.flip_index = results['ann_info']['flip_index'] + self.output_size = results['ann_info']['heatmap_size'] + + assert isinstance(mask, list) + assert isinstance(joints, list) + assert len(mask) == len(joints) + assert len(mask) == len(self.output_size) + + if np.random.random() < self.flip_prob: + image = image[:, ::-1].copy() - np.zeros_like(image) + for i, _output_size in enumerate(self.output_size): + if not isinstance(_output_size, np.ndarray): + _output_size = np.array(_output_size) + if _output_size.size > 1: + assert len(_output_size) == 2 + else: + _output_size = np.array([_output_size, _output_size], + dtype=np.int) + mask[i] = mask[i][:, ::-1].copy() + joints[i] = joints[i][:, self.flip_index] + joints[i][:, :, 0] = _output_size[0] - joints[i][:, :, 0] - 1 + results['img'], results['mask'], results[ + 'joints'] = image, mask, joints + return results + + +@PIPELINES.register_module() +class BottomUpRandomAffine: + """Data augmentation with random scaling & rotating. + + Args: + rot_factor (int): Rotating to [-rotation_factor, rotation_factor] + scale_factor (float): Scaling to [1-scale_factor, 1+scale_factor] + scale_type: wrt ``long`` or ``short`` length of the image. + trans_factor: Translation factor. + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). 
+ """ + + def __init__(self, + rot_factor, + scale_factor, + scale_type, + trans_factor, + use_udp=False): + self.max_rotation = rot_factor + self.min_scale = scale_factor[0] + self.max_scale = scale_factor[1] + self.scale_type = scale_type + self.trans_factor = trans_factor + self.use_udp = use_udp + + def _get_scale(self, image_size, resized_size): + w, h = image_size + w_resized, h_resized = resized_size + if w / w_resized < h / h_resized: + if self.scale_type == 'long': + w_pad = h / h_resized * w_resized + h_pad = h + elif self.scale_type == 'short': + w_pad = w + h_pad = w / w_resized * h_resized + else: + raise ValueError(f'Unknown scale type: {self.scale_type}') + else: + if self.scale_type == 'long': + w_pad = w + h_pad = w / w_resized * h_resized + elif self.scale_type == 'short': + w_pad = h / h_resized * w_resized + h_pad = h + else: + raise ValueError(f'Unknown scale type: {self.scale_type}') + + scale = np.array([w_pad, h_pad], dtype=np.float32) + + return scale + + def __call__(self, results): + """Perform data augmentation with random scaling & rotating.""" + image, mask, joints = results['img'], results['mask'], results[ + 'joints'] + + self.input_size = results['ann_info']['image_size'] + if not isinstance(self.input_size, np.ndarray): + self.input_size = np.array(self.input_size) + if self.input_size.size > 1: + assert len(self.input_size) == 2 + else: + self.input_size = [self.input_size, self.input_size] + self.output_size = results['ann_info']['heatmap_size'] + + assert isinstance(mask, list) + assert isinstance(joints, list) + assert len(mask) == len(joints) + assert len(mask) == len(self.output_size), (len(mask), + len(self.output_size), + self.output_size) + + height, width = image.shape[:2] + if self.use_udp: + center = np.array(((width - 1.0) / 2, (height - 1.0) / 2)) + else: + center = np.array((width / 2, height / 2)) + + img_scale = np.array([width, height], dtype=np.float32) + aug_scale = np.random.random() * (self.max_scale - self.min_scale) \ + + self.min_scale + img_scale *= aug_scale + aug_rot = (np.random.random() * 2 - 1) * self.max_rotation + + if self.trans_factor > 0: + dx = np.random.randint(-self.trans_factor * img_scale[0] / 200.0, + self.trans_factor * img_scale[0] / 200.0) + dy = np.random.randint(-self.trans_factor * img_scale[1] / 200.0, + self.trans_factor * img_scale[1] / 200.0) + + center[0] += dx + center[1] += dy + if self.use_udp: + for i, _output_size in enumerate(self.output_size): + if not isinstance(_output_size, np.ndarray): + _output_size = np.array(_output_size) + if _output_size.size > 1: + assert len(_output_size) == 2 + else: + _output_size = [_output_size, _output_size] + + scale = self._get_scale(img_scale, _output_size) + + trans = get_warp_matrix( + theta=aug_rot, + size_input=center * 2.0, + size_dst=np.array( + (_output_size[0], _output_size[1]), dtype=np.float32) - + 1.0, + size_target=scale) + mask[i] = cv2.warpAffine( + (mask[i] * 255).astype(np.uint8), + trans, (int(_output_size[0]), int(_output_size[1])), + flags=cv2.INTER_LINEAR) / 255 + mask[i] = (mask[i] > 0.5).astype(np.float32) + joints[i][:, :, 0:2] = \ + warp_affine_joints(joints[i][:, :, 0:2].copy(), trans) + if results['ann_info']['scale_aware_sigma']: + joints[i][:, :, 3] = joints[i][:, :, 3] / aug_scale + scale = self._get_scale(img_scale, self.input_size) + mat_input = get_warp_matrix( + theta=aug_rot, + size_input=center * 2.0, + size_dst=np.array((self.input_size[0], self.input_size[1]), + dtype=np.float32) - 1.0, + size_target=scale) + image = 
cv2.warpAffine( + image, + mat_input, (int(self.input_size[0]), int(self.input_size[1])), + flags=cv2.INTER_LINEAR) + else: + for i, _output_size in enumerate(self.output_size): + if not isinstance(_output_size, np.ndarray): + _output_size = np.array(_output_size) + if _output_size.size > 1: + assert len(_output_size) == 2 + else: + _output_size = [_output_size, _output_size] + scale = self._get_scale(img_scale, _output_size) + mat_output = get_affine_transform( + center=center, + scale=scale / 200.0, + rot=aug_rot, + output_size=_output_size) + mask[i] = cv2.warpAffine( + (mask[i] * 255).astype(np.uint8), mat_output, + (int(_output_size[0]), int(_output_size[1]))) / 255 + mask[i] = (mask[i] > 0.5).astype(np.float32) + + joints[i][:, :, 0:2] = \ + warp_affine_joints(joints[i][:, :, 0:2], mat_output) + if results['ann_info']['scale_aware_sigma']: + joints[i][:, :, 3] = joints[i][:, :, 3] / aug_scale + + scale = self._get_scale(img_scale, self.input_size) + mat_input = get_affine_transform( + center=center, + scale=scale / 200.0, + rot=aug_rot, + output_size=self.input_size) + image = cv2.warpAffine(image, mat_input, (int( + self.input_size[0]), int(self.input_size[1]))) + + results['img'], results['mask'], results[ + 'joints'] = image, mask, joints + + return results + + +@PIPELINES.register_module() +class BottomUpGenerateHeatmapTarget: + """Generate multi-scale heatmap target for bottom-up. + + Args: + sigma (int): Sigma of heatmap Gaussian + max_num_people (int): Maximum number of people in an image + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + """ + + def __init__(self, sigma, use_udp=False): + self.sigma = sigma + self.use_udp = use_udp + + def _generate(self, num_joints, heatmap_size): + """Get heatmap generator.""" + heatmap_generator = [ + HeatmapGenerator(output_size, num_joints, self.sigma, self.use_udp) + for output_size in heatmap_size + ] + return heatmap_generator + + def __call__(self, results): + """Generate multi-scale heatmap target for bottom-up.""" + heatmap_generator = \ + self._generate(results['ann_info']['num_joints'], + results['ann_info']['heatmap_size']) + target_list = list() + joints_list = results['joints'] + + for scale_id in range(results['ann_info']['num_scales']): + heatmaps = heatmap_generator[scale_id](joints_list[scale_id]) + target_list.append(heatmaps.astype(np.float32)) + results['target'] = target_list + + return results + + +@PIPELINES.register_module() +class BottomUpGenerateTarget: + """Generate multi-scale heatmap target for associate embedding. + + Args: + sigma (int): Sigma of heatmap Gaussian + max_num_people (int): Maximum number of people in an image + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). 
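+
+ Required keys: 'joints', 'mask', 'ann_info'.
+ Modified keys: 'targets', 'masks', 'joints'.
+
+ Example (values are illustrative)::
+
+ dict(type='BottomUpGenerateTarget', sigma=2, max_num_people=30)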
+ """ + + def __init__(self, sigma, max_num_people, use_udp=False): + self.sigma = sigma + self.max_num_people = max_num_people + self.use_udp = use_udp + + def _generate(self, num_joints, heatmap_size): + """Get heatmap generator and joint encoder.""" + heatmap_generator = [ + HeatmapGenerator(output_size, num_joints, self.sigma, self.use_udp) + for output_size in heatmap_size + ] + joints_encoder = [ + JointsEncoder(self.max_num_people, num_joints, output_size, True) + for output_size in heatmap_size + ] + return heatmap_generator, joints_encoder + + def __call__(self, results): + """Generate multi-scale heatmap target for bottom-up.""" + heatmap_generator, joints_encoder = \ + self._generate(results['ann_info']['num_joints'], + results['ann_info']['heatmap_size']) + target_list = list() + mask_list, joints_list = results['mask'], results['joints'] + + for scale_id in range(results['ann_info']['num_scales']): + target_t = heatmap_generator[scale_id](joints_list[scale_id]) + joints_t = joints_encoder[scale_id](joints_list[scale_id]) + + target_list.append(target_t.astype(np.float32)) + mask_list[scale_id] = mask_list[scale_id].astype(np.float32) + joints_list[scale_id] = joints_t.astype(np.int32) + + results['masks'], results['joints'] = mask_list, joints_list + results['targets'] = target_list + + return results + + +@PIPELINES.register_module() +class BottomUpGeneratePAFTarget: + """Generate multi-scale heatmaps and part affinity fields (PAF) target for + bottom-up. Paper ref: Cao et al. Realtime Multi-Person 2D Human Pose + Estimation using Part Affinity Fields (CVPR 2017). + + Args: + limb_width (int): Limb width of part affinity fields + """ + + def __init__(self, limb_width, skeleton=None): + self.limb_width = limb_width + self.skeleton = skeleton + + def _generate(self, heatmap_size, skeleton): + """Get PAF generator.""" + paf_generator = [ + PAFGenerator(output_size, self.limb_width, skeleton) + for output_size in heatmap_size + ] + return paf_generator + + def __call__(self, results): + """Generate multi-scale part affinity fields for bottom-up.""" + if self.skeleton is None: + assert results['ann_info']['skeleton'] is not None + self.skeleton = results['ann_info']['skeleton'] + + paf_generator = \ + self._generate(results['ann_info']['heatmap_size'], + self.skeleton) + target_list = list() + joints_list = results['joints'] + + for scale_id in range(results['ann_info']['num_scales']): + pafs = paf_generator[scale_id](joints_list[scale_id]) + target_list.append(pafs.astype(np.float32)) + + results['target'] = target_list + + return results + + +@PIPELINES.register_module() +class BottomUpGetImgSize: + """Get multi-scale image sizes for bottom-up, including base_size and + test_scale_factor. Keep the ratio and the image is resized to + `results['ann_info']['image_size']×current_scale`. + + Args: + test_scale_factor (List[float]): Multi scale + current_scale (int): default 1 + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). 
+ """ + + def __init__(self, test_scale_factor, current_scale=1, use_udp=False): + self.test_scale_factor = test_scale_factor + self.min_scale = min(test_scale_factor) + self.current_scale = current_scale + self.use_udp = use_udp + + def __call__(self, results): + """Get multi-scale image sizes for bottom-up.""" + input_size = results['ann_info']['image_size'] + if not isinstance(input_size, np.ndarray): + input_size = np.array(input_size) + if input_size.size > 1: + assert len(input_size) == 2 + else: + input_size = np.array([input_size, input_size], dtype=np.int) + img = results['img'] + + h, w, _ = img.shape + + # calculate the size for min_scale + min_input_w = _ceil_to_multiples_of(self.min_scale * input_size[0], 64) + min_input_h = _ceil_to_multiples_of(self.min_scale * input_size[1], 64) + if w < h: + w_resized = int(min_input_w * self.current_scale / self.min_scale) + h_resized = int( + _ceil_to_multiples_of(min_input_w / w * h, 64) * + self.current_scale / self.min_scale) + if self.use_udp: + scale_w = w - 1.0 + scale_h = (h_resized - 1.0) / (w_resized - 1.0) * (w - 1.0) + else: + scale_w = w / 200.0 + scale_h = h_resized / w_resized * w / 200.0 + else: + h_resized = int(min_input_h * self.current_scale / self.min_scale) + w_resized = int( + _ceil_to_multiples_of(min_input_h / h * w, 64) * + self.current_scale / self.min_scale) + if self.use_udp: + scale_h = h - 1.0 + scale_w = (w_resized - 1.0) / (h_resized - 1.0) * (h - 1.0) + else: + scale_h = h / 200.0 + scale_w = w_resized / h_resized * h / 200.0 + if self.use_udp: + center = (scale_w / 2.0, scale_h / 2.0) + else: + center = np.array([round(w / 2.0), round(h / 2.0)]) + results['ann_info']['test_scale_factor'] = self.test_scale_factor + results['ann_info']['base_size'] = (w_resized, h_resized) + results['ann_info']['center'] = center + results['ann_info']['scale'] = np.array([scale_w, scale_h]) + + return results + + +@PIPELINES.register_module() +class BottomUpResizeAlign: + """Resize multi-scale size and align transform for bottom-up. + + Args: + transforms (List): ToTensor & Normalize + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). 
+ """ + + def __init__(self, transforms, use_udp=False): + self.transforms = Compose(transforms) + if use_udp: + self._resize_align_multi_scale = _resize_align_multi_scale_udp + else: + self._resize_align_multi_scale = _resize_align_multi_scale + + def __call__(self, results): + """Resize multi-scale size and align transform for bottom-up.""" + input_size = results['ann_info']['image_size'] + if not isinstance(input_size, np.ndarray): + input_size = np.array(input_size) + if input_size.size > 1: + assert len(input_size) == 2 + else: + input_size = np.array([input_size, input_size], dtype=np.int) + test_scale_factor = results['ann_info']['test_scale_factor'] + aug_data = [] + + for _, s in enumerate(sorted(test_scale_factor, reverse=True)): + _results = results.copy() + image_resized, _, _ = self._resize_align_multi_scale( + _results['img'], input_size, s, min(test_scale_factor)) + _results['img'] = image_resized + _results = self.transforms(_results) + transformed_img = _results['img'].unsqueeze(0) + aug_data.append(transformed_img) + + results['ann_info']['aug_data'] = aug_data + + return results diff --git a/mmpose/datasets/pipelines/hand_transform.py b/mmpose/datasets/pipelines/hand_transform.py new file mode 100644 index 0000000..b83e399 --- /dev/null +++ b/mmpose/datasets/pipelines/hand_transform.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + +from mmpose.datasets.builder import PIPELINES +from .top_down_transform import TopDownRandomFlip + + +@PIPELINES.register_module() +class HandRandomFlip(TopDownRandomFlip): + """Data augmentation with random image flip. A child class of + TopDownRandomFlip. + + Required keys: 'img', 'joints_3d', 'joints_3d_visible', 'center', + 'hand_type', 'rel_root_depth' and 'ann_info'. + + Modifies key: 'img', 'joints_3d', 'joints_3d_visible', 'center', + 'hand_type', 'rel_root_depth'. + + Args: + flip_prob (float): Probability of flip. + """ + + def __call__(self, results): + """Perform data augmentation with random image flip.""" + # base flip augmentation + super().__call__(results) + + # flip hand type and root depth + hand_type = results['hand_type'] + rel_root_depth = results['rel_root_depth'] + flipped = results['flipped'] + if flipped: + hand_type[0], hand_type[1] = hand_type[1], hand_type[0] + rel_root_depth = -rel_root_depth + results['hand_type'] = hand_type + results['rel_root_depth'] = rel_root_depth + return results + + +@PIPELINES.register_module() +class HandGenerateRelDepthTarget: + """Generate the target relative root depth. + + Required keys: 'rel_root_depth', 'rel_root_valid', 'ann_info'. + + Modified keys: 'target', 'target_weight'. + """ + + def __init__(self): + pass + + def __call__(self, results): + """Generate the target heatmap.""" + rel_root_depth = results['rel_root_depth'] + rel_root_valid = results['rel_root_valid'] + cfg = results['ann_info'] + D = cfg['heatmap_size_root'] + root_depth_bound = cfg['root_depth_bound'] + target = (rel_root_depth / root_depth_bound + 0.5) * D + target_weight = rel_root_valid * (target >= 0) * (target <= D) + results['target'] = target * np.ones(1, dtype=np.float32) + results['target_weight'] = target_weight * np.ones(1, dtype=np.float32) + return results diff --git a/mmpose/datasets/pipelines/loading.py b/mmpose/datasets/pipelines/loading.py new file mode 100644 index 0000000..6475005 --- /dev/null +++ b/mmpose/datasets/pipelines/loading.py @@ -0,0 +1,91 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import mmcv +import numpy as np + +from ..builder import PIPELINES + + +@PIPELINES.register_module() +class LoadImageFromFile: + """Loading image(s) from file. + + Required key: "image_file". + + Added key: "img". + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + color_type (str): Flags specifying the color type of a loaded image, + candidates are 'color', 'grayscale' and 'unchanged'. + channel_order (str): Order of channel, candidates are 'bgr' and 'rgb'. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. + """ + + def __init__(self, + to_float32=False, + color_type='color', + channel_order='rgb', + file_client_args=dict(backend='disk')): + self.to_float32 = to_float32 + self.color_type = color_type + self.channel_order = channel_order + self.file_client_args = file_client_args.copy() + self.file_client = None + + def _read_image(self, path): + img_bytes = self.file_client.get(path) + img = mmcv.imfrombytes( + img_bytes, flag=self.color_type, channel_order=self.channel_order) + if img is None: + raise ValueError(f'Fail to read {path}') + if self.to_float32: + img = img.astype(np.float32) + return img + + def __call__(self, results): + """Loading image(s) from file.""" + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + image_file = results.get('image_file', None) + + if isinstance(image_file, (list, tuple)): + # Load images from a list of paths + results['img'] = [self._read_image(path) for path in image_file] + elif image_file is not None: + # Load single image from path + results['img'] = self._read_image(image_file) + else: + if 'img' not in results: + # If `image_file`` is not in results, check the `img` exists + # and format the image. This for compatibility when the image + # is manually set outside the pipeline. + raise KeyError('Either `image_file` or `img` should exist in ' + 'results.') + assert isinstance(results['img'], np.ndarray) + if self.color_type == 'color' and self.channel_order == 'rgb': + # The original results['img'] is assumed to be image(s) in BGR + # order, so we convert the color according to the arguments. + if results['img'].ndim == 3: + results['img'] = mmcv.bgr2rgb(results['img']) + elif results['img'].ndim == 4: + results['img'] = np.concatenate( + [mmcv.bgr2rgb(img) for img in results['img']], axis=0) + else: + raise ValueError('results["img"] has invalid shape ' + f'{results["img"].shape}') + + results['image_file'] = None + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32}, ' + f"color_type='{self.color_type}', " + f'file_client_args={self.file_client_args})') + return repr_str diff --git a/mmpose/datasets/pipelines/mesh_transform.py b/mmpose/datasets/pipelines/mesh_transform.py new file mode 100644 index 0000000..e3f32fe --- /dev/null +++ b/mmpose/datasets/pipelines/mesh_transform.py @@ -0,0 +1,399 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import cv2 +import mmcv +import numpy as np +import torch + +from mmpose.core.post_processing import (affine_transform, fliplr_joints, + get_affine_transform) +from mmpose.datasets.builder import PIPELINES + + +def _flip_smpl_pose(pose): + """Flip SMPL pose parameters horizontally. 
+ + Args: + pose (np.ndarray([72])): SMPL pose parameters + + Returns: + pose_flipped + """ + + flippedParts = [ + 0, 1, 2, 6, 7, 8, 3, 4, 5, 9, 10, 11, 15, 16, 17, 12, 13, 14, 18, 19, + 20, 24, 25, 26, 21, 22, 23, 27, 28, 29, 33, 34, 35, 30, 31, 32, 36, 37, + 38, 42, 43, 44, 39, 40, 41, 45, 46, 47, 51, 52, 53, 48, 49, 50, 57, 58, + 59, 54, 55, 56, 63, 64, 65, 60, 61, 62, 69, 70, 71, 66, 67, 68 + ] + pose_flipped = pose[flippedParts] + # Negate the second and the third dimension of the axis-angle + pose_flipped[1::3] = -pose_flipped[1::3] + pose_flipped[2::3] = -pose_flipped[2::3] + return pose_flipped + + +def _flip_iuv(iuv, uv_type='BF'): + """Flip IUV image horizontally. + + Note: + IUV image height: H + IUV image width: W + + Args: + iuv np.ndarray([H, W, 3]): IUV image + uv_type (str): The type of the UV map. + Candidate values: + 'DP': The UV map used in DensePose project. + 'SMPL': The default UV map of SMPL model. + 'BF': The UV map used in DecoMR project. + Default: 'BF' + + Returns: + iuv_flipped np.ndarray([H, W, 3]): Flipped IUV image + """ + assert uv_type in ['DP', 'SMPL', 'BF'] + if uv_type == 'BF': + iuv_flipped = iuv[:, ::-1, :] + iuv_flipped[:, :, 1] = 255 - iuv_flipped[:, :, 1] + else: + # The flip of other UV map is complex, not finished yet. + raise NotImplementedError( + f'The flip of {uv_type} UV map is not implemented yet.') + + return iuv_flipped + + +def _construct_rotation_matrix(rot, size=3): + """Construct the in-plane rotation matrix. + + Args: + rot (float): Rotation angle (degree). + size (int): The size of the rotation matrix. + Candidate Values: 2, 3. Defaults to 3. + + Returns: + rot_mat (np.ndarray([size, size]): Rotation matrix. + """ + rot_mat = np.eye(size, dtype=np.float32) + if rot != 0: + rot_rad = np.deg2rad(rot) + sn, cs = np.sin(rot_rad), np.cos(rot_rad) + rot_mat[0, :2] = [cs, -sn] + rot_mat[1, :2] = [sn, cs] + + return rot_mat + + +def _rotate_joints_3d(joints_3d, rot): + """Rotate the 3D joints in the local coordinates. + + Note: + Joints number: K + + Args: + joints_3d (np.ndarray([K, 3])): Coordinates of keypoints. + rot (float): Rotation angle (degree). + + Returns: + joints_3d_rotated + """ + # in-plane rotation + # 3D joints are rotated counterclockwise, + # so the rot angle is inversed. + rot_mat = _construct_rotation_matrix(-rot, 3) + + joints_3d_rotated = np.einsum('ij,kj->ki', rot_mat, joints_3d) + joints_3d_rotated = joints_3d_rotated.astype('float32') + return joints_3d_rotated + + +def _rotate_smpl_pose(pose, rot): + """Rotate SMPL pose parameters. SMPL (https://smpl.is.tue.mpg.de/) is a 3D + human model. + + Args: + pose (np.ndarray([72])): SMPL pose parameters + rot (float): Rotation angle (degree). + + Returns: + pose_rotated + """ + pose_rotated = pose.copy() + if rot != 0: + rot_mat = _construct_rotation_matrix(-rot) + orient = pose[:3] + # find the rotation of the body in camera frame + per_rdg, _ = cv2.Rodrigues(orient) + # apply the global rotation to the global orientation + res_rot, _ = cv2.Rodrigues(np.dot(rot_mat, per_rdg)) + pose_rotated[:3] = (res_rot.T)[0] + + return pose_rotated + + +def _flip_joints_3d(joints_3d, joints_3d_visible, flip_pairs): + """Flip human joints in 3D space horizontally. + + Note: + num_keypoints: K + + Args: + joints_3d (np.ndarray([K, 3])): Coordinates of keypoints. + joints_3d_visible (np.ndarray([K, 1])): Visibility of keypoints. + flip_pairs (list[tuple()]): Pairs of keypoints which are mirrored + (for example, left ear -- right ear). 
+ + Returns: + joints_3d_flipped, joints_3d_visible_flipped + """ + + assert len(joints_3d) == len(joints_3d_visible) + + joints_3d_flipped = joints_3d.copy() + joints_3d_visible_flipped = joints_3d_visible.copy() + + # Swap left-right parts + for left, right in flip_pairs: + joints_3d_flipped[left, :] = joints_3d[right, :] + joints_3d_flipped[right, :] = joints_3d[left, :] + + joints_3d_visible_flipped[left, :] = joints_3d_visible[right, :] + joints_3d_visible_flipped[right, :] = joints_3d_visible[left, :] + + # Flip horizontally + joints_3d_flipped[:, 0] = -joints_3d_flipped[:, 0] + joints_3d_flipped = joints_3d_flipped * joints_3d_visible_flipped + + return joints_3d_flipped, joints_3d_visible_flipped + + +@PIPELINES.register_module() +class LoadIUVFromFile: + """Loading IUV image from file.""" + + def __init__(self, to_float32=False): + self.to_float32 = to_float32 + self.color_type = 'color' + # channel relations: iuv->bgr + self.channel_order = 'bgr' + + def __call__(self, results): + """Loading image from file.""" + has_iuv = results['has_iuv'] + use_iuv = results['ann_info']['use_IUV'] + if has_iuv and use_iuv: + iuv_file = results['iuv_file'] + iuv = mmcv.imread(iuv_file, self.color_type, self.channel_order) + if iuv is None: + raise ValueError(f'Fail to read {iuv_file}') + else: + has_iuv = 0 + iuv = None + + results['has_iuv'] = has_iuv + results['iuv'] = iuv + return results + + +@PIPELINES.register_module() +class IUVToTensor: + """Transform IUV image to part index mask and uv coordinates image. The 3 + channels of IUV image means: part index, u coordinates, v coordinates. + + Required key: 'iuv', 'ann_info'. + Modifies key: 'part_index', 'uv_coordinates'. + + Args: + results (dict): contain all information about training. + """ + + def __call__(self, results): + iuv = results['iuv'] + if iuv is None: + H, W = results['ann_info']['iuv_size'] + part_index = torch.zeros([1, H, W], dtype=torch.long) + uv_coordinates = torch.zeros([2, H, W], dtype=torch.float32) + else: + part_index = torch.LongTensor(iuv[:, :, 0])[None, :, :] + uv_coordinates = torch.FloatTensor(iuv[:, :, 1:]) / 255 + uv_coordinates = uv_coordinates.permute(2, 0, 1) + results['part_index'] = part_index + results['uv_coordinates'] = uv_coordinates + return results + + +@PIPELINES.register_module() +class MeshRandomChannelNoise: + """Data augmentation with random channel noise. + + Required keys: 'img' + Modifies key: 'img' + + Args: + noise_factor (float): Multiply each channel with + a factor between``[1-scale_factor, 1+scale_factor]`` + """ + + def __init__(self, noise_factor=0.4): + self.noise_factor = noise_factor + + def __call__(self, results): + """Perform data augmentation with random channel noise.""" + img = results['img'] + + # Each channel is multiplied with a number + # in the area [1-self.noise_factor, 1+self.noise_factor] + pn = np.random.uniform(1 - self.noise_factor, 1 + self.noise_factor, + (1, 3)) + img = cv2.multiply(img, pn) + + results['img'] = img + return results + + +@PIPELINES.register_module() +class MeshRandomFlip: + """Data augmentation with random image flip. + + Required keys: 'img', 'joints_2d','joints_2d_visible', 'joints_3d', + 'joints_3d_visible', 'center', 'pose', 'iuv' and 'ann_info'. + Modifies key: 'img', 'joints_2d','joints_2d_visible', 'joints_3d', + 'joints_3d_visible', 'center', 'pose', 'iuv'. + + Args: + flip_prob (float): Probability of flip. 
+ """ + + def __init__(self, flip_prob=0.5): + self.flip_prob = flip_prob + + def __call__(self, results): + """Perform data augmentation with random image flip.""" + if np.random.rand() > self.flip_prob: + return results + + img = results['img'] + joints_2d = results['joints_2d'] + joints_2d_visible = results['joints_2d_visible'] + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + pose = results['pose'] + center = results['center'] + + img = img[:, ::-1, :] + pose = _flip_smpl_pose(pose) + + joints_2d, joints_2d_visible = fliplr_joints( + joints_2d, joints_2d_visible, img.shape[1], + results['ann_info']['flip_pairs']) + + joints_3d, joints_3d_visible = _flip_joints_3d( + joints_3d, joints_3d_visible, results['ann_info']['flip_pairs']) + center[0] = img.shape[1] - center[0] - 1 + + if 'iuv' in results.keys(): + iuv = results['iuv'] + if iuv is not None: + iuv = _flip_iuv(iuv, results['ann_info']['uv_type']) + results['iuv'] = iuv + + results['img'] = img + results['joints_2d'] = joints_2d + results['joints_2d_visible'] = joints_2d_visible + results['joints_3d'] = joints_3d + results['joints_3d_visible'] = joints_3d_visible + results['pose'] = pose + results['center'] = center + return results + + +@PIPELINES.register_module() +class MeshGetRandomScaleRotation: + """Data augmentation with random scaling & rotating. + + Required key: 'scale'. Modifies key: 'scale' and 'rotation'. + + Args: + rot_factor (int): Rotating to ``[-2*rot_factor, 2*rot_factor]``. + scale_factor (float): Scaling to ``[1-scale_factor, 1+scale_factor]``. + rot_prob (float): Probability of random rotation. + """ + + def __init__(self, rot_factor=30, scale_factor=0.25, rot_prob=0.6): + self.rot_factor = rot_factor + self.scale_factor = scale_factor + self.rot_prob = rot_prob + + def __call__(self, results): + """Perform data augmentation with random scaling & rotating.""" + s = results['scale'] + + sf = self.scale_factor + rf = self.rot_factor + + s_factor = np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) + s = s * s_factor + + r_factor = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) + r = r_factor if np.random.rand() <= self.rot_prob else 0 + + results['scale'] = s + results['rotation'] = r + + return results + + +@PIPELINES.register_module() +class MeshAffine: + """Affine transform the image to get input image. Affine transform the 2D + keypoints, 3D kepoints and IUV image too. + + Required keys: 'img', 'joints_2d','joints_2d_visible', 'joints_3d', + 'joints_3d_visible', 'pose', 'iuv', 'ann_info','scale', 'rotation' and + 'center'. Modifies key: 'img', 'joints_2d','joints_2d_visible', + 'joints_3d', 'pose', 'iuv'. 
+ """ + + def __call__(self, results): + image_size = results['ann_info']['image_size'] + + img = results['img'] + joints_2d = results['joints_2d'] + joints_2d_visible = results['joints_2d_visible'] + joints_3d = results['joints_3d'] + pose = results['pose'] + + c = results['center'] + s = results['scale'] + r = results['rotation'] + trans = get_affine_transform(c, s, r, image_size) + + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + + for i in range(results['ann_info']['num_joints']): + if joints_2d_visible[i, 0] > 0.0: + joints_2d[i] = affine_transform(joints_2d[i], trans) + + joints_3d = _rotate_joints_3d(joints_3d, r) + pose = _rotate_smpl_pose(pose, r) + + results['img'] = img + results['joints_2d'] = joints_2d + results['joints_2d_visible'] = joints_2d_visible + results['joints_3d'] = joints_3d + results['pose'] = pose + + if 'iuv' in results.keys(): + iuv = results['iuv'] + if iuv is not None: + iuv_size = results['ann_info']['iuv_size'] + iuv = cv2.warpAffine( + iuv, + trans, (int(iuv_size[0]), int(iuv_size[1])), + flags=cv2.INTER_NEAREST) + results['iuv'] = iuv + + return results diff --git a/mmpose/datasets/pipelines/pose3d_transform.py b/mmpose/datasets/pipelines/pose3d_transform.py new file mode 100644 index 0000000..1249378 --- /dev/null +++ b/mmpose/datasets/pipelines/pose3d_transform.py @@ -0,0 +1,643 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import mmcv +import numpy as np +import torch +from mmcv.utils import build_from_cfg + +from mmpose.core.camera import CAMERAS +from mmpose.core.post_processing import fliplr_regression +from mmpose.datasets.builder import PIPELINES + + +@PIPELINES.register_module() +class GetRootCenteredPose: + """Zero-center the pose around a given root joint. Optionally, the root + joint can be removed from the original pose and stored as a separate item. + + Note that the root-centered joints may no longer align with some annotation + information (e.g. flip_pairs, num_joints, inference_channel, etc.) due to + the removal of the root joint. + + Args: + item (str): The name of the pose to apply root-centering. + root_index (int): Root joint index in the pose. + visible_item (str): The name of the visibility item. + remove_root (bool): If true, remove the root joint from the pose + root_name (str): Optional. If not none, it will be used as the key to + store the root position separated from the original pose. 
+ + Required keys: + item + + Modified keys: + item, visible_item, root_name + """ + + def __init__(self, + item, + root_index, + visible_item=None, + remove_root=False, + root_name=None): + self.item = item + self.root_index = root_index + self.remove_root = remove_root + self.root_name = root_name + self.visible_item = visible_item + + def __call__(self, results): + assert self.item in results + joints = results[self.item] + root_idx = self.root_index + + assert joints.ndim >= 2 and joints.shape[-2] > root_idx,\ + f'Got invalid joint shape {joints.shape}' + + root = joints[..., root_idx:root_idx + 1, :] + joints = joints - root + + results[self.item] = joints + if self.root_name is not None: + results[self.root_name] = root + + if self.remove_root: + results[self.item] = np.delete( + results[self.item], root_idx, axis=-2) + if self.visible_item is not None: + assert self.visible_item in results + results[self.visible_item] = np.delete( + results[self.visible_item], root_idx, axis=-2) + # Add a flag to avoid latter transforms that rely on the root + # joint or the original joint index + results[f'{self.item}_root_removed'] = True + + # Save the root index which is necessary to restore the global pose + if self.root_name is not None: + results[f'{self.root_name}_index'] = self.root_index + + return results + + +@PIPELINES.register_module() +class NormalizeJointCoordinate: + """Normalize the joint coordinate with given mean and std. + + Args: + item (str): The name of the pose to normalize. + mean (array): Mean values of joint coordinates in shape [K, C]. + std (array): Std values of joint coordinates in shape [K, C]. + norm_param_file (str): Optionally load a dict containing `mean` and + `std` from a file using `mmcv.load`. + + Required keys: + item + + Modified keys: + item + """ + + def __init__(self, item, mean=None, std=None, norm_param_file=None): + self.item = item + self.norm_param_file = norm_param_file + if norm_param_file is not None: + norm_param = mmcv.load(norm_param_file) + assert 'mean' in norm_param and 'std' in norm_param + mean = norm_param['mean'] + std = norm_param['std'] + else: + assert mean is not None + assert std is not None + + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + + def __call__(self, results): + assert self.item in results + results[self.item] = (results[self.item] - self.mean) / self.std + results[f'{self.item}_mean'] = self.mean.copy() + results[f'{self.item}_std'] = self.std.copy() + return results + + +@PIPELINES.register_module() +class ImageCoordinateNormalization: + """Normalize the 2D joint coordinate with image width and height. Range [0, + w] is mapped to [-1, 1], while preserving the aspect ratio. + + Args: + item (str|list[str]): The name of the pose to normalize. + norm_camera (bool): Whether to normalize camera intrinsics. + Default: False. + camera_param (dict|None): The camera parameter dict. See the camera + class definition for more details. If None is given, the camera + parameter will be obtained during processing of each data sample + with the key "camera_param". 
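+
+ Example (values are illustrative): for an image of width 1000 and
+ height 1000, a joint at pixel ``(1000, 500)`` is mapped to
+ ``(1.0, 0.0)``; both axes are divided by ``0.5 * image_width``, so the
+ aspect ratio is preserved.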
+ + Required keys: + item + + Modified keys: + item (, camera_param) + """ + + def __init__(self, item, norm_camera=False, camera_param=None): + self.item = item + if isinstance(self.item, str): + self.item = [self.item] + + self.norm_camera = norm_camera + + if camera_param is None: + self.static_camera = False + else: + self.static_camera = True + self.camera_param = camera_param + + def __call__(self, results): + center = np.array( + [0.5 * results['image_width'], 0.5 * results['image_height']], + dtype=np.float32) + scale = np.array(0.5 * results['image_width'], dtype=np.float32) + + for item in self.item: + results[item] = (results[item] - center) / scale + + if self.norm_camera: + if self.static_camera: + camera_param = copy.deepcopy(self.camera_param) + else: + assert 'camera_param' in results, \ + 'Camera parameters are missing.' + camera_param = results['camera_param'] + assert 'f' in camera_param and 'c' in camera_param + camera_param['f'] = camera_param['f'] / scale + camera_param['c'] = (camera_param['c'] - center[:, None]) / scale + if 'camera_param' not in results: + results['camera_param'] = dict() + results['camera_param'].update(camera_param) + + return results + + +@PIPELINES.register_module() +class CollectCameraIntrinsics: + """Store camera intrinsics in a 1-dim array, including f, c, k, p. + + Args: + camera_param (dict|None): The camera parameter dict. See the camera + class definition for more details. If None is given, the camera + parameter will be obtained during processing of each data sample + with the key "camera_param". + need_distortion (bool): Whether need distortion parameters k and p. + Default: True. + + Required keys: + camera_param (if camera parameters are not given in initialization) + + Modified keys: + intrinsics + """ + + def __init__(self, camera_param=None, need_distortion=True): + if camera_param is None: + self.static_camera = False + else: + self.static_camera = True + self.camera_param = camera_param + self.need_distortion = need_distortion + + def __call__(self, results): + if self.static_camera: + camera_param = copy.deepcopy(self.camera_param) + else: + assert 'camera_param' in results, 'Camera parameters are missing.' + camera_param = results['camera_param'] + assert 'f' in camera_param and 'c' in camera_param + intrinsics = np.concatenate( + [camera_param['f'].reshape(2), camera_param['c'].reshape(2)]) + if self.need_distortion: + assert 'k' in camera_param and 'p' in camera_param + intrinsics = np.concatenate([ + intrinsics, camera_param['k'].reshape(3), + camera_param['p'].reshape(2) + ]) + results['intrinsics'] = intrinsics + + return results + + +@PIPELINES.register_module() +class CameraProjection: + """Apply camera projection to joint coordinates. + + Args: + item (str): The name of the pose to apply camera projection. + mode (str): The type of camera projection, supported options are + + - world_to_camera + - world_to_pixel + - camera_to_world + - camera_to_pixel + output_name (str|None): The name of the projected pose. If None + (default) is given, the projected pose will be stored in place. + camera_type (str): The camera class name (should be registered in + CAMERA). + camera_param (dict|None): The camera parameter dict. See the camera + class definition for more details. If None is given, the camera + parameter will be obtained during processing of each data sample + with the key "camera_param". 
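+
+ Example (the item names are illustrative placeholders)::
+
+ dict(type='CameraProjection',
+ item='joints_gt',
+ mode='world_to_camera',
+ output_name='joints_gt_camera')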
+ + Required keys: + + - item + - camera_param (if camera parameters are not given in initialization) + + Modified keys: + output_name + """ + + def __init__(self, + item, + mode, + output_name=None, + camera_type='SimpleCamera', + camera_param=None): + self.item = item + self.mode = mode + self.output_name = output_name + self.camera_type = camera_type + allowed_mode = { + 'world_to_camera', + 'world_to_pixel', + 'camera_to_world', + 'camera_to_pixel', + } + if mode not in allowed_mode: + raise ValueError( + f'Got invalid mode: {mode}, allowed modes are {allowed_mode}') + + if camera_param is None: + self.static_camera = False + else: + self.static_camera = True + self.camera = self._build_camera(camera_param) + + def _build_camera(self, param): + cfgs = dict(type=self.camera_type, param=param) + return build_from_cfg(cfgs, CAMERAS) + + def __call__(self, results): + assert self.item in results + joints = results[self.item] + + if self.static_camera: + camera = self.camera + else: + assert 'camera_param' in results, 'Camera parameters are missing.' + camera = self._build_camera(results['camera_param']) + + if self.mode == 'world_to_camera': + output = camera.world_to_camera(joints) + elif self.mode == 'world_to_pixel': + output = camera.world_to_pixel(joints) + elif self.mode == 'camera_to_world': + output = camera.camera_to_world(joints) + elif self.mode == 'camera_to_pixel': + output = camera.camera_to_pixel(joints) + else: + raise NotImplementedError + + output_name = self.output_name + if output_name is None: + output_name = self.item + + results[output_name] = output + return results + + +@PIPELINES.register_module() +class RelativeJointRandomFlip: + """Data augmentation with random horizontal joint flip around a root joint. + + Args: + item (str|list[str]): The name of the pose to flip. + flip_cfg (dict|list[dict]): Configurations of the fliplr_regression + function. It should contain the following arguments: + + - ``center_mode``: The mode to set the center location on the \ + x-axis to flip around. + - ``center_x`` or ``center_index``: Set the x-axis location or \ + the root joint's index to define the flip center. + + Please refer to the docstring of the fliplr_regression function for + more details. + visible_item (str|list[str]): The name of the visibility item which + will be flipped accordingly along with the pose. + flip_prob (float): Probability of flip. + flip_camera (bool): Whether to flip horizontal distortion coefficients. + camera_param (dict|None): The camera parameter dict. See the camera + class definition for more details. If None is given, the camera + parameter will be obtained during processing of each data sample + with the key "camera_param". 
+ + Required keys: + item + + Modified keys: + item (, camera_param) + """ + + def __init__(self, + item, + flip_cfg, + visible_item=None, + flip_prob=0.5, + flip_camera=False, + camera_param=None): + self.item = item + self.flip_cfg = flip_cfg + self.vis_item = visible_item + self.flip_prob = flip_prob + self.flip_camera = flip_camera + if camera_param is None: + self.static_camera = False + else: + self.static_camera = True + self.camera_param = camera_param + + if isinstance(self.item, str): + self.item = [self.item] + if isinstance(self.flip_cfg, dict): + self.flip_cfg = [self.flip_cfg] * len(self.item) + assert len(self.item) == len(self.flip_cfg) + if isinstance(self.vis_item, str): + self.vis_item = [self.vis_item] + + def __call__(self, results): + + if results.get(f'{self.item}_root_removed', False): + raise RuntimeError('The transform RelativeJointRandomFlip should ' + f'not be applied to {self.item} whose root ' + 'joint has been removed and joint indices have ' + 'been changed') + + if np.random.rand() <= self.flip_prob: + + flip_pairs = results['ann_info']['flip_pairs'] + + # flip joint coordinates + for i, item in enumerate(self.item): + assert item in results + joints = results[item] + + joints_flipped = fliplr_regression(joints, flip_pairs, + **self.flip_cfg[i]) + + results[item] = joints_flipped + + # flip joint visibility + for vis_item in self.vis_item: + assert vis_item in results + visible = results[vis_item] + visible_flipped = visible.copy() + for left, right in flip_pairs: + visible_flipped[..., left, :] = visible[..., right, :] + visible_flipped[..., right, :] = visible[..., left, :] + results[vis_item] = visible_flipped + + # flip horizontal distortion coefficients + if self.flip_camera: + if self.static_camera: + camera_param = copy.deepcopy(self.camera_param) + else: + assert 'camera_param' in results, \ + 'Camera parameters are missing.' + camera_param = results['camera_param'] + assert 'c' in camera_param + camera_param['c'][0] *= -1 + + if 'p' in camera_param: + camera_param['p'][0] *= -1 + + if 'camera_param' not in results: + results['camera_param'] = dict() + results['camera_param'].update(camera_param) + + return results + + +@PIPELINES.register_module() +class PoseSequenceToTensor: + """Convert pose sequence from numpy array to Tensor. + + The original pose sequence should have a shape of [T,K,C] or [K,C], where + T is the sequence length, K and C are keypoint number and dimension. The + converted pose sequence will have a shape of [KxC, T]. + + Args: + item (str): The name of the pose sequence + + Required keys: + item + + Modified keys: + item + """ + + def __init__(self, item): + self.item = item + + def __call__(self, results): + assert self.item in results + seq = results[self.item] + + assert isinstance(seq, np.ndarray) + assert seq.ndim in {2, 3} + + if seq.ndim == 2: + seq = seq[None, ...] + + T = seq.shape[0] + seq = seq.transpose(1, 2, 0).reshape(-1, T) + results[self.item] = torch.from_numpy(seq) + + return results + + +@PIPELINES.register_module() +class Generate3DHeatmapTarget: + """Generate the target 3d heatmap. + + Required keys: 'joints_3d', 'joints_3d_visible', 'ann_info'. + Modified keys: 'target', and 'target_weight'. + + Args: + sigma: Sigma of heatmap gaussian. + joint_indices (list): Indices of joints used for heatmap generation. + If None (default) is given, all joints will be used. + max_bound (float): The maximal value of heatmap. 
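+
+ Note (shapes follow the implementation below): with
+ ``ann_info['heatmap_size'] = [64, 64, 64]`` given as (W, H, D), the
+ generated ``target`` has shape ``[num_joints, D, H, W]`` and
+ ``target_weight`` has shape ``[num_joints, 1]``.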
+ """ + + def __init__(self, sigma=2, joint_indices=None, max_bound=1.0): + self.sigma = sigma + self.joint_indices = joint_indices + self.max_bound = max_bound + + def __call__(self, results): + """Generate the target heatmap.""" + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + cfg = results['ann_info'] + image_size = cfg['image_size'] + W, H, D = cfg['heatmap_size'] + heatmap3d_depth_bound = cfg['heatmap3d_depth_bound'] + joint_weights = cfg['joint_weights'] + use_different_joint_weights = cfg['use_different_joint_weights'] + + # select the joints used for target generation + if self.joint_indices is not None: + joints_3d = joints_3d[self.joint_indices, ...] + joints_3d_visible = joints_3d_visible[self.joint_indices, ...] + joint_weights = joint_weights[self.joint_indices, ...] + num_joints = joints_3d.shape[0] + + # get the joint location in heatmap coordinates + mu_x = joints_3d[:, 0] * W / image_size[0] + mu_y = joints_3d[:, 1] * H / image_size[1] + mu_z = (joints_3d[:, 2] / heatmap3d_depth_bound + 0.5) * D + + target = np.zeros([num_joints, D, H, W], dtype=np.float32) + + target_weight = joints_3d_visible[:, 0].astype(np.float32) + target_weight = target_weight * (mu_z >= 0) * (mu_z < D) + if use_different_joint_weights: + target_weight = target_weight * joint_weights + target_weight = target_weight[:, None] + + # only compute the voxel value near the joints location + tmp_size = 3 * self.sigma + + # get neighboring voxels coordinates + x = y = z = np.arange(2 * tmp_size + 1, dtype=np.float32) - tmp_size + zz, yy, xx = np.meshgrid(z, y, x) + xx = xx[None, ...].astype(np.float32) + yy = yy[None, ...].astype(np.float32) + zz = zz[None, ...].astype(np.float32) + mu_x = mu_x[..., None, None, None] + mu_y = mu_y[..., None, None, None] + mu_z = mu_z[..., None, None, None] + xx, yy, zz = xx + mu_x, yy + mu_y, zz + mu_z + + # round the coordinates + xx = xx.round().clip(0, W - 1) + yy = yy.round().clip(0, H - 1) + zz = zz.round().clip(0, D - 1) + + # compute the target value near joints + local_target = \ + np.exp(-((xx - mu_x)**2 + (yy - mu_y)**2 + (zz - mu_z)**2) / + (2 * self.sigma**2)) + + # put the local target value to the full target heatmap + local_size = xx.shape[1] + idx_joints = np.tile( + np.arange(num_joints)[:, None, None, None], + [1, local_size, local_size, local_size]) + idx = np.stack([idx_joints, zz, yy, xx], + axis=-1).astype(int).reshape(-1, 4) + target[idx[:, 0], idx[:, 1], idx[:, 2], + idx[:, 3]] = local_target.reshape(-1) + target = target * self.max_bound + results['target'] = target + results['target_weight'] = target_weight + return results + + +@PIPELINES.register_module() +class GenerateVoxel3DHeatmapTarget: + """Generate the target 3d heatmap. + + Required keys: 'joints_3d', 'joints_3d_visible', 'ann_info_3d'. + Modified keys: 'target', and 'target_weight'. + + Args: + sigma: Sigma of heatmap gaussian (mm). + joint_indices (list): Indices of joints used for heatmap generation. + If None (default) is given, all joints will be used. 
+ """ + + def __init__(self, sigma=200.0, joint_indices=None): + self.sigma = sigma # mm + self.joint_indices = joint_indices + + def __call__(self, results): + """Generate the target heatmap.""" + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + cfg = results['ann_info'] + + num_people = len(joints_3d) + num_joints = joints_3d[0].shape[0] + + if self.joint_indices is not None: + num_joints = len(self.joint_indices) + joint_indices = self.joint_indices + else: + joint_indices = list(range(num_joints)) + + space_size = cfg['space_size'] + space_center = cfg['space_center'] + cube_size = cfg['cube_size'] + grids_x = np.linspace(-space_size[0] / 2, space_size[0] / 2, + cube_size[0]) + space_center[0] + grids_y = np.linspace(-space_size[1] / 2, space_size[1] / 2, + cube_size[1]) + space_center[1] + grids_z = np.linspace(-space_size[2] / 2, space_size[2] / 2, + cube_size[2]) + space_center[2] + + target = np.zeros( + (num_joints, cube_size[0], cube_size[1], cube_size[2]), + dtype=np.float32) + + for n in range(num_people): + for idx, joint_id in enumerate(joint_indices): + mu_x = joints_3d[n][joint_id][0] + mu_y = joints_3d[n][joint_id][1] + mu_z = joints_3d[n][joint_id][2] + vis = joints_3d_visible[n][joint_id][0] + if vis < 1: + continue + i_x = [ + np.searchsorted(grids_x, mu_x - 3 * self.sigma), + np.searchsorted(grids_x, mu_x + 3 * self.sigma, 'right') + ] + i_y = [ + np.searchsorted(grids_y, mu_y - 3 * self.sigma), + np.searchsorted(grids_y, mu_y + 3 * self.sigma, 'right') + ] + i_z = [ + np.searchsorted(grids_z, mu_z - 3 * self.sigma), + np.searchsorted(grids_z, mu_z + 3 * self.sigma, 'right') + ] + if i_x[0] >= i_x[1] or i_y[0] >= i_y[1] or i_z[0] >= i_z[1]: + continue + kernel_xs, kernel_ys, kernel_zs = np.meshgrid( + grids_x[i_x[0]:i_x[1]], + grids_y[i_y[0]:i_y[1]], + grids_z[i_z[0]:i_z[1]], + indexing='ij') + g = np.exp(-((kernel_xs - mu_x)**2 + (kernel_ys - mu_y)**2 + + (kernel_zs - mu_z)**2) / (2 * self.sigma**2)) + target[idx, i_x[0]:i_x[1], i_y[0]:i_y[1], i_z[0]:i_z[1]] \ + = np.maximum(target[idx, i_x[0]:i_x[1], + i_y[0]:i_y[1], i_z[0]:i_z[1]], g) + + target = np.clip(target, 0, 1) + if target.shape[0] == 1: + target = target[0] + + results['targets_3d'] = target + + return results diff --git a/mmpose/datasets/pipelines/shared_transform.py b/mmpose/datasets/pipelines/shared_transform.py new file mode 100644 index 0000000..e4fea80 --- /dev/null +++ b/mmpose/datasets/pipelines/shared_transform.py @@ -0,0 +1,527 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from collections.abc import Sequence + +import mmcv +import numpy as np +from mmcv.parallel import DataContainer as DC +from mmcv.utils import build_from_cfg +from numpy import random +from torchvision.transforms import functional as F + +from ..builder import PIPELINES + +try: + import albumentations +except ImportError: + albumentations = None + + +@PIPELINES.register_module() +class ToTensor: + """Transform image to Tensor. + + Required key: 'img'. Modifies key: 'img'. + + Args: + results (dict): contain all information about training. + """ + + def __call__(self, results): + if isinstance(results['img'], (list, tuple)): + results['img'] = [F.to_tensor(img) for img in results['img']] + else: + results['img'] = F.to_tensor(results['img']) + + return results + + +@PIPELINES.register_module() +class NormalizeTensor: + """Normalize the Tensor image (CxHxW), with mean and std. + + Required key: 'img'. Modifies key: 'img'. 
+ + Args: + mean (list[float]): Mean values of 3 channels. + std (list[float]): Std values of 3 channels. + """ + + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, results): + if isinstance(results['img'], (list, tuple)): + results['img'] = [ + F.normalize(img, mean=self.mean, std=self.std) + for img in results['img'] + ] + else: + results['img'] = F.normalize( + results['img'], mean=self.mean, std=self.std) + + return results + + +@PIPELINES.register_module() +class Compose: + """Compose a data pipeline with a sequence of transforms. + + Args: + transforms (list[dict | callable]): Either config + dicts of transforms or transform objects. + """ + + def __init__(self, transforms): + assert isinstance(transforms, Sequence) + self.transforms = [] + for transform in transforms: + if isinstance(transform, dict): + transform = build_from_cfg(transform, PIPELINES) + self.transforms.append(transform) + elif callable(transform): + self.transforms.append(transform) + else: + raise TypeError('transform must be callable or a dict, but got' + f' {type(transform)}') + + def __call__(self, data): + """Call function to apply transforms sequentially. + + Args: + data (dict): A result dict contains the data to transform. + + Returns: + dict: Transformed data. + """ + for t in self.transforms: + data = t(data) + if data is None: + return None + return data + + def __repr__(self): + """Compute the string representation.""" + format_string = self.__class__.__name__ + '(' + for t in self.transforms: + format_string += f'\n {t}' + format_string += '\n)' + return format_string + + +@PIPELINES.register_module() +class Collect: + """Collect data from the loader relevant to the specific task. + + This keeps the items in `keys` as it is, and collect items in `meta_keys` + into a meta item called `meta_name`.This is usually the last stage of the + data loader pipeline. + For example, when keys='imgs', meta_keys=('filename', 'label', + 'original_shape'), meta_name='img_metas', the results will be a dict with + keys 'imgs' and 'img_metas', where 'img_metas' is a DataContainer of + another dict with keys 'filename', 'label', 'original_shape'. + + Args: + keys (Sequence[str|tuple]): Required keys to be collected. If a tuple + (key, key_new) is given as an element, the item retrieved by key will + be renamed as key_new in collected data. + meta_name (str): The name of the key that contains meta information. + This key is always populated. Default: "img_metas". + meta_keys (Sequence[str|tuple]): Keys that are collected under + meta_name. The contents of the `meta_name` dictionary depends + on `meta_keys`. + """ + + def __init__(self, keys, meta_keys, meta_name='img_metas'): + self.keys = keys + self.meta_keys = meta_keys + self.meta_name = meta_name + + def __call__(self, results): + """Performs the Collect formatting. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
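A hedged sketch of how these shared transforms are typically chained at the end of a top-down pipeline config. The normalization statistics are the usual ImageNet values and the meta key names are the conventional ones; both are assumptions for illustration, not requirements of this file.

train_pipeline_tail = [
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=['image_file', 'center', 'scale', 'rotation', 'flip_pairs']),
]

# Building the pipeline goes through the Compose class defined below:
# from mmpose.datasets.pipelines import Compose
# pipeline = Compose(train_pipeline_tail)
# data = pipeline(results)   # `results` is the dict produced by the dataset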
+ """ + if 'ann_info' in results: + results.update(results['ann_info']) + + data = {} + for key in self.keys: + if isinstance(key, tuple): + assert len(key) == 2 + key_src, key_tgt = key[:2] + else: + key_src = key_tgt = key + data[key_tgt] = results[key_src] + + meta = {} + if len(self.meta_keys) != 0: + for key in self.meta_keys: + if isinstance(key, tuple): + assert len(key) == 2 + key_src, key_tgt = key[:2] + else: + key_src = key_tgt = key + meta[key_tgt] = results[key_src] + if 'bbox_id' in results: + meta['bbox_id'] = results['bbox_id'] + data[self.meta_name] = DC(meta, cpu_only=True) + + return data + + def __repr__(self): + """Compute the string representation.""" + return (f'{self.__class__.__name__}(' + f'keys={self.keys}, meta_keys={self.meta_keys})') + + +@PIPELINES.register_module() +class Albumentation: + """Albumentation augmentation (pixel-level transforms only). Adds custom + pixel-level transformations from Albumentations library. Please visit + `https://albumentations.readthedocs.io` to get more information. + + Note: we only support pixel-level transforms. + Please visit `https://github.com/albumentations-team/` + `albumentations#pixel-level-transforms` + to get more information about pixel-level transforms. + + An example of ``transforms`` is as followed: + + .. code-block:: python + + [ + dict( + type='RandomBrightnessContrast', + brightness_limit=[0.1, 0.3], + contrast_limit=[0.1, 0.3], + p=0.2), + dict(type='ChannelShuffle', p=0.1), + dict( + type='OneOf', + transforms=[ + dict(type='Blur', blur_limit=3, p=1.0), + dict(type='MedianBlur', blur_limit=3, p=1.0) + ], + p=0.1), + ] + + Args: + transforms (list[dict]): A list of Albumentation transformations + keymap (dict): Contains {'input key':'albumentation-style key'}, + e.g., {'img': 'image'}. + """ + + def __init__(self, transforms, keymap=None): + if albumentations is None: + raise RuntimeError('albumentations is not installed') + + self.transforms = transforms + self.filter_lost_elements = False + + self.aug = albumentations.Compose( + [self.albu_builder(t) for t in self.transforms]) + + if not keymap: + self.keymap_to_albu = { + 'img': 'image', + } + else: + self.keymap_to_albu = keymap + self.keymap_back = {v: k for k, v in self.keymap_to_albu.items()} + + def albu_builder(self, cfg): + """Import a module from albumentations. + + It resembles some of :func:`build_from_cfg` logic. + + Args: + cfg (dict): Config dict. It should at least contain the key "type". + + Returns: + obj: The constructed object. + """ + + assert isinstance(cfg, dict) and 'type' in cfg + args = cfg.copy() + + obj_type = args.pop('type') + if mmcv.is_str(obj_type): + if albumentations is None: + raise RuntimeError('albumentations is not installed') + if not hasattr(albumentations.augmentations.transforms, obj_type): + warnings.warn('{obj_type} is not pixel-level transformations. ' + 'Please use with caution.') + obj_cls = getattr(albumentations, obj_type) + else: + raise TypeError(f'type must be a str, but got {type(obj_type)}') + + if 'transforms' in args: + args['transforms'] = [ + self.albu_builder(transform) + for transform in args['transforms'] + ] + + return obj_cls(**args) + + @staticmethod + def mapper(d, keymap): + """Dictionary mapper. + + Renames keys according to keymap provided. + + Args: + d (dict): old dict + keymap (dict): {'old_key':'new_key'} + + Returns: + dict: new dict. 
+ """ + + updated_dict = {keymap.get(k, k): v for k, v in d.items()} + return updated_dict + + def __call__(self, results): + # dict to albumentations format + results = self.mapper(results, self.keymap_to_albu) + + results = self.aug(**results) + # back to the original format + results = self.mapper(results, self.keymap_back) + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + f'(transforms={self.transforms})' + return repr_str + + +@PIPELINES.register_module() +class PhotometricDistortion: + """Apply photometric distortion to image sequentially, every transformation + is applied with a probability of 0.5. The position of random contrast is in + second or second to last. + + 1. random brightness + 2. random contrast (mode 0) + 3. convert color from BGR to HSV + 4. random saturation + 5. random hue + 6. convert color from HSV to BGR + 7. random contrast (mode 1) + 8. randomly swap channels + + Args: + brightness_delta (int): delta of brightness. + contrast_range (tuple): range of contrast. + saturation_range (tuple): range of saturation. + hue_delta (int): delta of hue. + """ + + def __init__(self, + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18): + self.brightness_delta = brightness_delta + self.contrast_lower, self.contrast_upper = contrast_range + self.saturation_lower, self.saturation_upper = saturation_range + self.hue_delta = hue_delta + + def convert(self, img, alpha=1, beta=0): + """Multiple with alpha and add beta with clip.""" + img = img.astype(np.float32) * alpha + beta + img = np.clip(img, 0, 255) + return img.astype(np.uint8) + + def brightness(self, img): + """Brightness distortion.""" + if random.randint(2): + return self.convert( + img, + beta=random.uniform(-self.brightness_delta, + self.brightness_delta)) + return img + + def contrast(self, img): + """Contrast distortion.""" + if random.randint(2): + return self.convert( + img, + alpha=random.uniform(self.contrast_lower, self.contrast_upper)) + return img + + def saturation(self, img): + # Apply saturation distortion to hsv-formatted img + img[:, :, 1] = self.convert( + img[:, :, 1], + alpha=random.uniform(self.saturation_lower, self.saturation_upper)) + return img + + def hue(self, img): + # Apply hue distortion to hsv-formatted img + img[:, :, 0] = (img[:, :, 0].astype(int) + + random.randint(-self.hue_delta, self.hue_delta)) % 180 + return img + + def swap_channels(self, img): + # Apply channel swap + if random.randint(2): + img = img[..., random.permutation(3)] + return img + + def __call__(self, results): + """Call function to perform photometric distortion on images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images distorted. 
+ """ + + img = results['img'] + # random brightness + img = self.brightness(img) + + # mode == 0 --> do random contrast first + # mode == 1 --> do random contrast last + mode = random.randint(2) + if mode == 1: + img = self.contrast(img) + + hsv_mode = random.randint(4) + if hsv_mode: + # random saturation/hue distortion + img = mmcv.bgr2hsv(img) + if hsv_mode == 1 or hsv_mode == 3: + img = self.saturation(img) + if hsv_mode == 2 or hsv_mode == 3: + img = self.hue(img) + img = mmcv.hsv2bgr(img) + + # random contrast + if mode == 0: + img = self.contrast(img) + + # randomly swap channels + self.swap_channels(img) + + results['img'] = img + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += (f'(brightness_delta={self.brightness_delta}, ' + f'contrast_range=({self.contrast_lower}, ' + f'{self.contrast_upper}), ' + f'saturation_range=({self.saturation_lower}, ' + f'{self.saturation_upper}), ' + f'hue_delta={self.hue_delta})') + return repr_str + + +@PIPELINES.register_module() +class MultiItemProcess: + """Process each item and merge multi-item results to lists. + + Args: + pipeline (dict): Dictionary to construct pipeline for a single item. + """ + + def __init__(self, pipeline): + self.pipeline = Compose(pipeline) + + def __call__(self, results): + results_ = {} + for idx, result in results.items(): + single_result = self.pipeline(result) + for k, v in single_result.items(): + if k in results_: + results_[k].append(v) + else: + results_[k] = [v] + + return results_ + + +@PIPELINES.register_module() +class DiscardDuplicatedItems: + + def __init__(self, keys_list): + """Discard duplicated single-item results. + + Args: + keys_list (list): List of keys that need to be deduplicate. + """ + self.keys_list = keys_list + + def __call__(self, results): + for k, v in results.items(): + if k in self.keys_list: + assert isinstance(v, Sequence) + results[k] = v[0] + + return results + + +@PIPELINES.register_module() +class MultitaskGatherTarget: + """Gather the targets for multitask heads. + + Args: + pipeline_list (list[list]): List of pipelines for all heads. + pipeline_indices (list[int]): Pipeline index of each head. + """ + + def __init__(self, + pipeline_list, + pipeline_indices=None, + keys=('target', 'target_weight')): + self.keys = keys + self.pipelines = [] + for pipeline in pipeline_list: + self.pipelines.append(Compose(pipeline)) + if pipeline_indices is None: + self.pipeline_indices = list(range(len(pipeline_list))) + else: + self.pipeline_indices = pipeline_indices + + def __call__(self, results): + # generate target and target weights using all pipelines + pipeline_outputs = [] + for pipeline in self.pipelines: + pipeline_output = pipeline(results) + pipeline_outputs.append(pipeline_output.copy()) + + for key in self.keys: + result_key = [] + for ind in self.pipeline_indices: + result_key.append(pipeline_outputs[ind].get(key, None)) + results[key] = result_key + return results + + +@PIPELINES.register_module() +class RenameKeys: + """Rename the keys. + + Args: + key_pairs (Sequence[tuple]): Required keys to be renamed. + If a tuple (key_src, key_tgt) is given as an element, + the item retrieved by key_src will be renamed as key_tgt. 
+ """ + + def __init__(self, key_pairs): + self.key_pairs = key_pairs + + def __call__(self, results): + """Rename keys.""" + for key_pair in self.key_pairs: + assert len(key_pair) == 2 + key_src, key_tgt = key_pair + results[key_tgt] = results.pop(key_src) + return results diff --git a/mmpose/datasets/pipelines/top_down_transform.py b/mmpose/datasets/pipelines/top_down_transform.py new file mode 100644 index 0000000..1af1ea9 --- /dev/null +++ b/mmpose/datasets/pipelines/top_down_transform.py @@ -0,0 +1,736 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import cv2 +import numpy as np + +from mmpose.core.post_processing import (affine_transform, fliplr_joints, + get_affine_transform, get_warp_matrix, + warp_affine_joints) +from mmpose.datasets.builder import PIPELINES + + +@PIPELINES.register_module() +class TopDownRandomFlip: + """Data augmentation with random image flip. + + Required keys: 'img', 'joints_3d', 'joints_3d_visible', 'center' and + 'ann_info'. + + Modifies key: 'img', 'joints_3d', 'joints_3d_visible', 'center' and + 'flipped'. + + Args: + flip (bool): Option to perform random flip. + flip_prob (float): Probability of flip. + """ + + def __init__(self, flip_prob=0.5): + self.flip_prob = flip_prob + + def __call__(self, results): + """Perform data augmentation with random image flip.""" + img = results['img'] + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + center = results['center'] + + # A flag indicating whether the image is flipped, + # which can be used by child class. + flipped = False + if np.random.rand() <= self.flip_prob: + flipped = True + if not isinstance(img, list): + img = img[:, ::-1, :] + else: + img = [i[:, ::-1, :] for i in img] + if not isinstance(img, list): + joints_3d, joints_3d_visible = fliplr_joints( + joints_3d, joints_3d_visible, img.shape[1], + results['ann_info']['flip_pairs']) + center[0] = img.shape[1] - center[0] - 1 + else: + joints_3d, joints_3d_visible = fliplr_joints( + joints_3d, joints_3d_visible, img[0].shape[1], + results['ann_info']['flip_pairs']) + center[0] = img[0].shape[1] - center[0] - 1 + + results['img'] = img + results['joints_3d'] = joints_3d + results['joints_3d_visible'] = joints_3d_visible + results['center'] = center + results['flipped'] = flipped + + return results + + +@PIPELINES.register_module() +class TopDownHalfBodyTransform: + """Data augmentation with half-body transform. Keep only the upper body or + the lower body at random. + + Required keys: 'joints_3d', 'joints_3d_visible', and 'ann_info'. + + Modifies key: 'scale' and 'center'. + + Args: + num_joints_half_body (int): Threshold of performing + half-body transform. If the body has fewer number + of joints (< num_joints_half_body), ignore this step. + prob_half_body (float): Probability of half-body transform. 
+ """ + + def __init__(self, num_joints_half_body=8, prob_half_body=0.3): + self.num_joints_half_body = num_joints_half_body + self.prob_half_body = prob_half_body + + @staticmethod + def half_body_transform(cfg, joints_3d, joints_3d_visible): + """Get center&scale for half-body transform.""" + upper_joints = [] + lower_joints = [] + for joint_id in range(cfg['num_joints']): + if joints_3d_visible[joint_id][0] > 0: + if joint_id in cfg['upper_body_ids']: + upper_joints.append(joints_3d[joint_id]) + else: + lower_joints.append(joints_3d[joint_id]) + + if np.random.randn() < 0.5 and len(upper_joints) > 2: + selected_joints = upper_joints + elif len(lower_joints) > 2: + selected_joints = lower_joints + else: + selected_joints = upper_joints + + if len(selected_joints) < 2: + return None, None + + selected_joints = np.array(selected_joints, dtype=np.float32) + center = selected_joints.mean(axis=0)[:2] + + left_top = np.amin(selected_joints, axis=0) + + right_bottom = np.amax(selected_joints, axis=0) + + w = right_bottom[0] - left_top[0] + h = right_bottom[1] - left_top[1] + + aspect_ratio = cfg['image_size'][0] / cfg['image_size'][1] + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + scale = scale * 1.5 + return center, scale + + def __call__(self, results): + """Perform data augmentation with half-body transform.""" + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + + if (np.sum(joints_3d_visible[:, 0]) > self.num_joints_half_body + and np.random.rand() < self.prob_half_body): + + c_half_body, s_half_body = self.half_body_transform( + results['ann_info'], joints_3d, joints_3d_visible) + + if c_half_body is not None and s_half_body is not None: + results['center'] = c_half_body + results['scale'] = s_half_body + + return results + + +@PIPELINES.register_module() +class TopDownGetRandomScaleRotation: + """Data augmentation with random scaling & rotating. + + Required key: 'scale'. + + Modifies key: 'scale' and 'rotation'. + + Args: + rot_factor (int): Rotating to ``[-2*rot_factor, 2*rot_factor]``. + scale_factor (float): Scaling to ``[1-scale_factor, 1+scale_factor]``. + rot_prob (float): Probability of random rotation. + """ + + def __init__(self, rot_factor=40, scale_factor=0.5, rot_prob=0.6): + self.rot_factor = rot_factor + self.scale_factor = scale_factor + self.rot_prob = rot_prob + + def __call__(self, results): + """Perform data augmentation with random scaling & rotating.""" + s = results['scale'] + + sf = self.scale_factor + rf = self.rot_factor + + s_factor = np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) + s = s * s_factor + + r_factor = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) + r = r_factor if np.random.rand() <= self.rot_prob else 0 + + results['scale'] = s + results['rotation'] = r + + return results + + +@PIPELINES.register_module() +class TopDownAffine: + """Affine transform the image to make input. + + Required keys:'img', 'joints_3d', 'joints_3d_visible', 'ann_info','scale', + 'rotation' and 'center'. + + Modified keys:'img', 'joints_3d', and 'joints_3d_visible'. + + Args: + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). 
+ """ + + def __init__(self, use_udp=False): + self.use_udp = use_udp + + def __call__(self, results): + image_size = results['ann_info']['image_size'] + + img = results['img'] + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + c = results['center'] + s = results['scale'] + r = results['rotation'] + + if self.use_udp: + trans = get_warp_matrix(r, c * 2.0, image_size - 1.0, s * 200.0) + if not isinstance(img, list): + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + else: + img = [ + cv2.warpAffine( + i, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) for i in img + ] + + joints_3d[:, 0:2] = \ + warp_affine_joints(joints_3d[:, 0:2].copy(), trans) + + else: + trans = get_affine_transform(c, s, r, image_size) + if not isinstance(img, list): + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + else: + img = [ + cv2.warpAffine( + i, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) for i in img + ] + for i in range(results['ann_info']['num_joints']): + if joints_3d_visible[i, 0] > 0.0: + joints_3d[i, + 0:2] = affine_transform(joints_3d[i, 0:2], trans) + + results['img'] = img + results['joints_3d'] = joints_3d + results['joints_3d_visible'] = joints_3d_visible + + return results + + +@PIPELINES.register_module() +class TopDownGenerateTarget: + """Generate the target heatmap. + + Required keys: 'joints_3d', 'joints_3d_visible', 'ann_info'. + + Modified keys: 'target', and 'target_weight'. + + Args: + sigma: Sigma of heatmap gaussian for 'MSRA' approach. + kernel: Kernel of heatmap gaussian for 'Megvii' approach. + encoding (str): Approach to generate target heatmaps. + Currently supported approaches: 'MSRA', 'Megvii', 'UDP'. + Default:'MSRA' + unbiased_encoding (bool): Option to use unbiased + encoding methods. + Paper ref: Zhang et al. Distribution-Aware Coordinate + Representation for Human Pose Estimation (CVPR 2020). + keypoint_pose_distance: Keypoint pose distance for UDP. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + target_type (str): supported targets: 'GaussianHeatmap', + 'CombinedTarget'. Default:'GaussianHeatmap' + CombinedTarget: The combination of classification target + (response map) and regression target (offset map). + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + """ + + def __init__(self, + sigma=2, + kernel=(11, 11), + valid_radius_factor=0.0546875, + target_type='GaussianHeatmap', + encoding='MSRA', + unbiased_encoding=False): + self.sigma = sigma + self.unbiased_encoding = unbiased_encoding + self.kernel = kernel + self.valid_radius_factor = valid_radius_factor + self.target_type = target_type + self.encoding = encoding + + def _msra_generate_target(self, cfg, joints_3d, joints_3d_visible, sigma): + """Generate the target heatmap via "MSRA" approach. + + Args: + cfg (dict): data config + joints_3d: np.ndarray ([num_joints, 3]) + joints_3d_visible: np.ndarray ([num_joints, 3]) + sigma: Sigma of heatmap gaussian + Returns: + tuple: A tuple containing targets. + + - target: Target heatmaps. 
+ - target_weight: (1: visible, 0: invisible) + """ + num_joints = cfg['num_joints'] + image_size = cfg['image_size'] + W, H = cfg['heatmap_size'] + joint_weights = cfg['joint_weights'] + use_different_joint_weights = cfg['use_different_joint_weights'] + + target_weight = np.zeros((num_joints, 1), dtype=np.float32) + target = np.zeros((num_joints, H, W), dtype=np.float32) + + # 3-sigma rule + tmp_size = sigma * 3 + + if self.unbiased_encoding: + for joint_id in range(num_joints): + target_weight[joint_id] = joints_3d_visible[joint_id, 0] + + feat_stride = image_size / [W, H] + mu_x = joints_3d[joint_id][0] / feat_stride[0] + mu_y = joints_3d[joint_id][1] / feat_stride[1] + # Check that any part of the gaussian is in-bounds + ul = [mu_x - tmp_size, mu_y - tmp_size] + br = [mu_x + tmp_size + 1, mu_y + tmp_size + 1] + if ul[0] >= W or ul[1] >= H or br[0] < 0 or br[1] < 0: + target_weight[joint_id] = 0 + + if target_weight[joint_id] == 0: + continue + + x = np.arange(0, W, 1, np.float32) + y = np.arange(0, H, 1, np.float32) + y = y[:, None] + + if target_weight[joint_id] > 0.5: + target[joint_id] = np.exp(-((x - mu_x)**2 + + (y - mu_y)**2) / + (2 * sigma**2)) + else: + for joint_id in range(num_joints): + target_weight[joint_id] = joints_3d_visible[joint_id, 0] + + feat_stride = image_size / [W, H] + mu_x = int(joints_3d[joint_id][0] / feat_stride[0] + 0.5) + mu_y = int(joints_3d[joint_id][1] / feat_stride[1] + 0.5) + # Check that any part of the gaussian is in-bounds + ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] + br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] + if ul[0] >= W or ul[1] >= H or br[0] < 0 or br[1] < 0: + target_weight[joint_id] = 0 + + if target_weight[joint_id] > 0.5: + size = 2 * tmp_size + 1 + x = np.arange(0, size, 1, np.float32) + y = x[:, None] + x0 = y0 = size // 2 + # The gaussian is not normalized, + # we want the center value to equal 1 + g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)) + + # Usable gaussian range + g_x = max(0, -ul[0]), min(br[0], W) - ul[0] + g_y = max(0, -ul[1]), min(br[1], H) - ul[1] + # Image range + img_x = max(0, ul[0]), min(br[0], W) + img_y = max(0, ul[1]), min(br[1], H) + + target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \ + g[g_y[0]:g_y[1], g_x[0]:g_x[1]] + + if use_different_joint_weights: + target_weight = np.multiply(target_weight, joint_weights) + + return target, target_weight + + def _megvii_generate_target(self, cfg, joints_3d, joints_3d_visible, + kernel): + """Generate the target heatmap via "Megvii" approach. + + Args: + cfg (dict): data config + joints_3d: np.ndarray ([num_joints, 3]) + joints_3d_visible: np.ndarray ([num_joints, 3]) + kernel: Kernel of heatmap gaussian + + Returns: + tuple: A tuple containing targets. + + - target: Target heatmaps. 
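A standalone NumPy sketch of the (biased) MSRA branch above for a single joint: the joint is quantised to the heatmap grid, then an unnormalised Gaussian patch whose centre value is 1 is pasted in, cropped at the heatmap borders.

import numpy as np

W, H, sigma = 48, 64, 2
image_size = np.array([192, 256])
joint = np.array([100.0, 120.0])

feat_stride = image_size / [W, H]
mu_x = int(joint[0] / feat_stride[0] + 0.5)
mu_y = int(joint[1] / feat_stride[1] + 0.5)

tmp_size = sigma * 3
size = 2 * tmp_size + 1
x = np.arange(size, dtype=np.float32)
y = x[:, None]
x0 = y0 = size // 2
g = np.exp(-((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2))

heatmap = np.zeros((H, W), dtype=np.float32)
ul = [mu_x - tmp_size, mu_y - tmp_size]
br = [mu_x + tmp_size + 1, mu_y + tmp_size + 1]
g_x = max(0, -ul[0]), min(br[0], W) - ul[0]
g_y = max(0, -ul[1]), min(br[1], H) - ul[1]
img_x = max(0, ul[0]), min(br[0], W)
img_y = max(0, ul[1]), min(br[1], H)
heatmap[img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[0]:g_y[1], g_x[0]:g_x[1]]
print(heatmap.max(), heatmap.argmax() // W, heatmap.argmax() % W)  # 1.0 at (30, 25)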
+ - target_weight: (1: visible, 0: invisible) + """ + + num_joints = cfg['num_joints'] + image_size = cfg['image_size'] + W, H = cfg['heatmap_size'] + heatmaps = np.zeros((num_joints, H, W), dtype='float32') + target_weight = np.zeros((num_joints, 1), dtype=np.float32) + + for i in range(num_joints): + target_weight[i] = joints_3d_visible[i, 0] + + if target_weight[i] < 1: + continue + + target_y = int(joints_3d[i, 1] * H / image_size[1]) + target_x = int(joints_3d[i, 0] * W / image_size[0]) + + if (target_x >= W or target_x < 0) \ + or (target_y >= H or target_y < 0): + target_weight[i] = 0 + continue + + heatmaps[i, target_y, target_x] = 1 + heatmaps[i] = cv2.GaussianBlur(heatmaps[i], kernel, 0) + maxi = heatmaps[i, target_y, target_x] + + heatmaps[i] /= maxi / 255 + + return heatmaps, target_weight + + def _udp_generate_target(self, cfg, joints_3d, joints_3d_visible, factor, + target_type): + """Generate the target heatmap via 'UDP' approach. Paper ref: Huang et + al. The Devil is in the Details: Delving into Unbiased Data Processing + for Human Pose Estimation (CVPR 2020). + + Note: + - num keypoints: K + - heatmap height: H + - heatmap width: W + - num target channels: C + - C = K if target_type=='GaussianHeatmap' + - C = 3*K if target_type=='CombinedTarget' + + Args: + cfg (dict): data config + joints_3d (np.ndarray[K, 3]): Annotated keypoints. + joints_3d_visible (np.ndarray[K, 3]): Visibility of keypoints. + factor (float): kernel factor for GaussianHeatmap target or + valid radius factor for CombinedTarget. + target_type (str): 'GaussianHeatmap' or 'CombinedTarget'. + GaussianHeatmap: Heatmap target with gaussian distribution. + CombinedTarget: The combination of classification target + (response map) and regression target (offset map). + + Returns: + tuple: A tuple containing targets. + + - target (np.ndarray[C, H, W]): Target heatmaps. 
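A short NumPy/OpenCV sketch of the Megvii branch above: a single peak is placed at the quantised keypoint, blurred, and rescaled so the keypoint pixel equals 255 (so these heatmaps are not in [0, 1], unlike the MSRA/UDP encodings).

import cv2
import numpy as np

H, W, kernel = 64, 48, (11, 11)
target_x, target_y = 25, 30

heatmap = np.zeros((H, W), dtype='float32')
heatmap[target_y, target_x] = 1
heatmap = cv2.GaussianBlur(heatmap, kernel, 0)
heatmap /= heatmap[target_y, target_x] / 255
print(heatmap[target_y, target_x])   # 255.0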
+ - target_weight (np.ndarray[K, 1]): (1: visible, 0: invisible) + """ + num_joints = cfg['num_joints'] + image_size = cfg['image_size'] + heatmap_size = cfg['heatmap_size'] + joint_weights = cfg['joint_weights'] + use_different_joint_weights = cfg['use_different_joint_weights'] + + target_weight = np.ones((num_joints, 1), dtype=np.float32) + target_weight[:, 0] = joints_3d_visible[:, 0] + + if target_type.lower() == 'GaussianHeatmap'.lower(): + target = np.zeros((num_joints, heatmap_size[1], heatmap_size[0]), + dtype=np.float32) + + tmp_size = factor * 3 + + # prepare for gaussian + size = 2 * tmp_size + 1 + x = np.arange(0, size, 1, np.float32) + y = x[:, None] + + for joint_id in range(num_joints): + feat_stride = (image_size - 1.0) / (heatmap_size - 1.0) + mu_x = int(joints_3d[joint_id][0] / feat_stride[0] + 0.5) + mu_y = int(joints_3d[joint_id][1] / feat_stride[1] + 0.5) + # Check that any part of the gaussian is in-bounds + ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] + br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] + if ul[0] >= heatmap_size[0] or ul[1] >= heatmap_size[1] \ + or br[0] < 0 or br[1] < 0: + # If not, just return the image as is + target_weight[joint_id] = 0 + continue + + # # Generate gaussian + mu_x_ac = joints_3d[joint_id][0] / feat_stride[0] + mu_y_ac = joints_3d[joint_id][1] / feat_stride[1] + x0 = y0 = size // 2 + x0 += mu_x_ac - mu_x + y0 += mu_y_ac - mu_y + g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * factor**2)) + + # Usable gaussian range + g_x = max(0, -ul[0]), min(br[0], heatmap_size[0]) - ul[0] + g_y = max(0, -ul[1]), min(br[1], heatmap_size[1]) - ul[1] + # Image range + img_x = max(0, ul[0]), min(br[0], heatmap_size[0]) + img_y = max(0, ul[1]), min(br[1], heatmap_size[1]) + + v = target_weight[joint_id] + if v > 0.5: + target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \ + g[g_y[0]:g_y[1], g_x[0]:g_x[1]] + + elif target_type.lower() == 'CombinedTarget'.lower(): + target = np.zeros( + (num_joints, 3, heatmap_size[1] * heatmap_size[0]), + dtype=np.float32) + feat_width = heatmap_size[0] + feat_height = heatmap_size[1] + feat_x_int = np.arange(0, feat_width) + feat_y_int = np.arange(0, feat_height) + feat_x_int, feat_y_int = np.meshgrid(feat_x_int, feat_y_int) + feat_x_int = feat_x_int.flatten() + feat_y_int = feat_y_int.flatten() + # Calculate the radius of the positive area in classification + # heatmap. 
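A quick numeric illustration of why the UDP branch above uses (image_size - 1) / (heatmap_size - 1) as the stride: with a 192x256 input and a 48x64 heatmap, the unbiased stride maps the last input pixel exactly onto the last heatmap cell, while the naive stride leaves a sub-pixel offset.

import numpy as np

image_size = np.array([192.0, 256.0])
heatmap_size = np.array([48.0, 64.0])

naive_stride = image_size / heatmap_size                 # [4., 4.]
udp_stride = (image_size - 1.0) / (heatmap_size - 1.0)   # [~4.064, ~4.048]

last_pixel = image_size - 1.0                            # [191., 255.]
print(last_pixel / naive_stride)   # [47.75 63.75] -> misses the last cell
print(last_pixel / udp_stride)     # [47.   63.  ] -> lands on the last cell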
+ valid_radius = factor * heatmap_size[1] + feat_stride = (image_size - 1.0) / (heatmap_size - 1.0) + for joint_id in range(num_joints): + mu_x = joints_3d[joint_id][0] / feat_stride[0] + mu_y = joints_3d[joint_id][1] / feat_stride[1] + x_offset = (mu_x - feat_x_int) / valid_radius + y_offset = (mu_y - feat_y_int) / valid_radius + dis = x_offset**2 + y_offset**2 + keep_pos = np.where(dis <= 1)[0] + v = target_weight[joint_id] + if v > 0.5: + target[joint_id, 0, keep_pos] = 1 + target[joint_id, 1, keep_pos] = x_offset[keep_pos] + target[joint_id, 2, keep_pos] = y_offset[keep_pos] + target = target.reshape(num_joints * 3, heatmap_size[1], + heatmap_size[0]) + else: + raise ValueError('target_type should be either ' + "'GaussianHeatmap' or 'CombinedTarget'") + + if use_different_joint_weights: + target_weight = np.multiply(target_weight, joint_weights) + + return target, target_weight + + def __call__(self, results): + """Generate the target heatmap.""" + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + + assert self.encoding in ['MSRA', 'Megvii', 'UDP'] + + if self.encoding == 'MSRA': + if isinstance(self.sigma, list): + num_sigmas = len(self.sigma) + cfg = results['ann_info'] + num_joints = cfg['num_joints'] + heatmap_size = cfg['heatmap_size'] + + target = np.empty( + (0, num_joints, heatmap_size[1], heatmap_size[0]), + dtype=np.float32) + target_weight = np.empty((0, num_joints, 1), dtype=np.float32) + for i in range(num_sigmas): + target_i, target_weight_i = self._msra_generate_target( + cfg, joints_3d, joints_3d_visible, self.sigma[i]) + target = np.concatenate([target, target_i[None]], axis=0) + target_weight = np.concatenate( + [target_weight, target_weight_i[None]], axis=0) + else: + target, target_weight = self._msra_generate_target( + results['ann_info'], joints_3d, joints_3d_visible, + self.sigma) + + elif self.encoding == 'Megvii': + if isinstance(self.kernel, list): + num_kernels = len(self.kernel) + cfg = results['ann_info'] + num_joints = cfg['num_joints'] + W, H = cfg['heatmap_size'] + + target = np.empty((0, num_joints, H, W), dtype=np.float32) + target_weight = np.empty((0, num_joints, 1), dtype=np.float32) + for i in range(num_kernels): + target_i, target_weight_i = self._megvii_generate_target( + cfg, joints_3d, joints_3d_visible, self.kernel[i]) + target = np.concatenate([target, target_i[None]], axis=0) + target_weight = np.concatenate( + [target_weight, target_weight_i[None]], axis=0) + else: + target, target_weight = self._megvii_generate_target( + results['ann_info'], joints_3d, joints_3d_visible, + self.kernel) + + elif self.encoding == 'UDP': + if self.target_type.lower() == 'CombinedTarget'.lower(): + factors = self.valid_radius_factor + channel_factor = 3 + elif self.target_type.lower() == 'GaussianHeatmap'.lower(): + factors = self.sigma + channel_factor = 1 + else: + raise ValueError('target_type should be either ' + "'GaussianHeatmap' or 'CombinedTarget'") + if isinstance(factors, list): + num_factors = len(factors) + cfg = results['ann_info'] + num_joints = cfg['num_joints'] + W, H = cfg['heatmap_size'] + + target = np.empty((0, channel_factor * num_joints, H, W), + dtype=np.float32) + target_weight = np.empty((0, num_joints, 1), dtype=np.float32) + for i in range(num_factors): + target_i, target_weight_i = self._udp_generate_target( + cfg, joints_3d, joints_3d_visible, factors[i], + self.target_type) + target = np.concatenate([target, target_i[None]], axis=0) + target_weight = np.concatenate( + [target_weight, 
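A hedged config sketch of the multi-scale behaviour handled in __call__ below: when sigma is a list, the MSRA branch stacks one heatmap set per sigma, so the target gains a leading dimension of size len(sigma). Joint count and heatmap size here are the usual COCO-style values, assumed for illustration.

target_generator = dict(
    type='TopDownGenerateTarget',
    sigma=[2, 3],          # two scales -> leading dimension of size 2
    encoding='MSRA')

# With 17 joints and a 48x64 heatmap this yields
#   results['target'].shape        == (2, 17, 64, 48)
#   results['target_weight'].shape == (2, 17, 1)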
target_weight_i[None]], axis=0) + else: + target, target_weight = self._udp_generate_target( + results['ann_info'], joints_3d, joints_3d_visible, factors, + self.target_type) + else: + raise ValueError( + f'Encoding approach {self.encoding} is not supported!') + + if results['ann_info'].get('max_num_joints', None) is not None: + W, H = results['ann_info']['heatmap_size'] + padded_length = int(results['ann_info'].get('max_num_joints') - results['ann_info'].get('num_joints')) + target_weight = np.concatenate([target_weight, np.zeros((padded_length, 1), dtype=np.float32)], 0) + target = np.concatenate([target, np.zeros((padded_length, H, W), dtype=np.float32)], 0) + + results['target'] = target + results['target_weight'] = target_weight + + results['dataset_idx'] = results['ann_info'].get('dataset_idx', 0) + + return results + + +@PIPELINES.register_module() +class TopDownGenerateTargetRegression: + """Generate the target regression vector (coordinates). + + Required keys: 'joints_3d', 'joints_3d_visible', 'ann_info'. Modified keys: + 'target', and 'target_weight'. + """ + + def __init__(self): + pass + + def _generate_target(self, cfg, joints_3d, joints_3d_visible): + """Generate the target regression vector. + + Args: + cfg (dict): data config + joints_3d: np.ndarray([num_joints, 3]) + joints_3d_visible: np.ndarray([num_joints, 3]) + + Returns: + target, target_weight(1: visible, 0: invisible) + """ + image_size = cfg['image_size'] + joint_weights = cfg['joint_weights'] + use_different_joint_weights = cfg['use_different_joint_weights'] + + mask = (joints_3d[:, 0] >= 0) * ( + joints_3d[:, 0] <= image_size[0] - 1) * (joints_3d[:, 1] >= 0) * ( + joints_3d[:, 1] <= image_size[1] - 1) + + target = joints_3d[:, :2] / image_size + + target = target.astype(np.float32) + target_weight = joints_3d_visible[:, :2] * mask[:, None] + + if use_different_joint_weights: + target_weight = np.multiply(target_weight, joint_weights) + + return target, target_weight + + def __call__(self, results): + """Generate the target heatmap.""" + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + + target, target_weight = self._generate_target(results['ann_info'], + joints_3d, + joints_3d_visible) + + results['target'] = target + results['target_weight'] = target_weight + + return results + + +@PIPELINES.register_module() +class TopDownRandomTranslation: + """Data augmentation with random translation. + + Required key: 'scale' and 'center'. + + Modifies key: 'center'. + + Note: + - bbox height: H + - bbox width: W + + Args: + trans_factor (float): Translating center to + ``[-trans_factor, trans_factor] * [W, H] + center``. + trans_prob (float): Probability of random translation. + """ + + def __init__(self, trans_factor=0.15, trans_prob=1.0): + self.trans_factor = trans_factor + self.trans_prob = trans_prob + + def __call__(self, results): + """Perform data augmentation with random translation.""" + center = results['center'] + scale = results['scale'] + if np.random.rand() <= self.trans_prob: + # reference bbox size is [200, 200] pixels + center += self.trans_factor * np.random.uniform( + -1, 1, size=2) * scale * 200 + results['center'] = center + return results diff --git a/mmpose/datasets/registry.py b/mmpose/datasets/registry.py new file mode 100644 index 0000000..ba3cc49 --- /dev/null +++ b/mmpose/datasets/registry.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
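A standalone check of the arithmetic in TopDownGenerateTargetRegression above: coordinates are divided by the input size, and joints falling outside the image get zero weight via the mask.

import numpy as np

image_size = np.array([192, 256])
joints = np.array([[96.0, 128.0, 0.0], [300.0, 10.0, 0.0]])   # 2nd is out of bounds
visible = np.array([[1, 1, 0], [1, 1, 0]], dtype=np.float32)

mask = (joints[:, 0] >= 0) * (joints[:, 0] <= image_size[0] - 1) * \
       (joints[:, 1] >= 0) * (joints[:, 1] <= image_size[1] - 1)
target = (joints[:, :2] / image_size).astype(np.float32)
target_weight = visible[:, :2] * mask[:, None]
print(target)          # [[0.5 0.5], [1.5625 0.0390625]]
print(target_weight)   # [[1. 1.], [0. 0.]]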
+import warnings + +from .builder import DATASETS, PIPELINES + +__all__ = ['DATASETS', 'PIPELINES'] + +warnings.simplefilter('once', DeprecationWarning) +warnings.warn( + 'Registries (DATASETS, PIPELINES) have been moved to ' + 'mmpose.datasets.builder. Importing from ' + 'mmpose.models.registry will be deprecated in the future.', + DeprecationWarning) diff --git a/mmpose/datasets/samplers/__init__.py b/mmpose/datasets/samplers/__init__.py new file mode 100644 index 0000000..da09eff --- /dev/null +++ b/mmpose/datasets/samplers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .distributed_sampler import DistributedSampler + +__all__ = ['DistributedSampler'] diff --git a/mmpose/datasets/samplers/distributed_sampler.py b/mmpose/datasets/samplers/distributed_sampler.py new file mode 100644 index 0000000..bcb5f52 --- /dev/null +++ b/mmpose/datasets/samplers/distributed_sampler.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch.utils.data import DistributedSampler as _DistributedSampler + + +class DistributedSampler(_DistributedSampler): + """DistributedSampler inheriting from + `torch.utils.data.DistributedSampler`. + + In pytorch of lower versions, there is no `shuffle` argument. This child + class will port one to DistributedSampler. + """ + + def __init__(self, + dataset, + num_replicas=None, + rank=None, + shuffle=True, + seed=0): + super().__init__( + dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) + # for the compatibility from PyTorch 1.3+ + self.seed = seed if seed is not None else 0 + + def __iter__(self): + """Deterministically shuffle based on epoch.""" + if self.shuffle: + g = torch.Generator() + g.manual_seed(self.epoch + self.seed) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank:self.total_size:self.num_replicas] + assert len(indices) == self.num_samples + return iter(indices) diff --git a/mmpose/deprecated.py b/mmpose/deprecated.py new file mode 100644 index 0000000..b930901 --- /dev/null +++ b/mmpose/deprecated.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
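A usage sketch for the DistributedSampler defined above, assuming mmpose is installed; num_replicas and rank are passed explicitly so no process group is required for this illustration, and the dataset is a stand-in.

import torch
from torch.utils.data import DataLoader, TensorDataset
from mmpose.datasets.samplers import DistributedSampler

dataset = TensorDataset(torch.arange(100))
sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True, seed=42)
loader = DataLoader(dataset, batch_size=8, sampler=sampler)

for epoch in range(2):
    sampler.set_epoch(epoch)   # re-seeds the shuffle so epochs differ
    for (batch,) in loader:
        pass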
+import warnings + +from .datasets.builder import DATASETS +from .datasets.datasets.base import Kpt2dSviewRgbImgTopDownDataset +from .models.builder import HEADS, POSENETS +from .models.detectors import AssociativeEmbedding +from .models.heads import (AEHigherResolutionHead, AESimpleHead, + DeepposeRegressionHead, HMRMeshHead, + TopdownHeatmapMSMUHead, + TopdownHeatmapMultiStageHead, + TopdownHeatmapSimpleHead) + + +@DATASETS.register_module() +class TopDownFreiHandDataset(Kpt2dSviewRgbImgTopDownDataset): + """Deprecated TopDownFreiHandDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'TopDownFreiHandDataset has been renamed into FreiHandDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/202 for details.') + ) + + def _get_db(self): + return [] + + def evaluate(self, cfg, preds, output_dir, *args, **kwargs): + return None + + +@DATASETS.register_module() +class TopDownOneHand10KDataset(Kpt2dSviewRgbImgTopDownDataset): + """Deprecated TopDownOneHand10KDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'TopDownOneHand10KDataset has been renamed into OneHand10KDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/202 for details.') + ) + + def _get_db(self): + return [] + + def evaluate(self, cfg, preds, output_dir, *args, **kwargs): + return None + + +@DATASETS.register_module() +class TopDownPanopticDataset(Kpt2dSviewRgbImgTopDownDataset): + """Deprecated TopDownPanopticDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'TopDownPanopticDataset has been renamed into PanopticDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/202 for details.') + ) + + def _get_db(self): + return [] + + def evaluate(self, cfg, preds, output_dir, *args, **kwargs): + return None + + +@HEADS.register_module() +class BottomUpHigherResolutionHead(AEHigherResolutionHead): + """Bottom-up head for Higher Resolution. + + BottomUpHigherResolutionHead has been renamed into AEHigherResolutionHead, + check https://github.com/open- mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'BottomUpHigherResolutionHead has been renamed into ' + 'AEHigherResolutionHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@HEADS.register_module() +class BottomUpSimpleHead(AESimpleHead): + """Bottom-up simple head. + + BottomUpSimpleHead has been renamed into AESimpleHead, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'BottomUpHigherResolutionHead has been renamed into ' + 'AEHigherResolutionHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details', + DeprecationWarning) + + +@HEADS.register_module() +class TopDownSimpleHead(TopdownHeatmapSimpleHead): + """Top-down heatmap simple head. + + TopDownSimpleHead has been renamed into TopdownHeatmapSimpleHead, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'TopDownSimpleHead has been renamed into ' + 'TopdownHeatmapSimpleHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@HEADS.register_module() +class TopDownMultiStageHead(TopdownHeatmapMultiStageHead): + """Top-down heatmap multi-stage head. 
+ + TopDownMultiStageHead has been renamed into TopdownHeatmapMultiStageHead, + check https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'TopDownMultiStageHead has been renamed into ' + 'TopdownHeatmapMultiStageHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@HEADS.register_module() +class TopDownMSMUHead(TopdownHeatmapMSMUHead): + """Heads for multi-stage multi-unit heads. + + TopDownMSMUHead has been renamed into TopdownHeatmapMSMUHead, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'TopDownMSMUHead has been renamed into ' + 'TopdownHeatmapMSMUHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@HEADS.register_module() +class MeshHMRHead(HMRMeshHead): + """SMPL parameters regressor head. + + MeshHMRHead has been renamed into HMRMeshHead, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'MeshHMRHead has been renamed into ' + 'HMRMeshHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@HEADS.register_module() +class FcHead(DeepposeRegressionHead): + """FcHead (deprecated). + + FcHead has been renamed into DeepposeRegressionHead, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'FcHead has been renamed into ' + 'DeepposeRegressionHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@POSENETS.register_module() +class BottomUp(AssociativeEmbedding): + """Associative Embedding. + + BottomUp has been renamed into AssociativeEmbedding, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'BottomUp has been renamed into ' + 'AssociativeEmbedding, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) diff --git a/mmpose/models/__init__.py b/mmpose/models/__init__.py new file mode 100644 index 0000000..dbec55e --- /dev/null +++ b/mmpose/models/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .backbones import * # noqa +from .builder import (BACKBONES, HEADS, LOSSES, MESH_MODELS, NECKS, POSENETS, + build_backbone, build_head, build_loss, build_mesh_model, + build_neck, build_posenet) +from .detectors import * # noqa +from .heads import * # noqa +from .losses import * # noqa +from .necks import * # noqa +from .utils import * # noqa + +__all__ = [ + 'BACKBONES', 'HEADS', 'NECKS', 'LOSSES', 'POSENETS', 'MESH_MODELS', + 'build_backbone', 'build_head', 'build_loss', 'build_posenet', + 'build_neck', 'build_mesh_model' +] diff --git a/mmpose/models/backbones/__init__.py b/mmpose/models/backbones/__init__.py new file mode 100644 index 0000000..2b8efcf --- /dev/null +++ b/mmpose/models/backbones/__init__.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .alexnet import AlexNet +from .cpm import CPM +from .hourglass import HourglassNet +from .hourglass_ae import HourglassAENet +from .hrformer import HRFormer +from .hrnet import HRNet +from .litehrnet import LiteHRNet +from .mobilenet_v2 import MobileNetV2 +from .mobilenet_v3 import MobileNetV3 +from .mspn import MSPN +from .regnet import RegNet +from .resnest import ResNeSt +from .resnet import ResNet, ResNetV1d +from .resnext import ResNeXt +from .rsn import RSN +from .scnet import SCNet +from .seresnet import SEResNet +from .seresnext import SEResNeXt +from .shufflenet_v1 import ShuffleNetV1 +from .shufflenet_v2 import ShuffleNetV2 +from .tcn import TCN +from .v2v_net import V2VNet +from .vgg import VGG +from .vipnas_mbv3 import ViPNAS_MobileNetV3 +from .vipnas_resnet import ViPNAS_ResNet +from .vit import ViT +from .vit_moe import ViTMoE + +__all__ = [ + 'AlexNet', 'HourglassNet', 'HourglassAENet', 'HRNet', 'MobileNetV2', + 'MobileNetV3', 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SCNet', + 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN', + 'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3', + 'LiteHRNet', 'V2VNet', 'HRFormer', 'ViT', 'ViTMoE' +] diff --git a/mmpose/models/backbones/alexnet.py b/mmpose/models/backbones/alexnet.py new file mode 100644 index 0000000..a8efd74 --- /dev/null +++ b/mmpose/models/backbones/alexnet.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +@BACKBONES.register_module() +class AlexNet(BaseBackbone): + """`AlexNet `__ backbone. + + The input for AlexNet is a 224x224 RGB image. + + Args: + num_classes (int): number of classes for classification. + The default value is -1, which uses the backbone as + a feature extractor without the top classifier. + """ + + def __init__(self, num_classes=-1): + super().__init__() + self.num_classes = num_classes + self.features = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(64, 192, kernel_size=5, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(192, 384, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(384, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + ) + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Dropout(), + nn.Linear(256 * 6 * 6, 4096), + nn.ReLU(inplace=True), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(inplace=True), + nn.Linear(4096, num_classes), + ) + + def forward(self, x): + + x = self.features(x) + if self.num_classes > 0: + x = x.view(x.size(0), 256 * 6 * 6) + x = self.classifier(x) + + return x diff --git a/mmpose/models/backbones/base_backbone.py b/mmpose/models/backbones/base_backbone.py new file mode 100644 index 0000000..d64dca1 --- /dev/null +++ b/mmpose/models/backbones/base_backbone.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +from abc import ABCMeta, abstractmethod + +import torch.nn as nn + +# from .utils import load_checkpoint +from mmcv_custom.checkpoint import load_checkpoint + +class BaseBackbone(nn.Module, metaclass=ABCMeta): + """Base backbone. + + This class defines the basic functions of a backbone. 
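A quick shape check for the AlexNet backbone above when used as a feature extractor (num_classes=-1, the default, so only the convolutional features are applied); assumes mmpose is importable.

import torch
from mmpose.models import AlexNet

model = AlexNet(num_classes=-1)
model.eval()
with torch.no_grad():
    out = model(torch.rand(1, 3, 224, 224))
print(out.shape)   # torch.Size([1, 256, 6, 6])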
Any backbone that + inherits this class should at least define its own `forward` function. + """ + + def init_weights(self, pretrained=None, patch_padding='pad', part_features=None): + """Init backbone weights. + + Args: + pretrained (str | None): If pretrained is a string, then it + initializes backbone weights by loading the pretrained + checkpoint. If pretrained is None, then it follows default + initializer or customized initializer in subclasses. + """ + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger, patch_padding=patch_padding, part_features=part_features) + elif pretrained is None: + # use default initializer or customized initializer in subclasses + pass + else: + raise TypeError('pretrained must be a str or None.' + f' But received {type(pretrained)}.') + + @abstractmethod + def forward(self, x): + """Forward function. + + Args: + x (Tensor | tuple[Tensor]): x could be a torch.Tensor or a tuple of + torch.Tensor, containing input data for forward computation. + """ diff --git a/mmpose/models/backbones/cpm.py b/mmpose/models/backbones/cpm.py new file mode 100644 index 0000000..458245d --- /dev/null +++ b/mmpose/models/backbones/cpm.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, constant_init, normal_init +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import load_checkpoint + + +class CpmBlock(nn.Module): + """CpmBlock for Convolutional Pose Machine. + + Args: + in_channels (int): Input channels of this block. + channels (list): Output channels of each conv module. + kernels (list): Kernel sizes of each conv module. + """ + + def __init__(self, + in_channels, + channels=(128, 128, 128), + kernels=(11, 11, 11), + norm_cfg=None): + super().__init__() + + assert len(channels) == len(kernels) + layers = [] + for i in range(len(channels)): + if i == 0: + input_channels = in_channels + else: + input_channels = channels[i - 1] + layers.append( + ConvModule( + input_channels, + channels[i], + kernels[i], + padding=(kernels[i] - 1) // 2, + norm_cfg=norm_cfg)) + self.model = nn.Sequential(*layers) + + def forward(self, x): + """Model forward function.""" + out = self.model(x) + return out + + +@BACKBONES.register_module() +class CPM(BaseBackbone): + """CPM backbone. + + Convolutional Pose Machines. + More details can be found in the `paper + `__ . + + Args: + in_channels (int): The input channels of the CPM. + out_channels (int): The output channels of the CPM. + feat_channels (int): Feature channel of each CPM stage. + middle_channels (int): Feature channel of conv after the middle stage. + num_stages (int): Number of stages. + norm_cfg (dict): Dictionary to construct and config norm layer. + + Example: + >>> from mmpose.models import CPM + >>> import torch + >>> self = CPM(3, 17) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 368, 368) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... 
print(tuple(level_output.shape)) + (1, 17, 46, 46) + (1, 17, 46, 46) + (1, 17, 46, 46) + (1, 17, 46, 46) + (1, 17, 46, 46) + (1, 17, 46, 46) + """ + + def __init__(self, + in_channels, + out_channels, + feat_channels=128, + middle_channels=32, + num_stages=6, + norm_cfg=dict(type='BN', requires_grad=True)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + assert in_channels == 3 + + self.num_stages = num_stages + assert self.num_stages >= 1 + + self.stem = nn.Sequential( + ConvModule(in_channels, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + ConvModule(128, 32, 5, padding=2, norm_cfg=norm_cfg), + ConvModule(32, 512, 9, padding=4, norm_cfg=norm_cfg), + ConvModule(512, 512, 1, padding=0, norm_cfg=norm_cfg), + ConvModule(512, out_channels, 1, padding=0, act_cfg=None)) + + self.middle = nn.Sequential( + ConvModule(in_channels, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) + + self.cpm_stages = nn.ModuleList([ + CpmBlock( + middle_channels + out_channels, + channels=[feat_channels, feat_channels, feat_channels], + kernels=[11, 11, 11], + norm_cfg=norm_cfg) for _ in range(num_stages - 1) + ]) + + self.middle_conv = nn.ModuleList([ + nn.Sequential( + ConvModule( + 128, middle_channels, 5, padding=2, norm_cfg=norm_cfg)) + for _ in range(num_stages - 1) + ]) + + self.out_convs = nn.ModuleList([ + nn.Sequential( + ConvModule( + feat_channels, + feat_channels, + 1, + padding=0, + norm_cfg=norm_cfg), + ConvModule(feat_channels, out_channels, 1, act_cfg=None)) + for _ in range(num_stages - 1) + ]) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Model forward function.""" + stage1_out = self.stem(x) + middle_out = self.middle(x) + out_feats = [] + + out_feats.append(stage1_out) + + for ind in range(self.num_stages - 1): + single_stage = self.cpm_stages[ind] + out_conv = self.out_convs[ind] + + inp_feat = torch.cat( + [out_feats[-1], self.middle_conv[ind](middle_out)], 1) + cpm_feat = single_stage(inp_feat) + out_feat = out_conv(cpm_feat) + out_feats.append(out_feat) + + return out_feats diff --git a/mmpose/models/backbones/hourglass.py b/mmpose/models/backbones/hourglass.py new file mode 100644 index 0000000..bf75fad --- /dev/null +++ b/mmpose/models/backbones/hourglass.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
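A hedged sketch of how the CPM's stage-wise outputs are typically consumed: one heatmap tensor per stage, all at 1/8 resolution, with a loss applied to every stage. Assumes mmpose is importable; the MSE criterion is illustrative, not mandated by this file.

import torch
from mmpose.models import CPM

model = CPM(in_channels=3, out_channels=17, num_stages=6)
model.eval()
with torch.no_grad():
    stage_outputs = model(torch.rand(1, 3, 368, 368))

target = torch.rand(1, 17, 46, 46)
criterion = torch.nn.MSELoss()
loss = sum(criterion(out, target) for out in stage_outputs)
print(len(stage_outputs), stage_outputs[-1].shape, float(loss))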
+import copy + +import torch.nn as nn +from mmcv.cnn import ConvModule, constant_init, normal_init +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .resnet import BasicBlock, ResLayer +from .utils import load_checkpoint + + +class HourglassModule(nn.Module): + """Hourglass Module for HourglassNet backbone. + + Generate module recursively and use BasicBlock as the base unit. + + Args: + depth (int): Depth of current HourglassModule. + stage_channels (list[int]): Feature channels of sub-modules in current + and follow-up HourglassModule. + stage_blocks (list[int]): Number of sub-modules stacked in current and + follow-up HourglassModule. + norm_cfg (dict): Dictionary to construct and config norm layer. + """ + + def __init__(self, + depth, + stage_channels, + stage_blocks, + norm_cfg=dict(type='BN', requires_grad=True)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + self.depth = depth + + cur_block = stage_blocks[0] + next_block = stage_blocks[1] + + cur_channel = stage_channels[0] + next_channel = stage_channels[1] + + self.up1 = ResLayer( + BasicBlock, cur_block, cur_channel, cur_channel, norm_cfg=norm_cfg) + + self.low1 = ResLayer( + BasicBlock, + cur_block, + cur_channel, + next_channel, + stride=2, + norm_cfg=norm_cfg) + + if self.depth > 1: + self.low2 = HourglassModule(depth - 1, stage_channels[1:], + stage_blocks[1:]) + else: + self.low2 = ResLayer( + BasicBlock, + next_block, + next_channel, + next_channel, + norm_cfg=norm_cfg) + + self.low3 = ResLayer( + BasicBlock, + cur_block, + next_channel, + cur_channel, + norm_cfg=norm_cfg, + downsample_first=False) + + self.up2 = nn.Upsample(scale_factor=2) + + def forward(self, x): + """Model forward function.""" + up1 = self.up1(x) + low1 = self.low1(x) + low2 = self.low2(low1) + low3 = self.low3(low2) + up2 = self.up2(low3) + return up1 + up2 + + +@BACKBONES.register_module() +class HourglassNet(BaseBackbone): + """HourglassNet backbone. + + Stacked Hourglass Networks for Human Pose Estimation. + More details can be found in the `paper + `__ . + + Args: + downsample_times (int): Downsample times in a HourglassModule. + num_stacks (int): Number of HourglassModule modules stacked, + 1 for Hourglass-52, 2 for Hourglass-104. + stage_channels (list[int]): Feature channel of each sub-module in a + HourglassModule. + stage_blocks (list[int]): Number of sub-modules stacked in a + HourglassModule. + feat_channel (int): Feature channel of conv after a HourglassModule. + norm_cfg (dict): Dictionary to construct and config norm layer. + + Example: + >>> from mmpose.models import HourglassNet + >>> import torch + >>> self = HourglassNet() + >>> self.eval() + >>> inputs = torch.rand(1, 3, 511, 511) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... 
print(tuple(level_output.shape)) + (1, 256, 128, 128) + (1, 256, 128, 128) + """ + + def __init__(self, + downsample_times=5, + num_stacks=2, + stage_channels=(256, 256, 384, 384, 384, 512), + stage_blocks=(2, 2, 2, 2, 2, 4), + feat_channel=256, + norm_cfg=dict(type='BN', requires_grad=True)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + self.num_stacks = num_stacks + assert self.num_stacks >= 1 + assert len(stage_channels) == len(stage_blocks) + assert len(stage_channels) > downsample_times + + cur_channel = stage_channels[0] + + self.stem = nn.Sequential( + ConvModule(3, 128, 7, padding=3, stride=2, norm_cfg=norm_cfg), + ResLayer(BasicBlock, 1, 128, 256, stride=2, norm_cfg=norm_cfg)) + + self.hourglass_modules = nn.ModuleList([ + HourglassModule(downsample_times, stage_channels, stage_blocks) + for _ in range(num_stacks) + ]) + + self.inters = ResLayer( + BasicBlock, + num_stacks - 1, + cur_channel, + cur_channel, + norm_cfg=norm_cfg) + + self.conv1x1s = nn.ModuleList([ + ConvModule( + cur_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None) + for _ in range(num_stacks - 1) + ]) + + self.out_convs = nn.ModuleList([ + ConvModule( + cur_channel, feat_channel, 3, padding=1, norm_cfg=norm_cfg) + for _ in range(num_stacks) + ]) + + self.remap_convs = nn.ModuleList([ + ConvModule( + feat_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None) + for _ in range(num_stacks - 1) + ]) + + self.relu = nn.ReLU(inplace=True) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Model forward function.""" + inter_feat = self.stem(x) + out_feats = [] + + for ind in range(self.num_stacks): + single_hourglass = self.hourglass_modules[ind] + out_conv = self.out_convs[ind] + + hourglass_feat = single_hourglass(inter_feat) + out_feat = out_conv(hourglass_feat) + out_feats.append(out_feat) + + if ind < self.num_stacks - 1: + inter_feat = self.conv1x1s[ind]( + inter_feat) + self.remap_convs[ind]( + out_feat) + inter_feat = self.inters[ind](self.relu(inter_feat)) + + return out_feats diff --git a/mmpose/models/backbones/hourglass_ae.py b/mmpose/models/backbones/hourglass_ae.py new file mode 100644 index 0000000..5a700e5 --- /dev/null +++ b/mmpose/models/backbones/hourglass_ae.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +from mmcv.cnn import ConvModule, MaxPool2d, constant_init, normal_init +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import load_checkpoint + + +class HourglassAEModule(nn.Module): + """Modified Hourglass Module for HourglassNet_AE backbone. + + Generate module recursively and use BasicBlock as the base unit. + + Args: + depth (int): Depth of current HourglassModule. + stage_channels (list[int]): Feature channels of sub-modules in current + and follow-up HourglassModule. 
+ norm_cfg (dict): Dictionary to construct and config norm layer. + """ + + def __init__(self, + depth, + stage_channels, + norm_cfg=dict(type='BN', requires_grad=True)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + self.depth = depth + + cur_channel = stage_channels[0] + next_channel = stage_channels[1] + + self.up1 = ConvModule( + cur_channel, cur_channel, 3, padding=1, norm_cfg=norm_cfg) + + self.pool1 = MaxPool2d(2, 2) + + self.low1 = ConvModule( + cur_channel, next_channel, 3, padding=1, norm_cfg=norm_cfg) + + if self.depth > 1: + self.low2 = HourglassAEModule(depth - 1, stage_channels[1:]) + else: + self.low2 = ConvModule( + next_channel, next_channel, 3, padding=1, norm_cfg=norm_cfg) + + self.low3 = ConvModule( + next_channel, cur_channel, 3, padding=1, norm_cfg=norm_cfg) + + self.up2 = nn.UpsamplingNearest2d(scale_factor=2) + + def forward(self, x): + """Model forward function.""" + up1 = self.up1(x) + pool1 = self.pool1(x) + low1 = self.low1(pool1) + low2 = self.low2(low1) + low3 = self.low3(low2) + up2 = self.up2(low3) + return up1 + up2 + + +@BACKBONES.register_module() +class HourglassAENet(BaseBackbone): + """Hourglass-AE Network proposed by Newell et al. + + Associative Embedding: End-to-End Learning for Joint + Detection and Grouping. + + More details can be found in the `paper + `__ . + + Args: + downsample_times (int): Downsample times in a HourglassModule. + num_stacks (int): Number of HourglassModule modules stacked, + 1 for Hourglass-52, 2 for Hourglass-104. + stage_channels (list[int]): Feature channel of each sub-module in a + HourglassModule. + stage_blocks (list[int]): Number of sub-modules stacked in a + HourglassModule. + feat_channels (int): Feature channel of conv after a HourglassModule. + norm_cfg (dict): Dictionary to construct and config norm layer. + + Example: + >>> from mmpose.models import HourglassAENet + >>> import torch + >>> self = HourglassAENet() + >>> self.eval() + >>> inputs = torch.rand(1, 3, 512, 512) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... 
print(tuple(level_output.shape)) + (1, 34, 128, 128) + """ + + def __init__(self, + downsample_times=4, + num_stacks=1, + out_channels=34, + stage_channels=(256, 384, 512, 640, 768), + feat_channels=256, + norm_cfg=dict(type='BN', requires_grad=True)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + self.num_stacks = num_stacks + assert self.num_stacks >= 1 + assert len(stage_channels) > downsample_times + + cur_channels = stage_channels[0] + + self.stem = nn.Sequential( + ConvModule(3, 64, 7, padding=3, stride=2, norm_cfg=norm_cfg), + ConvModule(64, 128, 3, padding=1, norm_cfg=norm_cfg), + MaxPool2d(2, 2), + ConvModule(128, 128, 3, padding=1, norm_cfg=norm_cfg), + ConvModule(128, feat_channels, 3, padding=1, norm_cfg=norm_cfg), + ) + + self.hourglass_modules = nn.ModuleList([ + nn.Sequential( + HourglassAEModule( + downsample_times, stage_channels, norm_cfg=norm_cfg), + ConvModule( + feat_channels, + feat_channels, + 3, + padding=1, + norm_cfg=norm_cfg), + ConvModule( + feat_channels, + feat_channels, + 3, + padding=1, + norm_cfg=norm_cfg)) for _ in range(num_stacks) + ]) + + self.out_convs = nn.ModuleList([ + ConvModule( + cur_channels, + out_channels, + 1, + padding=0, + norm_cfg=None, + act_cfg=None) for _ in range(num_stacks) + ]) + + self.remap_out_convs = nn.ModuleList([ + ConvModule( + out_channels, + feat_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=None) for _ in range(num_stacks - 1) + ]) + + self.remap_feature_convs = nn.ModuleList([ + ConvModule( + feat_channels, + feat_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=None) for _ in range(num_stacks - 1) + ]) + + self.relu = nn.ReLU(inplace=True) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Model forward function.""" + inter_feat = self.stem(x) + out_feats = [] + + for ind in range(self.num_stacks): + single_hourglass = self.hourglass_modules[ind] + out_conv = self.out_convs[ind] + + hourglass_feat = single_hourglass(inter_feat) + out_feat = out_conv(hourglass_feat) + out_feats.append(out_feat) + + if ind < self.num_stacks - 1: + inter_feat = inter_feat + self.remap_out_convs[ind]( + out_feat) + self.remap_feature_convs[ind]( + hourglass_feat) + + return out_feats diff --git a/mmpose/models/backbones/hrformer.py b/mmpose/models/backbones/hrformer.py new file mode 100644 index 0000000..b843300 --- /dev/null +++ b/mmpose/models/backbones/hrformer.py @@ -0,0 +1,746 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import math + +import torch +import torch.nn as nn +# from timm.models.layers import to_2tuple, trunc_normal_ +from mmcv.cnn import (build_activation_layer, build_conv_layer, + build_norm_layer, trunc_normal_init) +from mmcv.cnn.bricks.transformer import build_dropout +from mmcv.runner import BaseModule +from torch.nn.functional import pad + +from ..builder import BACKBONES +from .hrnet import Bottleneck, HRModule, HRNet + + +def nlc_to_nchw(x, hw_shape): + """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor. 
+ + Args: + x (Tensor): The input tensor of shape [N, L, C] before conversion. + hw_shape (Sequence[int]): The height and width of output feature map. + + Returns: + Tensor: The output tensor of shape [N, C, H, W] after conversion. + """ + H, W = hw_shape + assert len(x.shape) == 3 + B, L, C = x.shape + assert L == H * W, 'The seq_len doesn\'t match H, W' + return x.transpose(1, 2).reshape(B, C, H, W) + + +def nchw_to_nlc(x): + """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor. + + Args: + x (Tensor): The input tensor of shape [N, C, H, W] before conversion. + + Returns: + Tensor: The output tensor of shape [N, L, C] after conversion. + """ + assert len(x.shape) == 4 + return x.flatten(2).transpose(1, 2).contiguous() + + +def build_drop_path(drop_path_rate): + """Build drop path layer.""" + return build_dropout(dict(type='DropPath', drop_prob=drop_path_rate)) + + +class WindowMSA(BaseModule): + """Window based multi-head self-attention (W-MSA) module with relative + position bias. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int]): The height and width of the window. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0.0 + proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. + with_rpe (bool, optional): If True, use relative position bias. + Default: True. + init_cfg (dict | None, optional): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + with_rpe=True, + init_cfg=None): + + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_embed_dims = embed_dims // num_heads + self.scale = qk_scale or head_embed_dims**-0.5 + + self.with_rpe = with_rpe + if self.with_rpe: + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros( + (2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + Wh, Ww = self.window_size + rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww) + rel_position_index = rel_index_coords + rel_index_coords.T + rel_position_index = rel_position_index.flip(1).contiguous() + self.register_buffer('relative_position_index', rel_position_index) + + self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop_rate) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(proj_drop_rate) + + self.softmax = nn.Softmax(dim=-1) + + def init_weights(self): + trunc_normal_init(self.relative_position_bias_table, std=0.02) + + def forward(self, x, mask=None): + """ + Args: + + x (tensor): input features with shape of (B*num_windows, N, C) + mask (tensor | None, Optional): mask with shape of (num_windows, + Wh*Ww, Wh*Ww), value should be between (-inf, 0]. 
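+
+ Returns:
+ Tensor: Output features with shape of (B*num_windows, N, C).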
+ """ + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + if self.with_rpe: + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + @staticmethod + def double_step_seq(step1, len1, step2, len2): + seq1 = torch.arange(0, step1 * len1, step1) + seq2 = torch.arange(0, step2 * len2, step2) + return (seq1[:, None] + seq2[None, :]).reshape(1, -1) + + +class LocalWindowSelfAttention(BaseModule): + r""" Local-window Self Attention (LSA) module with relative position bias. + + This module is the short-range self-attention module in the + Interlaced Sparse Self-Attention `_. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int] | int): The height and width of the window. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0.0 + proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. + with_rpe (bool, optional): If True, use relative position bias. + Default: True. + with_pad_mask (bool, optional): If True, mask out the padded tokens in + the attention process. Default: False. + init_cfg (dict | None, optional): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + with_rpe=True, + with_pad_mask=False, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + if isinstance(window_size, int): + window_size = (window_size, window_size) + self.window_size = window_size + self.with_pad_mask = with_pad_mask + self.attn = WindowMSA( + embed_dims=embed_dims, + num_heads=num_heads, + window_size=window_size, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=proj_drop_rate, + with_rpe=with_rpe, + init_cfg=init_cfg) + + def forward(self, x, H, W, **kwargs): + """Forward function.""" + B, N, C = x.shape + x = x.view(B, H, W, C) + Wh, Ww = self.window_size + + # center-pad the feature on H and W axes + pad_h = math.ceil(H / Wh) * Wh - H + pad_w = math.ceil(W / Ww) * Ww - W + x = pad(x, (0, 0, pad_w // 2, pad_w - pad_w // 2, pad_h // 2, + pad_h - pad_h // 2)) + + # permute + x = x.view(B, math.ceil(H / Wh), Wh, math.ceil(W / Ww), Ww, C) + x = x.permute(0, 1, 3, 2, 4, 5) + x = x.reshape(-1, Wh * Ww, C) # (B*num_window, Wh*Ww, C) + + # attention + if self.with_pad_mask and pad_h > 0 and pad_w > 0: + pad_mask = x.new_zeros(1, H, W, 1) + pad_mask = pad( + pad_mask, [ + 0, 0, pad_w // 2, pad_w - pad_w // 2, pad_h // 2, + pad_h - pad_h // 2 + ], + value=-float('inf')) + pad_mask = pad_mask.view(1, math.ceil(H / Wh), Wh, + math.ceil(W / Ww), Ww, 1) + pad_mask = pad_mask.permute(1, 3, 0, 2, 4, 5) + pad_mask = pad_mask.reshape(-1, Wh * Ww) + pad_mask = pad_mask[:, None, :].expand([-1, Wh * Ww, -1]) + out = self.attn(x, pad_mask, **kwargs) + else: + out = self.attn(x, **kwargs) + + # reverse permutation + out = out.reshape(B, math.ceil(H / Wh), math.ceil(W / Ww), Wh, Ww, C) + out = out.permute(0, 1, 3, 2, 4, 5) + out = out.reshape(B, H + pad_h, W + pad_w, C) + + # de-pad + out = out[:, pad_h // 2:H + pad_h // 2, pad_w // 2:W + pad_w // 2] + return out.reshape(B, N, C) + + +class CrossFFN(BaseModule): + r"""FFN with Depthwise Conv of HRFormer. + + Args: + in_features (int): The feature dimension. + hidden_features (int, optional): The hidden dimension of FFNs. + Defaults: The same as in_features. + act_cfg (dict, optional): Config of activation layer. + Default: dict(type='GELU'). + dw_act_cfg (dict, optional): Config of activation layer appended + right after DW Conv. Default: dict(type='GELU'). + norm_cfg (dict, optional): Config of norm layer. + Default: dict(type='SyncBN'). + init_cfg (dict | list | None, optional): The init config. + Default: None. 
+ """ + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_cfg=dict(type='GELU'), + dw_act_cfg=dict(type='GELU'), + norm_cfg=dict(type='SyncBN'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1) + self.act1 = build_activation_layer(act_cfg) + self.norm1 = build_norm_layer(norm_cfg, hidden_features)[1] + self.dw3x3 = nn.Conv2d( + hidden_features, + hidden_features, + kernel_size=3, + stride=1, + groups=hidden_features, + padding=1) + self.act2 = build_activation_layer(dw_act_cfg) + self.norm2 = build_norm_layer(norm_cfg, hidden_features)[1] + self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1) + self.act3 = build_activation_layer(act_cfg) + self.norm3 = build_norm_layer(norm_cfg, out_features)[1] + + # put the modules togather + self.layers = [ + self.fc1, self.norm1, self.act1, self.dw3x3, self.norm2, self.act2, + self.fc2, self.norm3, self.act3 + ] + + def forward(self, x, H, W): + """Forward function.""" + x = nlc_to_nchw(x, (H, W)) + for layer in self.layers: + x = layer(x) + x = nchw_to_nlc(x) + return x + + +class HRFormerBlock(BaseModule): + """High-Resolution Block for HRFormer. + + Args: + in_features (int): The input dimension. + out_features (int): The output dimension. + num_heads (int): The number of head within each LSA. + window_size (int, optional): The window size for the LSA. + Default: 7 + mlp_ratio (int, optional): The expansion ration of FFN. + Default: 4 + act_cfg (dict, optional): Config of activation layer. + Default: dict(type='GELU'). + norm_cfg (dict, optional): Config of norm layer. + Default: dict(type='SyncBN'). + transformer_norm_cfg (dict, optional): Config of transformer norm + layer. Default: dict(type='LN', eps=1e-6). + init_cfg (dict | list | None, optional): The init config. + Default: None. 
+ """ + + expansion = 1 + + def __init__(self, + in_features, + out_features, + num_heads, + window_size=7, + mlp_ratio=4.0, + drop_path=0.0, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='SyncBN'), + transformer_norm_cfg=dict(type='LN', eps=1e-6), + init_cfg=None, + **kwargs): + super(HRFormerBlock, self).__init__(init_cfg=init_cfg) + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + + self.norm1 = build_norm_layer(transformer_norm_cfg, in_features)[1] + self.attn = LocalWindowSelfAttention( + in_features, + num_heads=num_heads, + window_size=window_size, + init_cfg=None, + **kwargs) + + self.norm2 = build_norm_layer(transformer_norm_cfg, out_features)[1] + self.ffn = CrossFFN( + in_features=in_features, + hidden_features=int(in_features * mlp_ratio), + out_features=out_features, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + dw_act_cfg=act_cfg, + init_cfg=None) + + self.drop_path = build_drop_path( + drop_path) if drop_path > 0.0 else nn.Identity() + + def forward(self, x): + """Forward function.""" + B, C, H, W = x.size() + # Attention + x = x.view(B, C, -1).permute(0, 2, 1) + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + # FFN + x = x + self.drop_path(self.ffn(self.norm2(x), H, W)) + x = x.permute(0, 2, 1).view(B, C, H, W) + return x + + def extra_repr(self): + """(Optional) Set the extra information about this module.""" + return 'num_heads={}, window_size={}, mlp_ratio={}'.format( + self.num_heads, self.window_size, self.mlp_ratio) + + +class HRFomerModule(HRModule): + """High-Resolution Module for HRFormer. + + Args: + num_branches (int): The number of branches in the HRFormerModule. + block (nn.Module): The building block of HRFormer. + The block should be the HRFormerBlock. + num_blocks (tuple): The number of blocks in each branch. + The length must be equal to num_branches. + num_inchannels (tuple): The number of input channels in each branch. + The length must be equal to num_branches. + num_channels (tuple): The number of channels in each branch. + The length must be equal to num_branches. + num_heads (tuple): The number of heads within the LSAs. + num_window_sizes (tuple): The window size for the LSAs. + num_mlp_ratios (tuple): The expansion ratio for the FFNs. + drop_path (int, optional): The drop path rate of HRFomer. + Default: 0.0 + multiscale_output (bool, optional): Whether to output multi-level + features produced by multiple branches. If False, only the first + level feature will be output. Default: True. + conv_cfg (dict, optional): Config of the conv layers. + Default: None. + norm_cfg (dict, optional): Config of the norm layers appended + right after conv. Default: dict(type='SyncBN', requires_grad=True) + transformer_norm_cfg (dict, optional): Config of the norm layers. + Default: dict(type='LN', eps=1e-6) + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False + upsample_cfg(dict, optional): The config of upsample layers in fuse + layers. 
Default: dict(mode='bilinear', align_corners=False) + """ + + def __init__(self, + num_branches, + block, + num_blocks, + num_inchannels, + num_channels, + num_heads, + num_window_sizes, + num_mlp_ratios, + multiscale_output=True, + drop_paths=0.0, + with_rpe=True, + with_pad_mask=False, + conv_cfg=None, + norm_cfg=dict(type='SyncBN', requires_grad=True), + transformer_norm_cfg=dict(type='LN', eps=1e-6), + with_cp=False, + upsample_cfg=dict(mode='bilinear', align_corners=False)): + + self.transformer_norm_cfg = transformer_norm_cfg + self.drop_paths = drop_paths + self.num_heads = num_heads + self.num_window_sizes = num_window_sizes + self.num_mlp_ratios = num_mlp_ratios + self.with_rpe = with_rpe + self.with_pad_mask = with_pad_mask + + super().__init__(num_branches, block, num_blocks, num_inchannels, + num_channels, multiscale_output, with_cp, conv_cfg, + norm_cfg, upsample_cfg) + + def _make_one_branch(self, + branch_index, + block, + num_blocks, + num_channels, + stride=1): + """Build one branch.""" + # HRFormerBlock does not support down sample layer yet. + assert stride == 1 and self.in_channels[branch_index] == num_channels[ + branch_index] + layers = [] + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + num_heads=self.num_heads[branch_index], + window_size=self.num_window_sizes[branch_index], + mlp_ratio=self.num_mlp_ratios[branch_index], + drop_path=self.drop_paths[0], + norm_cfg=self.norm_cfg, + transformer_norm_cfg=self.transformer_norm_cfg, + init_cfg=None, + with_rpe=self.with_rpe, + with_pad_mask=self.with_pad_mask)) + + self.in_channels[ + branch_index] = self.in_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + num_heads=self.num_heads[branch_index], + window_size=self.num_window_sizes[branch_index], + mlp_ratio=self.num_mlp_ratios[branch_index], + drop_path=self.drop_paths[i], + norm_cfg=self.norm_cfg, + transformer_norm_cfg=self.transformer_norm_cfg, + init_cfg=None, + with_rpe=self.with_rpe, + with_pad_mask=self.with_pad_mask)) + return nn.Sequential(*layers) + + def _make_fuse_layers(self): + """Build fuse layers.""" + if self.num_branches == 1: + return None + num_branches = self.num_branches + num_inchannels = self.in_channels + fuse_layers = [] + for i in range(num_branches if self.multiscale_output else 1): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + num_inchannels[j], + num_inchannels[i], + kernel_size=1, + stride=1, + bias=False), + build_norm_layer(self.norm_cfg, + num_inchannels[i])[1], + nn.Upsample( + scale_factor=2**(j - i), + mode=self.upsample_cfg['mode'], + align_corners=self. 
+ upsample_cfg['align_corners']))) + elif j == i: + fuse_layer.append(None) + else: + conv3x3s = [] + for k in range(i - j): + if k == i - j - 1: + num_outchannels_conv3x3 = num_inchannels[i] + with_out_act = False + else: + num_outchannels_conv3x3 = num_inchannels[j] + with_out_act = True + sub_modules = [ + build_conv_layer( + self.conv_cfg, + num_inchannels[j], + num_inchannels[j], + kernel_size=3, + stride=2, + padding=1, + groups=num_inchannels[j], + bias=False, + ), + build_norm_layer(self.norm_cfg, + num_inchannels[j])[1], + build_conv_layer( + self.conv_cfg, + num_inchannels[j], + num_outchannels_conv3x3, + kernel_size=1, + stride=1, + bias=False, + ), + build_norm_layer(self.norm_cfg, + num_outchannels_conv3x3)[1] + ] + if with_out_act: + sub_modules.append(nn.ReLU(False)) + conv3x3s.append(nn.Sequential(*sub_modules)) + fuse_layer.append(nn.Sequential(*conv3x3s)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def get_num_inchannels(self): + """Return the number of input channels.""" + return self.in_channels + + +@BACKBONES.register_module() +class HRFormer(HRNet): + """HRFormer backbone. + + This backbone is the implementation of `HRFormer: High-Resolution + Transformer for Dense Prediction `_. + + Args: + extra (dict): Detailed configuration for each stage of HRNet. + There must be 4 stages, the configuration for each stage must have + 5 keys: + + - num_modules (int): The number of HRModule in this stage. + - num_branches (int): The number of branches in the HRModule. + - block (str): The type of block. + - num_blocks (tuple): The number of blocks in each branch. + The length must be equal to num_branches. + - num_channels (tuple): The number of channels in each branch. + The length must be equal to num_branches. + in_channels (int): Number of input image channels. Normally 3. + conv_cfg (dict): Dictionary to construct and config conv layer. + Default: None. + norm_cfg (dict): Config of norm layer. + Use `SyncBN` by default. + transformer_norm_cfg (dict): Config of transformer norm layer. + Use `LN` by default. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. 
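+
+ Note:
+ ``extra`` must also contain the key ``drop_path_rate`` (float), and
+ each transformer stage config (stage2 to stage4) additionally needs
+ ``num_heads``, ``window_sizes`` and ``mlp_ratios`` with one entry per
+ branch.
+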
+ Example: + >>> from mmpose.models import HRFormer + >>> import torch + >>> extra = dict( + >>> stage1=dict( + >>> num_modules=1, + >>> num_branches=1, + >>> block='BOTTLENECK', + >>> num_blocks=(2, ), + >>> num_channels=(64, )), + >>> stage2=dict( + >>> num_modules=1, + >>> num_branches=2, + >>> block='HRFORMER', + >>> window_sizes=(7, 7), + >>> num_heads=(1, 2), + >>> mlp_ratios=(4, 4), + >>> num_blocks=(2, 2), + >>> num_channels=(32, 64)), + >>> stage3=dict( + >>> num_modules=4, + >>> num_branches=3, + >>> block='HRFORMER', + >>> window_sizes=(7, 7, 7), + >>> num_heads=(1, 2, 4), + >>> mlp_ratios=(4, 4, 4), + >>> num_blocks=(2, 2, 2), + >>> num_channels=(32, 64, 128)), + >>> stage4=dict( + >>> num_modules=2, + >>> num_branches=4, + >>> block='HRFORMER', + >>> window_sizes=(7, 7, 7, 7), + >>> num_heads=(1, 2, 4, 8), + >>> mlp_ratios=(4, 4, 4, 4), + >>> num_blocks=(2, 2, 2, 2), + >>> num_channels=(32, 64, 128, 256))) + >>> self = HRFormer(extra, in_channels=1) + >>> self.eval() + >>> inputs = torch.rand(1, 1, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 32, 8, 8) + (1, 64, 4, 4) + (1, 128, 2, 2) + (1, 256, 1, 1) + """ + + blocks_dict = {'BOTTLENECK': Bottleneck, 'HRFORMERBLOCK': HRFormerBlock} + + def __init__(self, + extra, + in_channels=3, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + transformer_norm_cfg=dict(type='LN', eps=1e-6), + norm_eval=False, + with_cp=False, + zero_init_residual=False, + frozen_stages=-1): + + # stochastic depth + depths = [ + extra[stage]['num_blocks'][0] * extra[stage]['num_modules'] + for stage in ['stage2', 'stage3', 'stage4'] + ] + depth_s2, depth_s3, _ = depths + drop_path_rate = extra['drop_path_rate'] + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] + extra['stage2']['drop_path_rates'] = dpr[0:depth_s2] + extra['stage3']['drop_path_rates'] = dpr[depth_s2:depth_s2 + depth_s3] + extra['stage4']['drop_path_rates'] = dpr[depth_s2 + depth_s3:] + + # HRFormer use bilinear upsample as default + upsample_cfg = extra.get('upsample', { + 'mode': 'bilinear', + 'align_corners': False + }) + extra['upsample'] = upsample_cfg + self.transformer_norm_cfg = transformer_norm_cfg + self.with_rpe = extra.get('with_rpe', True) + self.with_pad_mask = extra.get('with_pad_mask', False) + + super().__init__(extra, in_channels, conv_cfg, norm_cfg, norm_eval, + with_cp, zero_init_residual, frozen_stages) + + def _make_stage(self, + layer_config, + num_inchannels, + multiscale_output=True): + """Make each stage.""" + num_modules = layer_config['num_modules'] + num_branches = layer_config['num_branches'] + num_blocks = layer_config['num_blocks'] + num_channels = layer_config['num_channels'] + block = self.blocks_dict[layer_config['block']] + num_heads = layer_config['num_heads'] + num_window_sizes = layer_config['window_sizes'] + num_mlp_ratios = layer_config['mlp_ratios'] + drop_path_rates = layer_config['drop_path_rates'] + + modules = [] + for i in range(num_modules): + # multiscale_output is only used at the last module + if not multiscale_output and i == num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + + modules.append( + HRFomerModule( + num_branches, + block, + num_blocks, + num_inchannels, + num_channels, + num_heads, + num_window_sizes, + num_mlp_ratios, + reset_multiscale_output, + drop_paths=drop_path_rates[num_blocks[0] * + i:num_blocks[0] * (i + 1)], + with_rpe=self.with_rpe, + 
with_pad_mask=self.with_pad_mask, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + transformer_norm_cfg=self.transformer_norm_cfg, + with_cp=self.with_cp, + upsample_cfg=self.upsample_cfg)) + num_inchannels = modules[-1].get_num_inchannels() + + return nn.Sequential(*modules), num_inchannels diff --git a/mmpose/models/backbones/hrnet.py b/mmpose/models/backbones/hrnet.py new file mode 100644 index 0000000..87dc8ce --- /dev/null +++ b/mmpose/models/backbones/hrnet.py @@ -0,0 +1,604 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init, + normal_init) +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .resnet import BasicBlock, Bottleneck, get_expansion +from .utils import load_checkpoint + + +class HRModule(nn.Module): + """High-Resolution Module for HRNet. + + In this module, every branch has 4 BasicBlocks/Bottlenecks. Fusion/Exchange + is in this module. + """ + + def __init__(self, + num_branches, + blocks, + num_blocks, + in_channels, + num_channels, + multiscale_output=False, + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + upsample_cfg=dict(mode='nearest', align_corners=None)): + + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self._check_branches(num_branches, num_blocks, in_channels, + num_channels) + + self.in_channels = in_channels + self.num_branches = num_branches + + self.multiscale_output = multiscale_output + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.upsample_cfg = upsample_cfg + self.with_cp = with_cp + self.branches = self._make_branches(num_branches, blocks, num_blocks, + num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(inplace=True) + + @staticmethod + def _check_branches(num_branches, num_blocks, in_channels, num_channels): + """Check input to avoid ValueError.""" + if num_branches != len(num_blocks): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_BLOCKS({len(num_blocks)})' + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_CHANNELS({len(num_channels)})' + raise ValueError(error_msg) + + if num_branches != len(in_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_INCHANNELS({len(in_channels)})' + raise ValueError(error_msg) + + def _make_one_branch(self, + branch_index, + block, + num_blocks, + num_channels, + stride=1): + """Make one branch.""" + downsample = None + if stride != 1 or \ + self.in_channels[branch_index] != \ + num_channels[branch_index] * get_expansion(block): + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + self.in_channels[branch_index], + num_channels[branch_index] * get_expansion(block), + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer( + self.norm_cfg, + num_channels[branch_index] * get_expansion(block))[1]) + + layers = [] + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index] * get_expansion(block), + stride=stride, + downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + self.in_channels[branch_index] = \ + num_channels[branch_index] * get_expansion(block) + for _ in range(1, num_blocks[branch_index]): + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index] * 
get_expansion(block), + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + + return nn.Sequential(*layers) + + def _make_branches(self, num_branches, block, num_blocks, num_channels): + """Make branches.""" + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch(i, block, num_blocks, num_channels)) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + """Make fuse layer.""" + if self.num_branches == 1: + return None + + num_branches = self.num_branches + in_channels = self.in_channels + fuse_layers = [] + num_out_branches = num_branches if self.multiscale_output else 1 + + for i in range(num_out_branches): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, in_channels[i])[1], + nn.Upsample( + scale_factor=2**(j - i), + mode=self.upsample_cfg['mode'], + align_corners=self. + upsample_cfg['align_corners']))) + elif j == i: + fuse_layer.append(None) + else: + conv_downsamples = [] + for k in range(i - j): + if k == i - j - 1: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[i])[1])) + else: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + nn.ReLU(inplace=True))) + fuse_layer.append(nn.Sequential(*conv_downsamples)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def forward(self, x): + """Forward function.""" + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = self.branches[i](x[i]) + + x_fuse = [] + for i in range(len(self.fuse_layers)): + y = 0 + for j in range(self.num_branches): + if i == j: + y += x[j] + else: + y += self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + return x_fuse + + +@BACKBONES.register_module() +class HRNet(nn.Module): + """HRNet backbone. + + `High-Resolution Representations for Labeling Pixels and Regions + `__ + + Args: + extra (dict): detailed configuration for each stage of HRNet. + in_channels (int): Number of input image channels. Default: 3. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. 
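+
+ Note:
+ By default only the highest-resolution branch of stage4 is returned,
+ since ``multiscale_output`` defaults to False; set
+ ``extra['stage4']['multiscale_output']=True`` to obtain all four
+ branches.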
+ + Example: + >>> from mmpose.models import HRNet + >>> import torch + >>> extra = dict( + >>> stage1=dict( + >>> num_modules=1, + >>> num_branches=1, + >>> block='BOTTLENECK', + >>> num_blocks=(4, ), + >>> num_channels=(64, )), + >>> stage2=dict( + >>> num_modules=1, + >>> num_branches=2, + >>> block='BASIC', + >>> num_blocks=(4, 4), + >>> num_channels=(32, 64)), + >>> stage3=dict( + >>> num_modules=4, + >>> num_branches=3, + >>> block='BASIC', + >>> num_blocks=(4, 4, 4), + >>> num_channels=(32, 64, 128)), + >>> stage4=dict( + >>> num_modules=3, + >>> num_branches=4, + >>> block='BASIC', + >>> num_blocks=(4, 4, 4, 4), + >>> num_channels=(32, 64, 128, 256))) + >>> self = HRNet(extra, in_channels=1) + >>> self.eval() + >>> inputs = torch.rand(1, 1, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 32, 8, 8) + """ + + blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} + + def __init__(self, + extra, + in_channels=3, + conv_cfg=None, + norm_cfg=dict(type='BN'), + norm_eval=False, + with_cp=False, + zero_init_residual=False, + frozen_stages=-1): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self.extra = extra + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.zero_init_residual = zero_init_residual + self.frozen_stages = frozen_stages + + # stem net + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) + self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) + + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + 64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + self.conv_cfg, + 64, + 64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.relu = nn.ReLU(inplace=True) + + self.upsample_cfg = self.extra.get('upsample', { + 'mode': 'nearest', + 'align_corners': None + }) + + # stage 1 + self.stage1_cfg = self.extra['stage1'] + num_channels = self.stage1_cfg['num_channels'][0] + block_type = self.stage1_cfg['block'] + num_blocks = self.stage1_cfg['num_blocks'][0] + + block = self.blocks_dict[block_type] + stage1_out_channels = num_channels * get_expansion(block) + self.layer1 = self._make_layer(block, 64, stage1_out_channels, + num_blocks) + + # stage 2 + self.stage2_cfg = self.extra['stage2'] + num_channels = self.stage2_cfg['num_channels'] + block_type = self.stage2_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [ + channel * get_expansion(block) for channel in num_channels + ] + self.transition1 = self._make_transition_layer([stage1_out_channels], + num_channels) + self.stage2, pre_stage_channels = self._make_stage( + self.stage2_cfg, num_channels) + + # stage 3 + self.stage3_cfg = self.extra['stage3'] + num_channels = self.stage3_cfg['num_channels'] + block_type = self.stage3_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [ + channel * get_expansion(block) for channel in num_channels + ] + self.transition2 = self._make_transition_layer(pre_stage_channels, + num_channels) + self.stage3, pre_stage_channels = self._make_stage( + self.stage3_cfg, num_channels) + + # stage 4 + self.stage4_cfg = self.extra['stage4'] + num_channels = self.stage4_cfg['num_channels'] + block_type = self.stage4_cfg['block'] + + block = self.blocks_dict[block_type] + 
num_channels = [ + channel * get_expansion(block) for channel in num_channels + ] + self.transition3 = self._make_transition_layer(pre_stage_channels, + num_channels) + + self.stage4, pre_stage_channels = self._make_stage( + self.stage4_cfg, + num_channels, + multiscale_output=self.stage4_cfg.get('multiscale_output', False)) + + self._freeze_stages() + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + def _make_transition_layer(self, num_channels_pre_layer, + num_channels_cur_layer): + """Make transition layer.""" + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + num_channels_pre_layer[i], + num_channels_cur_layer[i], + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + num_channels_cur_layer[i])[1], + nn.ReLU(inplace=True))) + else: + transition_layers.append(None) + else: + conv_downsamples = [] + for j in range(i + 1 - num_branches_pre): + in_channels = num_channels_pre_layer[-1] + out_channels = num_channels_cur_layer[i] \ + if j == i - num_branches_pre else in_channels + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, out_channels)[1], + nn.ReLU(inplace=True))) + transition_layers.append(nn.Sequential(*conv_downsamples)) + + return nn.ModuleList(transition_layers) + + def _make_layer(self, block, in_channels, out_channels, blocks, stride=1): + """Make layer.""" + downsample = None + if stride != 1 or in_channels != out_channels: + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, out_channels)[1]) + + layers = [] + layers.append( + block( + in_channels, + out_channels, + stride=stride, + downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + for _ in range(1, blocks): + layers.append( + block( + out_channels, + out_channels, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + + return nn.Sequential(*layers) + + def _make_stage(self, layer_config, in_channels, multiscale_output=True): + """Make stage.""" + num_modules = layer_config['num_modules'] + num_branches = layer_config['num_branches'] + num_blocks = layer_config['num_blocks'] + num_channels = layer_config['num_channels'] + block = self.blocks_dict[layer_config['block']] + + hr_modules = [] + for i in range(num_modules): + # multi_scale_output is only used for the last module + if not multiscale_output and i == num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + + hr_modules.append( + HRModule( + num_branches, + block, + num_blocks, + in_channels, + num_channels, + reset_multiscale_output, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + upsample_cfg=self.upsample_cfg)) + + in_channels = hr_modules[-1].in_channels + + return nn.Sequential(*hr_modules), in_channels + + def 
_freeze_stages(self): + """Freeze parameters.""" + if self.frozen_stages >= 0: + self.norm1.eval() + self.norm2.eval() + + for m in [self.conv1, self.norm1, self.conv2, self.norm2]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + if i == 1: + m = getattr(self, 'layer1') + else: + m = getattr(self, f'stage{i}') + + m.eval() + for param in m.parameters(): + param.requires_grad = False + + if i < 4: + m = getattr(self, f'transition{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + constant_init(m.norm3, 0) + elif isinstance(m, BasicBlock): + constant_init(m.norm2, 0) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Forward function.""" + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.norm2(x) + x = self.relu(x) + x = self.layer1(x) + + x_list = [] + for i in range(self.stage2_cfg['num_branches']): + if self.transition1[i] is not None: + x_list.append(self.transition1[i](x)) + else: + x_list.append(x) + y_list = self.stage2(x_list) + + x_list = [] + for i in range(self.stage3_cfg['num_branches']): + if self.transition2[i] is not None: + x_list.append(self.transition2[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage3(x_list) + + x_list = [] + for i in range(self.stage4_cfg['num_branches']): + if self.transition3[i] is not None: + x_list.append(self.transition3[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage4(x_list) + + return y_list + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/backbones/litehrnet.py b/mmpose/models/backbones/litehrnet.py new file mode 100644 index 0000000..9543688 --- /dev/null +++ b/mmpose/models/backbones/litehrnet.py @@ -0,0 +1,984 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/HRNet/Lite-HRNet +# Original licence: Apache License 2.0. +# ------------------------------------------------------------------------------ + +import mmcv +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, + build_conv_layer, build_norm_layer, constant_init, + normal_init) +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .utils import channel_shuffle, load_checkpoint + + +class SpatialWeighting(nn.Module): + """Spatial weighting module. + + Args: + channels (int): The channels of the module. + ratio (int): channel reduction ratio. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. 
+ norm_cfg (dict): Config dict for normalization layer. + Default: None. + act_cfg (dict): Config dict for activation layer. + Default: (dict(type='ReLU'), dict(type='Sigmoid')). + The last ConvModule uses Sigmoid by default. + """ + + def __init__(self, + channels, + ratio=16, + conv_cfg=None, + norm_cfg=None, + act_cfg=(dict(type='ReLU'), dict(type='Sigmoid'))): + super().__init__() + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert mmcv.is_tuple_of(act_cfg, dict) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.conv1 = ConvModule( + in_channels=channels, + out_channels=int(channels / ratio), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=int(channels / ratio), + out_channels=channels, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x): + out = self.global_avgpool(x) + out = self.conv1(out) + out = self.conv2(out) + return x * out + + +class CrossResolutionWeighting(nn.Module): + """Cross-resolution channel weighting module. + + Args: + channels (int): The channels of the module. + ratio (int): channel reduction ratio. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + act_cfg (dict): Config dict for activation layer. + Default: (dict(type='ReLU'), dict(type='Sigmoid')). + The last ConvModule uses Sigmoid by default. + """ + + def __init__(self, + channels, + ratio=16, + conv_cfg=None, + norm_cfg=None, + act_cfg=(dict(type='ReLU'), dict(type='Sigmoid'))): + super().__init__() + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert mmcv.is_tuple_of(act_cfg, dict) + self.channels = channels + total_channel = sum(channels) + self.conv1 = ConvModule( + in_channels=total_channel, + out_channels=int(total_channel / ratio), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=int(total_channel / ratio), + out_channels=total_channel, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x): + mini_size = x[-1].size()[-2:] + out = [F.adaptive_avg_pool2d(s, mini_size) for s in x[:-1]] + [x[-1]] + out = torch.cat(out, dim=1) + out = self.conv1(out) + out = self.conv2(out) + out = torch.split(out, self.channels, dim=1) + out = [ + s * F.interpolate(a, size=s.size()[-2:], mode='nearest') + for s, a in zip(x, out) + ] + return out + + +class ConditionalChannelWeighting(nn.Module): + """Conditional channel weighting block. + + Args: + in_channels (int): The input channels of the block. + stride (int): Stride of the 3x3 convolution layer. + reduce_ratio (int): channel reduction ratio. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
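+
+ Note:
+ Each input branch is split channel-wise into two halves; one half is
+ passed through cross-resolution weighting, a depthwise 3x3 conv and
+ spatial weighting, then the two halves are concatenated and
+ channel-shuffled.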
+ """ + + def __init__(self, + in_channels, + stride, + reduce_ratio, + conv_cfg=None, + norm_cfg=dict(type='BN'), + with_cp=False): + super().__init__() + self.with_cp = with_cp + self.stride = stride + assert stride in [1, 2] + + branch_channels = [channel // 2 for channel in in_channels] + + self.cross_resolution_weighting = CrossResolutionWeighting( + branch_channels, + ratio=reduce_ratio, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + + self.depthwise_convs = nn.ModuleList([ + ConvModule( + channel, + channel, + kernel_size=3, + stride=self.stride, + padding=1, + groups=channel, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) for channel in branch_channels + ]) + + self.spatial_weighting = nn.ModuleList([ + SpatialWeighting(channels=channel, ratio=4) + for channel in branch_channels + ]) + + def forward(self, x): + + def _inner_forward(x): + x = [s.chunk(2, dim=1) for s in x] + x1 = [s[0] for s in x] + x2 = [s[1] for s in x] + + x2 = self.cross_resolution_weighting(x2) + x2 = [dw(s) for s, dw in zip(x2, self.depthwise_convs)] + x2 = [sw(s) for s, sw in zip(x2, self.spatial_weighting)] + + out = [torch.cat([s1, s2], dim=1) for s1, s2 in zip(x1, x2)] + out = [channel_shuffle(s, 2) for s in out] + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class Stem(nn.Module): + """Stem network block. + + Args: + in_channels (int): The input channels of the block. + stem_channels (int): Output channels of the stem layer. + out_channels (int): The output channels of the block. + expand_ratio (int): adjusts number of channels of the hidden layer + in InvertedResidual by this amount. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
+ """ + + def __init__(self, + in_channels, + stem_channels, + out_channels, + expand_ratio, + conv_cfg=None, + norm_cfg=dict(type='BN'), + with_cp=False): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + + self.conv1 = ConvModule( + in_channels=in_channels, + out_channels=stem_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type='ReLU')) + + mid_channels = int(round(stem_channels * expand_ratio)) + branch_channels = stem_channels // 2 + if stem_channels == self.out_channels: + inc_channels = self.out_channels - branch_channels + else: + inc_channels = self.out_channels - stem_channels + + self.branch1 = nn.Sequential( + ConvModule( + branch_channels, + branch_channels, + kernel_size=3, + stride=2, + padding=1, + groups=branch_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + branch_channels, + inc_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU')), + ) + + self.expand_conv = ConvModule( + branch_channels, + mid_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU')) + self.depthwise_conv = ConvModule( + mid_channels, + mid_channels, + kernel_size=3, + stride=2, + padding=1, + groups=mid_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + self.linear_conv = ConvModule( + mid_channels, + branch_channels + if stem_channels == self.out_channels else stem_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU')) + + def forward(self, x): + + def _inner_forward(x): + x = self.conv1(x) + x1, x2 = x.chunk(2, dim=1) + + x2 = self.expand_conv(x2) + x2 = self.depthwise_conv(x2) + x2 = self.linear_conv(x2) + + out = torch.cat((self.branch1(x1), x2), dim=1) + + out = channel_shuffle(out, 2) + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class IterativeHead(nn.Module): + """Extra iterative head for feature learning. + + Args: + in_channels (int): The input channels of the block. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). 
+ """ + + def __init__(self, in_channels, norm_cfg=dict(type='BN')): + super().__init__() + projects = [] + num_branchs = len(in_channels) + self.in_channels = in_channels[::-1] + + for i in range(num_branchs): + if i != num_branchs - 1: + projects.append( + DepthwiseSeparableConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i + 1], + kernel_size=3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU'), + dw_act_cfg=None, + pw_act_cfg=dict(type='ReLU'))) + else: + projects.append( + DepthwiseSeparableConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i], + kernel_size=3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU'), + dw_act_cfg=None, + pw_act_cfg=dict(type='ReLU'))) + self.projects = nn.ModuleList(projects) + + def forward(self, x): + x = x[::-1] + + y = [] + last_x = None + for i, s in enumerate(x): + if last_x is not None: + last_x = F.interpolate( + last_x, + size=s.size()[-2:], + mode='bilinear', + align_corners=True) + s = s + last_x + s = self.projects[i](s) + y.append(s) + last_x = s + + return y[::-1] + + +class ShuffleUnit(nn.Module): + """InvertedResidual block for ShuffleNetV2 backbone. + + Args: + in_channels (int): The input channels of the block. + out_channels (int): The output channels of the block. + stride (int): Stride of the 3x3 convolution layer. Default: 1 + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
+ """ + + def __init__(self, + in_channels, + out_channels, + stride=1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + super().__init__() + self.stride = stride + self.with_cp = with_cp + + branch_features = out_channels // 2 + if self.stride == 1: + assert in_channels == branch_features * 2, ( + f'in_channels ({in_channels}) should equal to ' + f'branch_features * 2 ({branch_features * 2}) ' + 'when stride is 1') + + if in_channels != branch_features * 2: + assert self.stride != 1, ( + f'stride ({self.stride}) should not equal 1 when ' + f'in_channels != branch_features * 2') + + if self.stride > 1: + self.branch1 = nn.Sequential( + ConvModule( + in_channels, + in_channels, + kernel_size=3, + stride=self.stride, + padding=1, + groups=in_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + in_channels, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ) + + self.branch2 = nn.Sequential( + ConvModule( + in_channels if (self.stride > 1) else branch_features, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + branch_features, + branch_features, + kernel_size=3, + stride=self.stride, + padding=1, + groups=branch_features, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + branch_features, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x): + + def _inner_forward(x): + if self.stride > 1: + out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) + else: + x1, x2 = x.chunk(2, dim=1) + out = torch.cat((x1, self.branch2(x2)), dim=1) + + out = channel_shuffle(out, 2) + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class LiteHRModule(nn.Module): + """High-Resolution Module for LiteHRNet. + + It contains conditional channel weighting blocks and + shuffle blocks. + + + Args: + num_branches (int): Number of branches in the module. + num_blocks (int): Number of blocks in the module. + in_channels (list(int)): Number of input image channels. + reduce_ratio (int): Channel reduction ratio. + module_type (str): 'LITE' or 'NAIVE' + multiscale_output (bool): Whether to output multi-scale features. + with_fuse (bool): Whether to use fuse layers. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. 
+ """ + + def __init__( + self, + num_branches, + num_blocks, + in_channels, + reduce_ratio, + module_type, + multiscale_output=False, + with_fuse=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + with_cp=False, + ): + super().__init__() + self._check_branches(num_branches, in_channels) + + self.in_channels = in_channels + self.num_branches = num_branches + + self.module_type = module_type + self.multiscale_output = multiscale_output + self.with_fuse = with_fuse + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.with_cp = with_cp + + if self.module_type.upper() == 'LITE': + self.layers = self._make_weighting_blocks(num_blocks, reduce_ratio) + elif self.module_type.upper() == 'NAIVE': + self.layers = self._make_naive_branches(num_branches, num_blocks) + else: + raise ValueError("module_type should be either 'LITE' or 'NAIVE'.") + if self.with_fuse: + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU() + + def _check_branches(self, num_branches, in_channels): + """Check input to avoid ValueError.""" + if num_branches != len(in_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_INCHANNELS({len(in_channels)})' + raise ValueError(error_msg) + + def _make_weighting_blocks(self, num_blocks, reduce_ratio, stride=1): + """Make channel weighting blocks.""" + layers = [] + for i in range(num_blocks): + layers.append( + ConditionalChannelWeighting( + self.in_channels, + stride=stride, + reduce_ratio=reduce_ratio, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + with_cp=self.with_cp)) + + return nn.Sequential(*layers) + + def _make_one_branch(self, branch_index, num_blocks, stride=1): + """Make one branch.""" + layers = [] + layers.append( + ShuffleUnit( + self.in_channels[branch_index], + self.in_channels[branch_index], + stride=stride, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type='ReLU'), + with_cp=self.with_cp)) + for i in range(1, num_blocks): + layers.append( + ShuffleUnit( + self.in_channels[branch_index], + self.in_channels[branch_index], + stride=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type='ReLU'), + with_cp=self.with_cp)) + + return nn.Sequential(*layers) + + def _make_naive_branches(self, num_branches, num_blocks): + """Make branches.""" + branches = [] + + for i in range(num_branches): + branches.append(self._make_one_branch(i, num_blocks)) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + """Make fuse layer.""" + if self.num_branches == 1: + return None + + num_branches = self.num_branches + in_channels = self.in_channels + fuse_layers = [] + num_out_branches = num_branches if self.multiscale_output else 1 + for i in range(num_out_branches): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, in_channels[i])[1], + nn.Upsample( + scale_factor=2**(j - i), mode='nearest'))) + elif j == i: + fuse_layer.append(None) + else: + conv_downsamples = [] + for k in range(i - j): + if k == i - j - 1: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + groups=in_channels[j], + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + 
bias=False), + build_norm_layer(self.norm_cfg, + in_channels[i])[1])) + else: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + groups=in_channels[j], + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + nn.ReLU(inplace=True))) + fuse_layer.append(nn.Sequential(*conv_downsamples)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def forward(self, x): + """Forward function.""" + if self.num_branches == 1: + return [self.layers[0](x[0])] + + if self.module_type.upper() == 'LITE': + out = self.layers(x) + elif self.module_type.upper() == 'NAIVE': + for i in range(self.num_branches): + x[i] = self.layers[i](x[i]) + out = x + + if self.with_fuse: + out_fuse = [] + for i in range(len(self.fuse_layers)): + # `y = 0` will lead to decreased accuracy (0.5~1 mAP) + y = out[0] if i == 0 else self.fuse_layers[i][0](out[0]) + for j in range(self.num_branches): + if i == j: + y += out[j] + else: + y += self.fuse_layers[i][j](out[j]) + out_fuse.append(self.relu(y)) + out = out_fuse + if not self.multiscale_output: + out = [out[0]] + return out + + +@BACKBONES.register_module() +class LiteHRNet(nn.Module): + """Lite-HRNet backbone. + + `Lite-HRNet: A Lightweight High-Resolution Network + `_. + + Code adapted from 'https://github.com/HRNet/Lite-HRNet'. + + Args: + extra (dict): detailed configuration for each stage of HRNet. + in_channels (int): Number of input image channels. Default: 3. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + + Example: + >>> from mmpose.models import LiteHRNet + >>> import torch + >>> extra=dict( + >>> stem=dict(stem_channels=32, out_channels=32, expand_ratio=1), + >>> num_stages=3, + >>> stages_spec=dict( + >>> num_modules=(2, 4, 2), + >>> num_branches=(2, 3, 4), + >>> num_blocks=(2, 2, 2), + >>> module_type=('LITE', 'LITE', 'LITE'), + >>> with_fuse=(True, True, True), + >>> reduce_ratios=(8, 8, 8), + >>> num_channels=( + >>> (40, 80), + >>> (40, 80, 160), + >>> (40, 80, 160, 320), + >>> )), + >>> with_head=False) + >>> self = LiteHRNet(extra, in_channels=1) + >>> self.eval() + >>> inputs = torch.rand(1, 1, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 40, 8, 8) + """ + + def __init__(self, + extra, + in_channels=3, + conv_cfg=None, + norm_cfg=dict(type='BN'), + norm_eval=False, + with_cp=False): + super().__init__() + self.extra = extra + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.stem = Stem( + in_channels, + stem_channels=self.extra['stem']['stem_channels'], + out_channels=self.extra['stem']['out_channels'], + expand_ratio=self.extra['stem']['expand_ratio'], + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + + self.num_stages = self.extra['num_stages'] + self.stages_spec = self.extra['stages_spec'] + + num_channels_last = [ + self.stem.out_channels, + ] + for i in range(self.num_stages): + num_channels = self.stages_spec['num_channels'][i] + num_channels = [num_channels[i] for i in range(len(num_channels))] + setattr( + self, f'transition{i}', + self._make_transition_layer(num_channels_last, num_channels)) + + stage, num_channels_last = self._make_stage( + self.stages_spec, i, num_channels, multiscale_output=True) + setattr(self, f'stage{i}', stage) + + self.with_head = self.extra['with_head'] + if self.with_head: + self.head_layer = IterativeHead( + in_channels=num_channels_last, + norm_cfg=self.norm_cfg, + ) + + def _make_transition_layer(self, num_channels_pre_layer, + num_channels_cur_layer): + """Make transition layer.""" + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + num_channels_pre_layer[i], + num_channels_pre_layer[i], + kernel_size=3, + stride=1, + padding=1, + groups=num_channels_pre_layer[i], + bias=False), + build_norm_layer(self.norm_cfg, + num_channels_pre_layer[i])[1], + build_conv_layer( + self.conv_cfg, + num_channels_pre_layer[i], + num_channels_cur_layer[i], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, + num_channels_cur_layer[i])[1], + nn.ReLU())) + else: + transition_layers.append(None) + else: + conv_downsamples = [] + for j in range(i + 1 - num_branches_pre): + in_channels = num_channels_pre_layer[-1] + out_channels = num_channels_cur_layer[i] \ + if j == i - num_branches_pre else in_channels + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + in_channels, + kernel_size=3, + stride=2, + padding=1, + groups=in_channels, + bias=False), + build_norm_layer(self.norm_cfg, in_channels)[1], + build_conv_layer( + self.conv_cfg, + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, out_channels)[1], + nn.ReLU())) + transition_layers.append(nn.Sequential(*conv_downsamples)) + + return nn.ModuleList(transition_layers) + + def _make_stage(self, + stages_spec, + stage_index, + in_channels, + multiscale_output=True): + num_modules = stages_spec['num_modules'][stage_index] + num_branches = stages_spec['num_branches'][stage_index] + num_blocks = stages_spec['num_blocks'][stage_index] + reduce_ratio = stages_spec['reduce_ratios'][stage_index] + with_fuse = stages_spec['with_fuse'][stage_index] + module_type = stages_spec['module_type'][stage_index] + + modules = [] + for i in range(num_modules): + # multi_scale_output is only used last module + if not multiscale_output and i == 
num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + + modules.append( + LiteHRModule( + num_branches, + num_blocks, + in_channels, + reduce_ratio, + module_type, + multiscale_output=reset_multiscale_output, + with_fuse=with_fuse, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + with_cp=self.with_cp)) + in_channels = modules[-1].in_channels + + return nn.Sequential(*modules), in_channels + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Forward function.""" + x = self.stem(x) + + y_list = [x] + for i in range(self.num_stages): + x_list = [] + transition = getattr(self, f'transition{i}') + for j in range(self.stages_spec['num_branches'][i]): + if transition[j]: + if j >= len(y_list): + x_list.append(transition[j](y_list[-1])) + else: + x_list.append(transition[j](y_list[j])) + else: + x_list.append(y_list[j]) + y_list = getattr(self, f'stage{i}')(x_list) + + x = y_list + if self.with_head: + x = self.head_layer(x) + + return [x[0]] + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/backbones/mobilenet_v2.py b/mmpose/models/backbones/mobilenet_v2.py new file mode 100644 index 0000000..5dc0cd1 --- /dev/null +++ b/mmpose/models/backbones/mobilenet_v2.py @@ -0,0 +1,275 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule, constant_init, kaiming_init +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import load_checkpoint, make_divisible + + +class InvertedResidual(nn.Module): + """InvertedResidual block for MobileNetV2. + + Args: + in_channels (int): The input channels of the InvertedResidual block. + out_channels (int): The output channels of the InvertedResidual block. + stride (int): Stride of the middle (first) 3x3 convolution. + expand_ratio (int): adjusts number of channels of the hidden layer + in InvertedResidual by this amount. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU6'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ + + def __init__(self, + in_channels, + out_channels, + stride, + expand_ratio, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU6'), + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.stride = stride + assert stride in [1, 2], f'stride must in [1, 2]. ' \ + f'But received {stride}.' 
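+        # The block follows the MobileNetV2 design: an optional 1x1 expansion
+        # conv (skipped when expand_ratio == 1), a 3x3 depthwise conv carrying
+        # the stride, and a linear 1x1 projection. A residual connection is
+        # used only when stride == 1 and the input and output channel counts
+        # match, as set up below.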
+ self.with_cp = with_cp + self.use_res_connect = self.stride == 1 and in_channels == out_channels + hidden_dim = int(round(in_channels * expand_ratio)) + + layers = [] + if expand_ratio != 1: + layers.append( + ConvModule( + in_channels=in_channels, + out_channels=hidden_dim, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + layers.extend([ + ConvModule( + in_channels=hidden_dim, + out_channels=hidden_dim, + kernel_size=3, + stride=stride, + padding=1, + groups=hidden_dim, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + in_channels=hidden_dim, + out_channels=out_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + ]) + self.conv = nn.Sequential(*layers) + + def forward(self, x): + + def _inner_forward(x): + if self.use_res_connect: + return x + self.conv(x) + return self.conv(x) + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +@BACKBONES.register_module() +class MobileNetV2(BaseBackbone): + """MobileNetV2 backbone. + + Args: + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Default: 1.0. + out_indices (None or Sequence[int]): Output from which stages. + Default: (7, ). + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU6'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ + + # Parameters to build layers. 4 parameters are needed to construct a + # layer, from left to right: expand_ratio, channel, num_blocks, stride. + arch_settings = [[1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2], + [6, 64, 4, 2], [6, 96, 3, 1], [6, 160, 3, 2], + [6, 320, 1, 1]] + + def __init__(self, + widen_factor=1., + out_indices=(7, ), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU6'), + norm_eval=False, + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.widen_factor = widen_factor + self.out_indices = out_indices + for index in out_indices: + if index not in range(0, 8): + raise ValueError('the item in out_indices must in ' + f'range(0, 8). But received {index}') + + if frozen_stages not in range(-1, 8): + raise ValueError('frozen_stages must be in range(-1, 8). 
' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.in_channels = make_divisible(32 * widen_factor, 8) + + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.layers = [] + + for i, layer_cfg in enumerate(self.arch_settings): + expand_ratio, channel, num_blocks, stride = layer_cfg + out_channels = make_divisible(channel * widen_factor, 8) + inverted_res_layer = self.make_layer( + out_channels=out_channels, + num_blocks=num_blocks, + stride=stride, + expand_ratio=expand_ratio) + layer_name = f'layer{i + 1}' + self.add_module(layer_name, inverted_res_layer) + self.layers.append(layer_name) + + if widen_factor > 1.0: + self.out_channel = int(1280 * widen_factor) + else: + self.out_channel = 1280 + + layer = ConvModule( + in_channels=self.in_channels, + out_channels=self.out_channel, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.add_module('conv2', layer) + self.layers.append('conv2') + + def make_layer(self, out_channels, num_blocks, stride, expand_ratio): + """Stack InvertedResidual blocks to build a layer for MobileNetV2. + + Args: + out_channels (int): out_channels of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Default: 1 + expand_ratio (int): Expand the number of channels of the + hidden layer in InvertedResidual by this ratio. Default: 6. + """ + layers = [] + for i in range(num_blocks): + if i >= 1: + stride = 1 + layers.append( + InvertedResidual( + self.in_channels, + out_channels, + stride, + expand_ratio=expand_ratio, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + x = self.conv1(x) + + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(1, self.frozen_stages + 1): + layer = getattr(self, f'layer{i}') + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/backbones/mobilenet_v3.py b/mmpose/models/backbones/mobilenet_v3.py new file mode 100644 index 0000000..d640abe --- /dev/null +++ b/mmpose/models/backbones/mobilenet_v3.py @@ -0,0 +1,188 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
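+# A minimal usage sketch for the backbone defined below (the input size and
+# the resulting shape assume arch='small' and a random 224x224 image):
+#
+#     >>> import torch
+#     >>> from mmpose.models import MobileNetV3
+#     >>> net = MobileNetV3(arch='small', out_indices=(-1, ))
+#     >>> net.init_weights()
+#     >>> feat = net(torch.rand(1, 3, 224, 224))
+#     >>> tuple(feat.shape)
+#     (1, 96, 7, 7)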
+import copy +import logging + +import torch.nn as nn +from mmcv.cnn import ConvModule, constant_init, kaiming_init +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import InvertedResidual, load_checkpoint + + +@BACKBONES.register_module() +class MobileNetV3(BaseBackbone): + """MobileNetV3 backbone. + + Args: + arch (str): Architecture of mobilnetv3, from {small, big}. + Default: small. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + out_indices (None or Sequence[int]): Output from which stages. + Default: (-1, ), which means output tensors from final stage. + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save + some memory while slowing down the training speed. + Default: False. + """ + # Parameters to build each block: + # [kernel size, mid channels, out channels, with_se, act type, stride] + arch_settings = { + 'small': [[3, 16, 16, True, 'ReLU', 2], + [3, 72, 24, False, 'ReLU', 2], + [3, 88, 24, False, 'ReLU', 1], + [5, 96, 40, True, 'HSwish', 2], + [5, 240, 40, True, 'HSwish', 1], + [5, 240, 40, True, 'HSwish', 1], + [5, 120, 48, True, 'HSwish', 1], + [5, 144, 48, True, 'HSwish', 1], + [5, 288, 96, True, 'HSwish', 2], + [5, 576, 96, True, 'HSwish', 1], + [5, 576, 96, True, 'HSwish', 1]], + 'big': [[3, 16, 16, False, 'ReLU', 1], + [3, 64, 24, False, 'ReLU', 2], + [3, 72, 24, False, 'ReLU', 1], + [5, 72, 40, True, 'ReLU', 2], + [5, 120, 40, True, 'ReLU', 1], + [5, 120, 40, True, 'ReLU', 1], + [3, 240, 80, False, 'HSwish', 2], + [3, 200, 80, False, 'HSwish', 1], + [3, 184, 80, False, 'HSwish', 1], + [3, 184, 80, False, 'HSwish', 1], + [3, 480, 112, True, 'HSwish', 1], + [3, 672, 112, True, 'HSwish', 1], + [5, 672, 160, True, 'HSwish', 1], + [5, 672, 160, True, 'HSwish', 2], + [5, 960, 160, True, 'HSwish', 1]] + } # yapf: disable + + def __init__(self, + arch='small', + conv_cfg=None, + norm_cfg=dict(type='BN'), + out_indices=(-1, ), + frozen_stages=-1, + norm_eval=False, + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + assert arch in self.arch_settings + for index in out_indices: + if index not in range(-len(self.arch_settings[arch]), + len(self.arch_settings[arch])): + raise ValueError('the item in out_indices must in ' + f'range(0, {len(self.arch_settings[arch])}). ' + f'But received {index}') + + if frozen_stages not in range(-1, len(self.arch_settings[arch])): + raise ValueError('frozen_stages must be in range(-1, ' + f'{len(self.arch_settings[arch])}). 
' + f'But received {frozen_stages}') + self.arch = arch + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.in_channels = 16 + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=dict(type='HSwish')) + + self.layers = self._make_layer() + self.feat_dim = self.arch_settings[arch][-1][2] + + def _make_layer(self): + layers = [] + layer_setting = self.arch_settings[self.arch] + for i, params in enumerate(layer_setting): + (kernel_size, mid_channels, out_channels, with_se, act, + stride) = params + if with_se: + se_cfg = dict( + channels=mid_channels, + ratio=4, + act_cfg=(dict(type='ReLU'), dict(type='HSigmoid'))) + else: + se_cfg = None + + layer = InvertedResidual( + in_channels=self.in_channels, + out_channels=out_channels, + mid_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + se_cfg=se_cfg, + with_expand_conv=True, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type=act), + with_cp=self.with_cp) + self.in_channels = out_channels + layer_name = f'layer{i + 1}' + self.add_module(layer_name, layer) + layers.append(layer_name) + return layers + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + x = self.conv1(x) + + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices or \ + i - len(self.layers) in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(1, self.frozen_stages + 1): + layer = getattr(self, f'layer{i}') + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/backbones/mspn.py b/mmpose/models/backbones/mspn.py new file mode 100644 index 0000000..71cee34 --- /dev/null +++ b/mmpose/models/backbones/mspn.py @@ -0,0 +1,513 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp +from collections import OrderedDict + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import (ConvModule, MaxPool2d, constant_init, kaiming_init, + normal_init) +from mmcv.runner.checkpoint import load_state_dict + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .resnet import Bottleneck as _Bottleneck +from .utils.utils import get_state_dict + + +class Bottleneck(_Bottleneck): + expansion = 4 + """Bottleneck block for MSPN. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + stride (int): stride of the block. Default: 1 + downsample (nn.Module): downsample operation on identity branch. 
+ Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + """ + + def __init__(self, in_channels, out_channels, **kwargs): + super().__init__(in_channels, out_channels * 4, **kwargs) + + +class DownsampleModule(nn.Module): + """Downsample module for MSPN. + + Args: + block (nn.Module): Downsample block. + num_blocks (list): Number of blocks in each downsample unit. + num_units (int): Numbers of downsample units. Default: 4 + has_skip (bool): Have skip connections from prior upsample + module or not. Default:False + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + in_channels (int): Number of channels of the input feature to + downsample module. Default: 64 + """ + + def __init__(self, + block, + num_blocks, + num_units=4, + has_skip=False, + norm_cfg=dict(type='BN'), + in_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.has_skip = has_skip + self.in_channels = in_channels + assert len(num_blocks) == num_units + self.num_blocks = num_blocks + self.num_units = num_units + self.norm_cfg = norm_cfg + self.layer1 = self._make_layer(block, in_channels, num_blocks[0]) + for i in range(1, num_units): + module_name = f'layer{i + 1}' + self.add_module( + module_name, + self._make_layer( + block, in_channels * pow(2, i), num_blocks[i], stride=2)) + + def _make_layer(self, block, out_channels, blocks, stride=1): + downsample = None + if stride != 1 or self.in_channels != out_channels * block.expansion: + downsample = ConvModule( + self.in_channels, + out_channels * block.expansion, + kernel_size=1, + stride=stride, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + + units = list() + units.append( + block( + self.in_channels, + out_channels, + stride=stride, + downsample=downsample, + norm_cfg=self.norm_cfg)) + self.in_channels = out_channels * block.expansion + for _ in range(1, blocks): + units.append(block(self.in_channels, out_channels)) + + return nn.Sequential(*units) + + def forward(self, x, skip1, skip2): + out = list() + for i in range(self.num_units): + module_name = f'layer{i + 1}' + module_i = getattr(self, module_name) + x = module_i(x) + if self.has_skip: + x = x + skip1[i] + skip2[i] + out.append(x) + out.reverse() + + return tuple(out) + + +class UpsampleUnit(nn.Module): + """Upsample unit for upsample module. + + Args: + ind (int): Indicates whether to interpolate (>0) and whether to + generate feature map for the next hourglass-like module. + num_units (int): Number of units that form a upsample module. Along + with ind and gen_cross_conv, nm_units is used to decide whether + to generate feature map for the next hourglass-like module. + in_channels (int): Channel number of the skip-in feature maps from + the corresponding downsample unit. + unit_channels (int): Channel number in this unit. Default:256. + gen_skip: (bool): Whether or not to generate skips for the posterior + downsample module. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + out_channels (int): Number of channels of feature output by upsample + module. Must equal to in_channels of downsample module. 
Default:64 + """ + + def __init__(self, + ind, + num_units, + in_channels, + unit_channels=256, + gen_skip=False, + gen_cross_conv=False, + norm_cfg=dict(type='BN'), + out_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.num_units = num_units + self.norm_cfg = norm_cfg + self.in_skip = ConvModule( + in_channels, + unit_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + self.relu = nn.ReLU(inplace=True) + + self.ind = ind + if self.ind > 0: + self.up_conv = ConvModule( + unit_channels, + unit_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + + self.gen_skip = gen_skip + if self.gen_skip: + self.out_skip1 = ConvModule( + in_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + self.out_skip2 = ConvModule( + unit_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + self.gen_cross_conv = gen_cross_conv + if self.ind == num_units - 1 and self.gen_cross_conv: + self.cross_conv = ConvModule( + unit_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + def forward(self, x, up_x): + out = self.in_skip(x) + + if self.ind > 0: + up_x = F.interpolate( + up_x, + size=(x.size(2), x.size(3)), + mode='bilinear', + align_corners=True) + up_x = self.up_conv(up_x) + out = out + up_x + out = self.relu(out) + + skip1 = None + skip2 = None + if self.gen_skip: + skip1 = self.out_skip1(x) + skip2 = self.out_skip2(out) + + cross_conv = None + if self.ind == self.num_units - 1 and self.gen_cross_conv: + cross_conv = self.cross_conv(out) + + return out, skip1, skip2, cross_conv + + +class UpsampleModule(nn.Module): + """Upsample module for MSPN. + + Args: + unit_channels (int): Channel number in the upsample units. + Default:256. + num_units (int): Numbers of upsample units. Default: 4 + gen_skip (bool): Whether to generate skip for posterior downsample + module or not. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + out_channels (int): Number of channels of feature output by upsample + module. Must equal to in_channels of downsample module. 
Default:64 + """ + + def __init__(self, + unit_channels=256, + num_units=4, + gen_skip=False, + gen_cross_conv=False, + norm_cfg=dict(type='BN'), + out_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.in_channels = list() + for i in range(num_units): + self.in_channels.append(Bottleneck.expansion * out_channels * + pow(2, i)) + self.in_channels.reverse() + self.num_units = num_units + self.gen_skip = gen_skip + self.gen_cross_conv = gen_cross_conv + self.norm_cfg = norm_cfg + for i in range(num_units): + module_name = f'up{i + 1}' + self.add_module( + module_name, + UpsampleUnit( + i, + self.num_units, + self.in_channels[i], + unit_channels, + self.gen_skip, + self.gen_cross_conv, + norm_cfg=self.norm_cfg, + out_channels=64)) + + def forward(self, x): + out = list() + skip1 = list() + skip2 = list() + cross_conv = None + for i in range(self.num_units): + module_i = getattr(self, f'up{i + 1}') + if i == 0: + outi, skip1_i, skip2_i, _ = module_i(x[i], None) + elif i == self.num_units - 1: + outi, skip1_i, skip2_i, cross_conv = module_i(x[i], out[i - 1]) + else: + outi, skip1_i, skip2_i, _ = module_i(x[i], out[i - 1]) + out.append(outi) + skip1.append(skip1_i) + skip2.append(skip2_i) + skip1.reverse() + skip2.reverse() + + return out, skip1, skip2, cross_conv + + +class SingleStageNetwork(nn.Module): + """Single_stage Network. + + Args: + unit_channels (int): Channel number in the upsample units. Default:256. + num_units (int): Numbers of downsample/upsample units. Default: 4 + gen_skip (bool): Whether to generate skip for posterior downsample + module or not. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + has_skip (bool): Have skip connections from prior upsample + module or not. Default:False + num_blocks (list): Number of blocks in each downsample unit. + Default: [2, 2, 2, 2] Note: Make sure num_units==len(num_blocks) + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + in_channels (int): Number of channels of the feature from ResNetTop. + Default: 64. + """ + + def __init__(self, + has_skip=False, + gen_skip=False, + gen_cross_conv=False, + unit_channels=256, + num_units=4, + num_blocks=[2, 2, 2, 2], + norm_cfg=dict(type='BN'), + in_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + num_blocks = cp.deepcopy(num_blocks) + super().__init__() + assert len(num_blocks) == num_units + self.has_skip = has_skip + self.gen_skip = gen_skip + self.gen_cross_conv = gen_cross_conv + self.num_units = num_units + self.unit_channels = unit_channels + self.num_blocks = num_blocks + self.norm_cfg = norm_cfg + + self.downsample = DownsampleModule(Bottleneck, num_blocks, num_units, + has_skip, norm_cfg, in_channels) + self.upsample = UpsampleModule(unit_channels, num_units, gen_skip, + gen_cross_conv, norm_cfg, in_channels) + + def forward(self, x, skip1, skip2): + mid = self.downsample(x, skip1, skip2) + out, skip1, skip2, cross_conv = self.upsample(mid) + + return out, skip1, skip2, cross_conv + + +class ResNetTop(nn.Module): + """ResNet top for MSPN. + + Args: + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + channels (int): Number of channels of the feature output by ResNetTop. 
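+        Default: 64.
+
+    Example (a minimal sketch with a random image; the input size is
+        illustrative):
+        >>> import torch
+        >>> top = ResNetTop()
+        >>> feat = top(torch.rand(1, 3, 256, 256))
+        >>> tuple(feat.shape)  # downsampled by 4, `channels` feature maps
+        (1, 64, 64, 64)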
+ """ + + def __init__(self, norm_cfg=dict(type='BN'), channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.top = nn.Sequential( + ConvModule( + 3, + channels, + kernel_size=7, + stride=2, + padding=3, + norm_cfg=norm_cfg, + inplace=True), MaxPool2d(kernel_size=3, stride=2, padding=1)) + + def forward(self, img): + return self.top(img) + + +@BACKBONES.register_module() +class MSPN(BaseBackbone): + """MSPN backbone. Paper ref: Li et al. "Rethinking on Multi-Stage Networks + for Human Pose Estimation" (CVPR 2020). + + Args: + unit_channels (int): Number of Channels in an upsample unit. + Default: 256 + num_stages (int): Number of stages in a multi-stage MSPN. Default: 4 + num_units (int): Number of downsample/upsample units in a single-stage + network. Default: 4 + Note: Make sure num_units == len(self.num_blocks) + num_blocks (list): Number of bottlenecks in each + downsample unit. Default: [2, 2, 2, 2] + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + res_top_channels (int): Number of channels of feature from ResNetTop. + Default: 64. + + Example: + >>> from mmpose.models import MSPN + >>> import torch + >>> self = MSPN(num_stages=2,num_units=2,num_blocks=[2,2]) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 511, 511) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... for feature in level_output: + ... print(tuple(feature.shape)) + ... + (1, 256, 64, 64) + (1, 256, 128, 128) + (1, 256, 64, 64) + (1, 256, 128, 128) + """ + + def __init__(self, + unit_channels=256, + num_stages=4, + num_units=4, + num_blocks=[2, 2, 2, 2], + norm_cfg=dict(type='BN'), + res_top_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + num_blocks = cp.deepcopy(num_blocks) + super().__init__() + self.unit_channels = unit_channels + self.num_stages = num_stages + self.num_units = num_units + self.num_blocks = num_blocks + self.norm_cfg = norm_cfg + + assert self.num_stages > 0 + assert self.num_units > 1 + assert self.num_units == len(self.num_blocks) + self.top = ResNetTop(norm_cfg=norm_cfg) + self.multi_stage_mspn = nn.ModuleList([]) + for i in range(self.num_stages): + if i == 0: + has_skip = False + else: + has_skip = True + if i != self.num_stages - 1: + gen_skip = True + gen_cross_conv = True + else: + gen_skip = False + gen_cross_conv = False + self.multi_stage_mspn.append( + SingleStageNetwork(has_skip, gen_skip, gen_cross_conv, + unit_channels, num_units, num_blocks, + norm_cfg, res_top_channels)) + + def forward(self, x): + """Model forward function.""" + out_feats = [] + skip1 = None + skip2 = None + x = self.top(x) + for i in range(self.num_stages): + out, skip1, skip2, x = self.multi_stage_mspn[i](x, skip1, skip2) + out_feats.append(out) + + return out_feats + + def init_weights(self, pretrained=None): + """Initialize model weights.""" + if isinstance(pretrained, str): + logger = get_root_logger() + state_dict_tmp = get_state_dict(pretrained) + state_dict = OrderedDict() + state_dict['top'] = OrderedDict() + state_dict['bottlenecks'] = OrderedDict() + for k, v in state_dict_tmp.items(): + if k.startswith('layer'): + if 'downsample.0' in k: + state_dict['bottlenecks'][k.replace( + 'downsample.0', 'downsample.conv')] = v + elif 'downsample.1' in k: + state_dict['bottlenecks'][k.replace( + 'downsample.1', 'downsample.bn')] = v + else: + state_dict['bottlenecks'][k] = v + elif k.startswith('conv1'): + 
state_dict['top'][k.replace('conv1', 'top.0.conv')] = v + elif k.startswith('bn1'): + state_dict['top'][k.replace('bn1', 'top.0.bn')] = v + + load_state_dict( + self.top, state_dict['top'], strict=False, logger=logger) + for i in range(self.num_stages): + load_state_dict( + self.multi_stage_mspn[i].downsample, + state_dict['bottlenecks'], + strict=False, + logger=logger) + else: + for m in self.multi_stage_mspn.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + elif isinstance(m, nn.Linear): + normal_init(m, std=0.01) + + for m in self.top.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) diff --git a/mmpose/models/backbones/regnet.py b/mmpose/models/backbones/regnet.py new file mode 100644 index 0000000..693417c --- /dev/null +++ b/mmpose/models/backbones/regnet.py @@ -0,0 +1,317 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import numpy as np +import torch.nn as nn +from mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from .resnet import ResNet +from .resnext import Bottleneck + + +@BACKBONES.register_module() +class RegNet(ResNet): + """RegNet backbone. + + More details can be found in `paper `__ . + + Args: + arch (dict): The parameter of RegNets. + - w0 (int): initial width + - wa (float): slope of width + - wm (float): quantization parameter to quantize the width + - depth (int): depth of the backbone + - group_w (int): width of group + - bot_mul (float): bottleneck ratio, i.e. expansion of bottleneck. + strides (Sequence[int]): Strides of the first block of each stage. + base_channels (int): Base channels after stem layer. + in_channels (int): Number of input image channels. Default: 3. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. Default: "pytorch". + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. Default: -1. + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + + Example: + >>> from mmpose.models import RegNet + >>> import torch + >>> self = RegNet( + arch=dict( + w0=88, + wa=26.31, + wm=2.25, + group_w=48, + depth=25, + bot_mul=1.0), + out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 96, 8, 8) + (1, 192, 4, 4) + (1, 432, 2, 2) + (1, 1008, 1, 1) + """ + arch_settings = { + 'regnetx_400mf': + dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0), + 'regnetx_800mf': + dict(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16, bot_mul=1.0), + 'regnetx_1.6gf': + dict(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18, bot_mul=1.0), + 'regnetx_3.2gf': + dict(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25, bot_mul=1.0), + 'regnetx_4.0gf': + dict(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23, bot_mul=1.0), + 'regnetx_6.4gf': + dict(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17, bot_mul=1.0), + 'regnetx_8.0gf': + dict(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23, bot_mul=1.0), + 'regnetx_12gf': + dict(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, bot_mul=1.0), + } + + def __init__(self, + arch, + in_channels=3, + stem_channels=32, + base_channels=32, + strides=(2, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(3, ), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + with_cp=False, + zero_init_residual=True): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super(ResNet, self).__init__() + + # Generate RegNet parameters first + if isinstance(arch, str): + assert arch in self.arch_settings, \ + f'"arch": "{arch}" is not one of the' \ + ' arch_settings' + arch = self.arch_settings[arch] + elif not isinstance(arch, dict): + raise TypeError('Expect "arch" to be either a string ' + f'or a dict, got {type(arch)}') + + widths, num_stages = self.generate_regnet( + arch['w0'], + arch['wa'], + arch['wm'], + arch['depth'], + ) + # Convert to per stage format + stage_widths, stage_blocks = self.get_stages_from_blocks(widths) + # Generate group widths and bot muls + group_widths = [arch['group_w'] for _ in range(num_stages)] + self.bottleneck_ratio = [arch['bot_mul'] for _ in range(num_stages)] + # Adjust the compatibility of stage_widths and group_widths + stage_widths, group_widths = self.adjust_width_group( + stage_widths, self.bottleneck_ratio, group_widths) + + # Group params by stage + self.stage_widths = stage_widths + self.group_widths = group_widths + self.depth = sum(stage_blocks) + self.stem_channels = stem_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + if self.deep_stem: + raise NotImplementedError( + 'deep_stem has not been implemented for RegNet') + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.zero_init_residual = zero_init_residual + self.stage_blocks = stage_blocks[:num_stages] + + self._make_stem_layer(in_channels, stem_channels) + + _in_channels = stem_channels + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = self.strides[i] + dilation = self.dilations[i] + group_width = self.group_widths[i] + width = int(round(self.stage_widths[i] * self.bottleneck_ratio[i])) + stage_groups = width // group_width + + res_layer = self.make_res_layer( + block=Bottleneck, + num_blocks=num_blocks, + in_channels=_in_channels, + 
out_channels=self.stage_widths[i], + expansion=1, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=self.with_cp, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + base_channels=self.stage_widths[i], + groups=stage_groups, + width_per_group=group_width) + _in_channels = self.stage_widths[i] + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = stage_widths[-1] + + def _make_stem_layer(self, in_channels, base_channels): + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + base_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, base_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + + @staticmethod + def generate_regnet(initial_width, + width_slope, + width_parameter, + depth, + divisor=8): + """Generates per block width from RegNet parameters. + + Args: + initial_width ([int]): Initial width of the backbone + width_slope ([float]): Slope of the quantized linear function + width_parameter ([int]): Parameter used to quantize the width. + depth ([int]): Depth of the backbone. + divisor (int, optional): The divisor of channels. Defaults to 8. + + Returns: + list, int: return a list of widths of each stage and the number of + stages + """ + assert width_slope >= 0 + assert initial_width > 0 + assert width_parameter > 1 + assert initial_width % divisor == 0 + widths_cont = np.arange(depth) * width_slope + initial_width + ks = np.round( + np.log(widths_cont / initial_width) / np.log(width_parameter)) + widths = initial_width * np.power(width_parameter, ks) + widths = np.round(np.divide(widths, divisor)) * divisor + num_stages = len(np.unique(widths)) + widths, widths_cont = widths.astype(int).tolist(), widths_cont.tolist() + return widths, num_stages + + @staticmethod + def quantize_float(number, divisor): + """Converts a float to closest non-zero int divisible by divior. + + Args: + number (int): Original number to be quantized. + divisor (int): Divisor used to quantize the number. + + Returns: + int: quantized number that is divisible by devisor. + """ + return int(round(number / divisor) * divisor) + + def adjust_width_group(self, widths, bottleneck_ratio, groups): + """Adjusts the compatibility of widths and groups. + + Args: + widths (list[int]): Width of each stage. + bottleneck_ratio (float): Bottleneck ratio. + groups (int): number of groups in each stage + + Returns: + tuple(list): The adjusted widths and groups of each stage. + """ + bottleneck_width = [ + int(w * b) for w, b in zip(widths, bottleneck_ratio) + ] + groups = [min(g, w_bot) for g, w_bot in zip(groups, bottleneck_width)] + bottleneck_width = [ + self.quantize_float(w_bot, g) + for w_bot, g in zip(bottleneck_width, groups) + ] + widths = [ + int(w_bot / b) + for w_bot, b in zip(bottleneck_width, bottleneck_ratio) + ] + return widths, groups + + def get_stages_from_blocks(self, widths): + """Gets widths/stage_blocks of network at each stage. + + Args: + widths (list[int]): Width in each stage. 
+ + Returns: + tuple(list): width and depth of each stage + """ + width_diff = [ + width != width_prev + for width, width_prev in zip(widths + [0], [0] + widths) + ] + stage_widths = [ + width for width, diff in zip(widths, width_diff[:-1]) if diff + ] + stage_blocks = np.diff([ + depth for depth, diff in zip(range(len(width_diff)), width_diff) + if diff + ]).tolist() + return stage_widths, stage_blocks + + def forward(self, x): + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + return tuple(outs) diff --git a/mmpose/models/backbones/resnest.py b/mmpose/models/backbones/resnest.py new file mode 100644 index 0000000..0a2d408 --- /dev/null +++ b/mmpose/models/backbones/resnest.py @@ -0,0 +1,338 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResLayer, ResNetV1d + + +class RSoftmax(nn.Module): + """Radix Softmax module in ``SplitAttentionConv2d``. + + Args: + radix (int): Radix of input. + groups (int): Groups of input. + """ + + def __init__(self, radix, groups): + super().__init__() + self.radix = radix + self.groups = groups + + def forward(self, x): + batch = x.size(0) + if self.radix > 1: + x = x.view(batch, self.groups, self.radix, -1).transpose(1, 2) + x = F.softmax(x, dim=1) + x = x.reshape(batch, -1) + else: + x = torch.sigmoid(x) + return x + + +class SplitAttentionConv2d(nn.Module): + """Split-Attention Conv2d. + + Args: + in_channels (int): Same as nn.Conv2d. + out_channels (int): Same as nn.Conv2d. + kernel_size (int | tuple[int]): Same as nn.Conv2d. + stride (int | tuple[int]): Same as nn.Conv2d. + padding (int | tuple[int]): Same as nn.Conv2d. + dilation (int | tuple[int]): Same as nn.Conv2d. + groups (int): Same as nn.Conv2d. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of SplitAttentionConv2d. + Default: 4. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. Default: None. 
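+
+    Example (a minimal sketch with a random input; sizes are illustrative):
+        >>> import torch
+        >>> conv = SplitAttentionConv2d(
+        ...     64, 64, kernel_size=3, stride=1, padding=1, radix=2)
+        >>> out = conv(torch.rand(1, 64, 56, 56))
+        >>> tuple(out.shape)
+        (1, 64, 56, 56)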
+ """ + + def __init__(self, + in_channels, + channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + radix=2, + reduction_factor=4, + conv_cfg=None, + norm_cfg=dict(type='BN')): + super().__init__() + inter_channels = max(in_channels * radix // reduction_factor, 32) + self.radix = radix + self.groups = groups + self.channels = channels + self.conv = build_conv_layer( + conv_cfg, + in_channels, + channels * radix, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups * radix, + bias=False) + self.norm0_name, norm0 = build_norm_layer( + norm_cfg, channels * radix, postfix=0) + self.add_module(self.norm0_name, norm0) + self.relu = nn.ReLU(inplace=True) + self.fc1 = build_conv_layer( + None, channels, inter_channels, 1, groups=self.groups) + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, inter_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.fc2 = build_conv_layer( + None, inter_channels, channels * radix, 1, groups=self.groups) + self.rsoftmax = RSoftmax(radix, groups) + + @property + def norm0(self): + return getattr(self, self.norm0_name) + + @property + def norm1(self): + return getattr(self, self.norm1_name) + + def forward(self, x): + x = self.conv(x) + x = self.norm0(x) + x = self.relu(x) + + batch, rchannel = x.shape[:2] + if self.radix > 1: + splits = x.view(batch, self.radix, -1, *x.shape[2:]) + gap = splits.sum(dim=1) + else: + gap = x + gap = F.adaptive_avg_pool2d(gap, 1) + gap = self.fc1(gap) + + gap = self.norm1(gap) + gap = self.relu(gap) + + atten = self.fc2(gap) + atten = self.rsoftmax(atten).view(batch, -1, 1, 1) + + if self.radix > 1: + attens = atten.view(batch, self.radix, -1, *atten.shape[2:]) + out = torch.sum(attens * splits, dim=1) + else: + out = atten * x + return out.contiguous() + + +class Bottleneck(_Bottleneck): + """Bottleneck block for ResNeSt. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + groups (int): Groups of conv2. + width_per_group (int): Width per group of conv2. 64x4d indicates + ``groups=64, width_per_group=4`` and 32x8d indicates + ``groups=32, width_per_group=8``. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of SplitAttentionConv2d. + Default: 4. + avg_down_stride (bool): Whether to use average pool for stride in + Bottleneck. Default: True. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. 
+ """ + + def __init__(self, + in_channels, + out_channels, + groups=1, + width_per_group=4, + base_channels=64, + radix=2, + reduction_factor=4, + avg_down_stride=True, + **kwargs): + super().__init__(in_channels, out_channels, **kwargs) + + self.groups = groups + self.width_per_group = width_per_group + + # For ResNet bottleneck, middle channels are determined by expansion + # and out_channels, but for ResNeXt bottleneck, it is determined by + # groups and width_per_group and the stage it is located in. + if groups != 1: + assert self.mid_channels % base_channels == 0 + self.mid_channels = ( + groups * width_per_group * self.mid_channels // base_channels) + + self.avg_down_stride = avg_down_stride and self.conv2_stride > 1 + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=1) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.out_channels, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.in_channels, + self.mid_channels, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = SplitAttentionConv2d( + self.mid_channels, + self.mid_channels, + kernel_size=3, + stride=1 if self.avg_down_stride else self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + radix=radix, + reduction_factor=reduction_factor, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + delattr(self, self.norm2_name) + + if self.avg_down_stride: + self.avd_layer = nn.AvgPool2d(3, self.conv2_stride, padding=1) + + self.conv3 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.out_channels, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + def forward(self, x): + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + + if self.avg_down_stride: + out = self.avd_layer(out) + + out = self.conv3(out) + out = self.norm3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@BACKBONES.register_module() +class ResNeSt(ResNetV1d): + """ResNeSt backbone. + + Please refer to the `paper `__ + for details. + + Args: + depth (int): Network depth, from {50, 101, 152, 200}. + groups (int): Groups of conv2 in Bottleneck. Default: 32. + width_per_group (int): Width per group of conv2 in Bottleneck. + Default: 4. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of SplitAttentionConv2d. + Default: 4. + avg_down_stride (bool): Whether to use average pool for stride in + Bottleneck. Default: True. + in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Output channels of the stem layer. Default: 64. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. 
If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)), + 200: (Bottleneck, (3, 24, 36, 3)), + 269: (Bottleneck, (3, 30, 48, 8)) + } + + def __init__(self, + depth, + groups=1, + width_per_group=4, + radix=2, + reduction_factor=4, + avg_down_stride=True, + **kwargs): + self.groups = groups + self.width_per_group = width_per_group + self.radix = radix + self.reduction_factor = reduction_factor + self.avg_down_stride = avg_down_stride + super().__init__(depth=depth, **kwargs) + + def make_res_layer(self, **kwargs): + return ResLayer( + groups=self.groups, + width_per_group=self.width_per_group, + base_channels=self.base_channels, + radix=self.radix, + reduction_factor=self.reduction_factor, + avg_down_stride=self.avg_down_stride, + **kwargs) diff --git a/mmpose/models/backbones/resnet.py b/mmpose/models/backbones/resnet.py new file mode 100644 index 0000000..649496a --- /dev/null +++ b/mmpose/models/backbones/resnet.py @@ -0,0 +1,701 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import (ConvModule, build_conv_layer, build_norm_layer, + constant_init, kaiming_init) +from mmcv.utils.parrots_wrapper import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +class BasicBlock(nn.Module): + """BasicBlock for ResNet. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + expansion (int): The ratio of ``out_channels/mid_channels`` where + ``mid_channels`` is the output channels of conv1. This is a + reserved argument in BasicBlock and should always be 1. Default: 1. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None. + style (str): `pytorch` or `caffe`. It is unused and reserved for + unified API with Bottleneck. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. 
+ Default: dict(type='BN') + """ + + def __init__(self, + in_channels, + out_channels, + expansion=1, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN')): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.expansion = expansion + assert self.expansion == 1 + assert out_channels % expansion == 0 + self.mid_channels = out_channels // expansion + self.stride = stride + self.dilation = dilation + self.style = style + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + norm_cfg, out_channels, postfix=2) + + self.conv1 = build_conv_layer( + conv_cfg, + in_channels, + self.mid_channels, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, + self.mid_channels, + out_channels, + 3, + padding=1, + bias=False) + self.add_module(self.norm2_name, norm2) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + """Bottleneck block for ResNet. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + expansion (int): The ratio of ``out_channels/mid_channels`` where + ``mid_channels`` is the input/output channels of conv2. Default: 4. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None. + style (str): ``"pytorch"`` or ``"caffe"``. If set to "pytorch", the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Default: "pytorch". + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. 
+ Default: dict(type='BN') + """ + + def __init__(self, + in_channels, + out_channels, + expansion=4, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN')): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + assert style in ['pytorch', 'caffe'] + + self.in_channels = in_channels + self.out_channels = out_channels + self.expansion = expansion + assert out_channels % expansion == 0 + self.mid_channels = out_channels // expansion + self.stride = stride + self.dilation = dilation + self.style = style + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + if self.style == 'pytorch': + self.conv1_stride = 1 + self.conv2_stride = stride + else: + self.conv1_stride = stride + self.conv2_stride = 1 + + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + norm_cfg, self.mid_channels, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + norm_cfg, out_channels, postfix=3) + + self.conv1 = build_conv_layer( + conv_cfg, + in_channels, + self.mid_channels, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, + self.mid_channels, + self.mid_channels, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + conv_cfg, + self.mid_channels, + out_channels, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + @property + def norm3(self): + """nn.Module: the normalization layer named "norm3" """ + return getattr(self, self.norm3_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.norm3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +def get_expansion(block, expansion=None): + """Get the expansion of a residual block. + + The block expansion will be obtained by the following order: + + 1. If ``expansion`` is given, just return it. + 2. If ``block`` has the attribute ``expansion``, then return + ``block.expansion``. + 3. Return the default value according the the block type: + 1 for ``BasicBlock`` and 4 for ``Bottleneck``. + + Args: + block (class): The block class. + expansion (int | None): The given expansion ratio. + + Returns: + int: The expansion of the block. 
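+
+    Example:
+        A small sketch of the resolution order described above:
+
+        >>> get_expansion(BasicBlock)
+        1
+        >>> get_expansion(Bottleneck)
+        4
+        >>> get_expansion(Bottleneck, expansion=8)
+        8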
+ """ + if isinstance(expansion, int): + assert expansion > 0 + elif expansion is None: + if hasattr(block, 'expansion'): + expansion = block.expansion + elif issubclass(block, BasicBlock): + expansion = 1 + elif issubclass(block, Bottleneck): + expansion = 4 + else: + raise TypeError(f'expansion is not specified for {block.__name__}') + else: + raise TypeError('expansion must be an integer or None') + + return expansion + + +class ResLayer(nn.Sequential): + """ResLayer to build ResNet style backbone. + + Args: + block (nn.Module): Residual block used to build ResLayer. + num_blocks (int): Number of blocks. + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + expansion (int, optional): The expansion for BasicBlock/Bottleneck. + If not specified, it will firstly be obtained via + ``block.expansion``. If the block has no attribute "expansion", + the following default values will be used: 1 for BasicBlock and + 4 for Bottleneck. Default: None. + stride (int): stride of the first block. Default: 1. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + downsample_first (bool): Downsample at the first block or last block. + False for Hourglass, True for ResNet. Default: True + """ + + def __init__(self, + block, + num_blocks, + in_channels, + out_channels, + expansion=None, + stride=1, + avg_down=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + downsample_first=True, + **kwargs): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + self.block = block + self.expansion = get_expansion(block, expansion) + + downsample = None + if stride != 1 or in_channels != out_channels: + downsample = [] + conv_stride = stride + if avg_down and stride != 1: + conv_stride = 1 + downsample.append( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False)) + downsample.extend([ + build_conv_layer( + conv_cfg, + in_channels, + out_channels, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, out_channels)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + if downsample_first: + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + in_channels = out_channels + for _ in range(1, num_blocks): + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + else: # downsample_first=False is for HourglassModule + for i in range(0, num_blocks - 1): + layers.append( + block( + in_channels=in_channels, + out_channels=in_channels, + expansion=self.expansion, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + + super().__init__(*layers) + + +@BACKBONES.register_module() +class ResNet(BaseBackbone): + """ResNet backbone. + + Please refer to the `paper `__ for + details. + + Args: + depth (int): Network depth, from {18, 34, 50, 101, 152}. 
+ in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Output channels of the stem layer. Default: 64. + base_channels (int): Middle channels of the first stage. Default: 64. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + + Example: + >>> from mmpose.models import ResNet + >>> import torch + >>> self = ResNet(depth=18, out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 64, 8, 8) + (1, 128, 4, 4) + (1, 256, 2, 2) + (1, 512, 1, 1) + """ + + arch_settings = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, + depth, + in_channels=3, + stem_channels=64, + base_channels=64, + expansion=None, + num_stages=4, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(3, ), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + with_cp=False, + zero_init_residual=True): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + self.depth = depth + self.stem_channels = stem_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.zero_init_residual = zero_init_residual + self.block, stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + self.expansion = get_expansion(self.block, expansion) + + self._make_stem_layer(in_channels, stem_channels) + + self.res_layers = [] + _in_channels = stem_channels + _out_channels = base_channels * self.expansion + for i, num_blocks in enumerate(self.stage_blocks): + stride = strides[i] + dilation = dilations[i] + res_layer = self.make_res_layer( + block=self.block, + num_blocks=num_blocks, + in_channels=_in_channels, + out_channels=_out_channels, + expansion=self.expansion, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + _in_channels = _out_channels + _out_channels *= 2 + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = res_layer[-1].out_channels + + def make_res_layer(self, **kwargs): + """Make a ResLayer.""" + return ResLayer(**kwargs) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def _make_stem_layer(self, in_channels, stem_channels): + """Make stem layer.""" + if self.deep_stem: + self.stem = nn.Sequential( + ConvModule( + in_channels, + stem_channels // 2, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True), + ConvModule( + stem_channels // 2, + stem_channels // 2, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True), + ConvModule( + stem_channels // 2, + stem_channels, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True)) + else: + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels, + kernel_size=7, + stride=2, + padding=3, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, stem_channels, postfix=1) + 
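+            # build_norm_layer() returns a (name, module) pair; registering
+            # the module under ``norm1_name`` lets the ``norm1`` property
+            # fetch it later via getattr().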
self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + def _freeze_stages(self): + """Freeze parameters.""" + if self.frozen_stages >= 0: + if self.deep_stem: + self.stem.eval() + for param in self.stem.parameters(): + param.requires_grad = False + else: + self.norm1.eval() + for m in [self.conv1, self.norm1]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + super().init_weights(pretrained) + if pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + constant_init(m.norm3, 0) + elif isinstance(m, BasicBlock): + constant_init(m.norm2, 0) + + def forward(self, x): + """Forward function.""" + if self.deep_stem: + x = self.stem(x) + else: + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() + + +@BACKBONES.register_module() +class ResNetV1d(ResNet): + r"""ResNetV1d variant described in `Bag of Tricks + `__. + + Compared with default ResNet(ResNetV1b), ResNetV1d replaces the 7x7 conv in + the input stem with three 3x3 convs. And in the downsampling block, a 2x2 + avg_pool with stride 2 is added before conv, whose stride is changed to 1. + """ + + def __init__(self, **kwargs): + super().__init__(deep_stem=True, avg_down=True, **kwargs) diff --git a/mmpose/models/backbones/resnext.py b/mmpose/models/backbones/resnext.py new file mode 100644 index 0000000..c10dc33 --- /dev/null +++ b/mmpose/models/backbones/resnext.py @@ -0,0 +1,162 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResLayer, ResNet + + +class Bottleneck(_Bottleneck): + """Bottleneck block for ResNeXt. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + groups (int): Groups of conv2. + width_per_group (int): Width per group of conv2. 64x4d indicates + ``groups=64, width_per_group=4`` and 32x8d indicates + ``groups=32, width_per_group=8``. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + conv_cfg (dict): dictionary to construct and config conv layer. 
+ Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + """ + + def __init__(self, + in_channels, + out_channels, + base_channels=64, + groups=32, + width_per_group=4, + **kwargs): + super().__init__(in_channels, out_channels, **kwargs) + self.groups = groups + self.width_per_group = width_per_group + + # For ResNet bottleneck, middle channels are determined by expansion + # and out_channels, but for ResNeXt bottleneck, it is determined by + # groups and width_per_group and the stage it is located in. + if groups != 1: + assert self.mid_channels % base_channels == 0 + self.mid_channels = ( + groups * width_per_group * self.mid_channels // base_channels) + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.out_channels, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.in_channels, + self.mid_channels, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.mid_channels, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.out_channels, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + +@BACKBONES.register_module() +class ResNeXt(ResNet): + """ResNeXt backbone. + + Please refer to the `paper `__ for + details. + + Args: + depth (int): Network depth, from {50, 101, 152}. + groups (int): Groups of conv2 in Bottleneck. Default: 32. + width_per_group (int): Width per group of conv2 in Bottleneck. + Default: 4. + in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Output channels of the stem layer. Default: 64. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. 
Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + + Example: + >>> from mmpose.models import ResNeXt + >>> import torch + >>> self = ResNeXt(depth=50, out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 256, 8, 8) + (1, 512, 4, 4) + (1, 1024, 2, 2) + (1, 2048, 1, 1) + """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, depth, groups=32, width_per_group=4, **kwargs): + self.groups = groups + self.width_per_group = width_per_group + super().__init__(depth, **kwargs) + + def make_res_layer(self, **kwargs): + return ResLayer( + groups=self.groups, + width_per_group=self.width_per_group, + base_channels=self.base_channels, + **kwargs) diff --git a/mmpose/models/backbones/rsn.py b/mmpose/models/backbones/rsn.py new file mode 100644 index 0000000..29038af --- /dev/null +++ b/mmpose/models/backbones/rsn.py @@ -0,0 +1,616 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import (ConvModule, MaxPool2d, constant_init, kaiming_init, + normal_init) + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +class RSB(nn.Module): + """Residual Steps block for RSN. Paper ref: Cai et al. "Learning Delicate + Local Representations for Multi-Person Pose Estimation" (ECCV 2020). + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + num_steps (int): Numbers of steps in RSB + stride (int): stride of the block. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None. + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + expand_times (int): Times by which the in_channels are expanded. + Default:26. + res_top_channels (int): Number of channels of feature output by + ResNet_top. Default:64. 
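+
+    Example:
+        Illustrative sketch; channel counts assume the defaults
+        ``expand_times=26`` and ``res_top_channels=64``:
+
+        >>> import torch
+        >>> block = RSB(64, 64, num_steps=4)
+        >>> inputs = torch.rand(1, 64, 64, 64)
+        >>> tuple(block(inputs).shape)
+        (1, 64, 64, 64)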
+ """ + + expansion = 1 + + def __init__(self, + in_channels, + out_channels, + num_steps=4, + stride=1, + downsample=None, + with_cp=False, + norm_cfg=dict(type='BN'), + expand_times=26, + res_top_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + assert num_steps > 1 + self.in_channels = in_channels + self.branch_channels = self.in_channels * expand_times + self.branch_channels //= res_top_channels + self.out_channels = out_channels + self.stride = stride + self.downsample = downsample + self.with_cp = with_cp + self.norm_cfg = norm_cfg + self.num_steps = num_steps + self.conv_bn_relu1 = ConvModule( + self.in_channels, + self.num_steps * self.branch_channels, + kernel_size=1, + stride=self.stride, + padding=0, + norm_cfg=self.norm_cfg, + inplace=False) + for i in range(self.num_steps): + for j in range(i + 1): + module_name = f'conv_bn_relu2_{i + 1}_{j + 1}' + self.add_module( + module_name, + ConvModule( + self.branch_channels, + self.branch_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + inplace=False)) + self.conv_bn3 = ConvModule( + self.num_steps * self.branch_channels, + self.out_channels * self.expansion, + kernel_size=1, + stride=1, + padding=0, + act_cfg=None, + norm_cfg=self.norm_cfg, + inplace=False) + self.relu = nn.ReLU(inplace=False) + + def forward(self, x): + """Forward function.""" + + identity = x + x = self.conv_bn_relu1(x) + spx = torch.split(x, self.branch_channels, 1) + outputs = list() + outs = list() + for i in range(self.num_steps): + outputs_i = list() + outputs.append(outputs_i) + for j in range(i + 1): + if j == 0: + inputs = spx[i] + else: + inputs = outputs[i][j - 1] + if i > j: + inputs = inputs + outputs[i - 1][j] + module_name = f'conv_bn_relu2_{i + 1}_{j + 1}' + module_i_j = getattr(self, module_name) + outputs[i].append(module_i_j(inputs)) + + outs.append(outputs[i][i]) + out = torch.cat(tuple(outs), 1) + out = self.conv_bn3(out) + + if self.downsample is not None: + identity = self.downsample(identity) + out = out + identity + + out = self.relu(out) + + return out + + +class Downsample_module(nn.Module): + """Downsample module for RSN. + + Args: + block (nn.Module): Downsample block. + num_blocks (list): Number of blocks in each downsample unit. + num_units (int): Numbers of downsample units. Default: 4 + has_skip (bool): Have skip connections from prior upsample + module or not. Default:False + num_steps (int): Number of steps in a block. Default:4 + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + in_channels (int): Number of channels of the input feature to + downsample module. Default: 64 + expand_times (int): Times by which the in_channels are expanded. + Default:26. 
+ """ + + def __init__(self, + block, + num_blocks, + num_steps=4, + num_units=4, + has_skip=False, + norm_cfg=dict(type='BN'), + in_channels=64, + expand_times=26): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.has_skip = has_skip + self.in_channels = in_channels + assert len(num_blocks) == num_units + self.num_blocks = num_blocks + self.num_units = num_units + self.num_steps = num_steps + self.norm_cfg = norm_cfg + self.layer1 = self._make_layer( + block, + in_channels, + num_blocks[0], + expand_times=expand_times, + res_top_channels=in_channels) + for i in range(1, num_units): + module_name = f'layer{i + 1}' + self.add_module( + module_name, + self._make_layer( + block, + in_channels * pow(2, i), + num_blocks[i], + stride=2, + expand_times=expand_times, + res_top_channels=in_channels)) + + def _make_layer(self, + block, + out_channels, + blocks, + stride=1, + expand_times=26, + res_top_channels=64): + downsample = None + if stride != 1 or self.in_channels != out_channels * block.expansion: + downsample = ConvModule( + self.in_channels, + out_channels * block.expansion, + kernel_size=1, + stride=stride, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + + units = list() + units.append( + block( + self.in_channels, + out_channels, + num_steps=self.num_steps, + stride=stride, + downsample=downsample, + norm_cfg=self.norm_cfg, + expand_times=expand_times, + res_top_channels=res_top_channels)) + self.in_channels = out_channels * block.expansion + for _ in range(1, blocks): + units.append( + block( + self.in_channels, + out_channels, + num_steps=self.num_steps, + expand_times=expand_times, + res_top_channels=res_top_channels)) + + return nn.Sequential(*units) + + def forward(self, x, skip1, skip2): + out = list() + for i in range(self.num_units): + module_name = f'layer{i + 1}' + module_i = getattr(self, module_name) + x = module_i(x) + if self.has_skip: + x = x + skip1[i] + skip2[i] + out.append(x) + out.reverse() + + return tuple(out) + + +class Upsample_unit(nn.Module): + """Upsample unit for upsample module. + + Args: + ind (int): Indicates whether to interpolate (>0) and whether to + generate feature map for the next hourglass-like module. + num_units (int): Number of units that form a upsample module. Along + with ind and gen_cross_conv, nm_units is used to decide whether + to generate feature map for the next hourglass-like module. + in_channels (int): Channel number of the skip-in feature maps from + the corresponding downsample unit. + unit_channels (int): Channel number in this unit. Default:256. + gen_skip: (bool): Whether or not to generate skips for the posterior + downsample module. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + out_channels (in): Number of channels of feature output by upsample + module. Must equal to in_channels of downsample module. 
Default:64 + """ + + def __init__(self, + ind, + num_units, + in_channels, + unit_channels=256, + gen_skip=False, + gen_cross_conv=False, + norm_cfg=dict(type='BN'), + out_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.num_units = num_units + self.norm_cfg = norm_cfg + self.in_skip = ConvModule( + in_channels, + unit_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + self.relu = nn.ReLU(inplace=True) + + self.ind = ind + if self.ind > 0: + self.up_conv = ConvModule( + unit_channels, + unit_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + + self.gen_skip = gen_skip + if self.gen_skip: + self.out_skip1 = ConvModule( + in_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + self.out_skip2 = ConvModule( + unit_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + self.gen_cross_conv = gen_cross_conv + if self.ind == num_units - 1 and self.gen_cross_conv: + self.cross_conv = ConvModule( + unit_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + def forward(self, x, up_x): + out = self.in_skip(x) + + if self.ind > 0: + up_x = F.interpolate( + up_x, + size=(x.size(2), x.size(3)), + mode='bilinear', + align_corners=True) + up_x = self.up_conv(up_x) + out = out + up_x + out = self.relu(out) + + skip1 = None + skip2 = None + if self.gen_skip: + skip1 = self.out_skip1(x) + skip2 = self.out_skip2(out) + + cross_conv = None + if self.ind == self.num_units - 1 and self.gen_cross_conv: + cross_conv = self.cross_conv(out) + + return out, skip1, skip2, cross_conv + + +class Upsample_module(nn.Module): + """Upsample module for RSN. + + Args: + unit_channels (int): Channel number in the upsample units. + Default:256. + num_units (int): Numbers of upsample units. Default: 4 + gen_skip (bool): Whether to generate skip for posterior downsample + module or not. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + out_channels (int): Number of channels of feature output by upsample + module. Must equal to in_channels of downsample module. 
Default:64 + """ + + def __init__(self, + unit_channels=256, + num_units=4, + gen_skip=False, + gen_cross_conv=False, + norm_cfg=dict(type='BN'), + out_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.in_channels = list() + for i in range(num_units): + self.in_channels.append(RSB.expansion * out_channels * pow(2, i)) + self.in_channels.reverse() + self.num_units = num_units + self.gen_skip = gen_skip + self.gen_cross_conv = gen_cross_conv + self.norm_cfg = norm_cfg + for i in range(num_units): + module_name = f'up{i + 1}' + self.add_module( + module_name, + Upsample_unit( + i, + self.num_units, + self.in_channels[i], + unit_channels, + self.gen_skip, + self.gen_cross_conv, + norm_cfg=self.norm_cfg, + out_channels=64)) + + def forward(self, x): + out = list() + skip1 = list() + skip2 = list() + cross_conv = None + for i in range(self.num_units): + module_i = getattr(self, f'up{i + 1}') + if i == 0: + outi, skip1_i, skip2_i, _ = module_i(x[i], None) + elif i == self.num_units - 1: + outi, skip1_i, skip2_i, cross_conv = module_i(x[i], out[i - 1]) + else: + outi, skip1_i, skip2_i, _ = module_i(x[i], out[i - 1]) + out.append(outi) + skip1.append(skip1_i) + skip2.append(skip2_i) + skip1.reverse() + skip2.reverse() + + return out, skip1, skip2, cross_conv + + +class Single_stage_RSN(nn.Module): + """Single_stage Residual Steps Network. + + Args: + unit_channels (int): Channel number in the upsample units. Default:256. + num_units (int): Numbers of downsample/upsample units. Default: 4 + gen_skip (bool): Whether to generate skip for posterior downsample + module or not. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + has_skip (bool): Have skip connections from prior upsample + module or not. Default:False + num_steps (int): Number of steps in RSB. Default: 4 + num_blocks (list): Number of blocks in each downsample unit. + Default: [2, 2, 2, 2] Note: Make sure num_units==len(num_blocks) + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + in_channels (int): Number of channels of the feature from ResNet_Top. + Default: 64. + expand_times (int): Times by which the in_channels are expanded in RSB. + Default:26. + """ + + def __init__(self, + has_skip=False, + gen_skip=False, + gen_cross_conv=False, + unit_channels=256, + num_units=4, + num_steps=4, + num_blocks=[2, 2, 2, 2], + norm_cfg=dict(type='BN'), + in_channels=64, + expand_times=26): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + num_blocks = cp.deepcopy(num_blocks) + super().__init__() + assert len(num_blocks) == num_units + self.has_skip = has_skip + self.gen_skip = gen_skip + self.gen_cross_conv = gen_cross_conv + self.num_units = num_units + self.num_steps = num_steps + self.unit_channels = unit_channels + self.num_blocks = num_blocks + self.norm_cfg = norm_cfg + + self.downsample = Downsample_module(RSB, num_blocks, num_steps, + num_units, has_skip, norm_cfg, + in_channels, expand_times) + self.upsample = Upsample_module(unit_channels, num_units, gen_skip, + gen_cross_conv, norm_cfg, in_channels) + + def forward(self, x, skip1, skip2): + mid = self.downsample(x, skip1, skip2) + out, skip1, skip2, cross_conv = self.upsample(mid) + + return out, skip1, skip2, cross_conv + + +class ResNet_top(nn.Module): + """ResNet top for RSN. + + Args: + norm_cfg (dict): dictionary to construct and config norm layer. 
+ Default: dict(type='BN') + channels (int): Number of channels of the feature output by ResNet_top. + """ + + def __init__(self, norm_cfg=dict(type='BN'), channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.top = nn.Sequential( + ConvModule( + 3, + channels, + kernel_size=7, + stride=2, + padding=3, + norm_cfg=norm_cfg, + inplace=True), MaxPool2d(kernel_size=3, stride=2, padding=1)) + + def forward(self, img): + return self.top(img) + + +@BACKBONES.register_module() +class RSN(BaseBackbone): + """Residual Steps Network backbone. Paper ref: Cai et al. "Learning + Delicate Local Representations for Multi-Person Pose Estimation" (ECCV + 2020). + + Args: + unit_channels (int): Number of Channels in an upsample unit. + Default: 256 + num_stages (int): Number of stages in a multi-stage RSN. Default: 4 + num_units (int): NUmber of downsample/upsample units in a single-stage + RSN. Default: 4 Note: Make sure num_units == len(self.num_blocks) + num_blocks (list): Number of RSBs (Residual Steps Block) in each + downsample unit. Default: [2, 2, 2, 2] + num_steps (int): Number of steps in a RSB. Default:4 + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + res_top_channels (int): Number of channels of feature from ResNet_top. + Default: 64. + expand_times (int): Times by which the in_channels are expanded in RSB. + Default:26. + Example: + >>> from mmpose.models import RSN + >>> import torch + >>> self = RSN(num_stages=2,num_units=2,num_blocks=[2,2]) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 511, 511) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... for feature in level_output: + ... print(tuple(feature.shape)) + ... 
+ (1, 256, 64, 64) + (1, 256, 128, 128) + (1, 256, 64, 64) + (1, 256, 128, 128) + """ + + def __init__(self, + unit_channels=256, + num_stages=4, + num_units=4, + num_blocks=[2, 2, 2, 2], + num_steps=4, + norm_cfg=dict(type='BN'), + res_top_channels=64, + expand_times=26): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + num_blocks = cp.deepcopy(num_blocks) + super().__init__() + self.unit_channels = unit_channels + self.num_stages = num_stages + self.num_units = num_units + self.num_blocks = num_blocks + self.num_steps = num_steps + self.norm_cfg = norm_cfg + + assert self.num_stages > 0 + assert self.num_steps > 1 + assert self.num_units > 1 + assert self.num_units == len(self.num_blocks) + self.top = ResNet_top(norm_cfg=norm_cfg) + self.multi_stage_rsn = nn.ModuleList([]) + for i in range(self.num_stages): + if i == 0: + has_skip = False + else: + has_skip = True + if i != self.num_stages - 1: + gen_skip = True + gen_cross_conv = True + else: + gen_skip = False + gen_cross_conv = False + self.multi_stage_rsn.append( + Single_stage_RSN(has_skip, gen_skip, gen_cross_conv, + unit_channels, num_units, num_steps, + num_blocks, norm_cfg, res_top_channels, + expand_times)) + + def forward(self, x): + """Model forward function.""" + out_feats = [] + skip1 = None + skip2 = None + x = self.top(x) + for i in range(self.num_stages): + out, skip1, skip2, x = self.multi_stage_rsn[i](x, skip1, skip2) + out_feats.append(out) + + return out_feats + + def init_weights(self, pretrained=None): + """Initialize model weights.""" + for m in self.multi_stage_rsn.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + elif isinstance(m, nn.Linear): + normal_init(m, std=0.01) + + for m in self.top.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) diff --git a/mmpose/models/backbones/scnet.py b/mmpose/models/backbones/scnet.py new file mode 100644 index 0000000..3786c57 --- /dev/null +++ b/mmpose/models/backbones/scnet.py @@ -0,0 +1,248 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from .resnet import Bottleneck, ResNet + + +class SCConv(nn.Module): + """SCConv (Self-calibrated Convolution) + + Args: + in_channels (int): The input channels of the SCConv. + out_channels (int): The output channel of the SCConv. + stride (int): stride of SCConv. + pooling_r (int): size of pooling for scconv. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. 
+ Default: dict(type='BN') + """ + + def __init__(self, + in_channels, + out_channels, + stride, + pooling_r, + conv_cfg=None, + norm_cfg=dict(type='BN', momentum=0.1)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + assert in_channels == out_channels + + self.k2 = nn.Sequential( + nn.AvgPool2d(kernel_size=pooling_r, stride=pooling_r), + build_conv_layer( + conv_cfg, + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(norm_cfg, in_channels)[1], + ) + self.k3 = nn.Sequential( + build_conv_layer( + conv_cfg, + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(norm_cfg, in_channels)[1], + ) + self.k4 = nn.Sequential( + build_conv_layer( + conv_cfg, + in_channels, + in_channels, + kernel_size=3, + stride=stride, + padding=1, + bias=False), + build_norm_layer(norm_cfg, out_channels)[1], + nn.ReLU(inplace=True), + ) + + def forward(self, x): + """Forward function.""" + identity = x + + out = torch.sigmoid( + torch.add(identity, F.interpolate(self.k2(x), + identity.size()[2:]))) + out = torch.mul(self.k3(x), out) + out = self.k4(out) + + return out + + +class SCBottleneck(Bottleneck): + """SC(Self-calibrated) Bottleneck. + + Args: + in_channels (int): The input channels of the SCBottleneck block. + out_channels (int): The output channel of the SCBottleneck block. + """ + + pooling_r = 4 + + def __init__(self, in_channels, out_channels, **kwargs): + super().__init__(in_channels, out_channels, **kwargs) + self.mid_channels = out_channels // self.expansion // 2 + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, out_channels, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + self.mid_channels, + kernel_size=1, + stride=1, + bias=False) + self.add_module(self.norm1_name, norm1) + + self.k1 = nn.Sequential( + build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.mid_channels, + kernel_size=3, + stride=self.stride, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, self.mid_channels)[1], + nn.ReLU(inplace=True)) + + self.conv2 = build_conv_layer( + self.conv_cfg, + in_channels, + self.mid_channels, + kernel_size=1, + stride=1, + bias=False) + self.add_module(self.norm2_name, norm2) + + self.scconv = SCConv(self.mid_channels, self.mid_channels, self.stride, + self.pooling_r, self.conv_cfg, self.norm_cfg) + + self.conv3 = build_conv_layer( + self.conv_cfg, + self.mid_channels * 2, + out_channels, + kernel_size=1, + stride=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out_a = self.conv1(x) + out_a = self.norm1(out_a) + out_a = self.relu(out_a) + + out_a = self.k1(out_a) + + out_b = self.conv2(x) + out_b = self.norm2(out_b) + out_b = self.relu(out_b) + + out_b = self.scconv(out_b) + + out = self.conv3(torch.cat([out_a, out_b], dim=1)) + out = self.norm3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@BACKBONES.register_module() +class SCNet(ResNet): + """SCNet backbone. 
+ + Improving Convolutional Networks with Self-Calibrated Convolutions, + Jiang-Jiang Liu, Qibin Hou, Ming-Ming Cheng, Changhu Wang, Jiashi Feng, + IEEE CVPR, 2020. + http://mftp.mmcheng.net/Papers/20cvprSCNet.pdf + + Args: + depth (int): Depth of scnet, from {50, 101}. + in_channels (int): Number of input image channels. Normally 3. + base_channels (int): Number of base channels of hidden layer. + num_stages (int): SCNet stages, normally 4. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + norm_cfg (dict): Dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. + + Example: + >>> from mmpose.models import SCNet + >>> import torch + >>> self = SCNet(depth=50, out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 224, 224) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 256, 56, 56) + (1, 512, 28, 28) + (1, 1024, 14, 14) + (1, 2048, 7, 7) + """ + + arch_settings = { + 50: (SCBottleneck, [3, 4, 6, 3]), + 101: (SCBottleneck, [3, 4, 23, 3]) + } + + def __init__(self, depth, **kwargs): + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for SCNet') + super().__init__(depth, **kwargs) diff --git a/mmpose/models/backbones/seresnet.py b/mmpose/models/backbones/seresnet.py new file mode 100644 index 0000000..ac2d53b --- /dev/null +++ b/mmpose/models/backbones/seresnet.py @@ -0,0 +1,125 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.utils.checkpoint as cp + +from ..builder import BACKBONES +from .resnet import Bottleneck, ResLayer, ResNet +from .utils.se_layer import SELayer + + +class SEBottleneck(Bottleneck): + """SEBottleneck block for SEResNet. + + Args: + in_channels (int): The input channels of the SEBottleneck block. + out_channels (int): The output channel of the SEBottleneck block. + se_ratio (int): Squeeze ratio in SELayer. 
Default: 16 + """ + + def __init__(self, in_channels, out_channels, se_ratio=16, **kwargs): + super().__init__(in_channels, out_channels, **kwargs) + self.se_layer = SELayer(out_channels, ratio=se_ratio) + + def forward(self, x): + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.norm3(out) + + out = self.se_layer(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@BACKBONES.register_module() +class SEResNet(ResNet): + """SEResNet backbone. + + Please refer to the `paper `__ for + details. + + Args: + depth (int): Network depth, from {50, 101, 152}. + se_ratio (int): Squeeze ratio in SELayer. Default: 16. + in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Output channels of the stem layer. Default: 64. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + + Example: + >>> from mmpose.models import SEResNet + >>> import torch + >>> self = SEResNet(depth=50, out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 224, 224) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 256, 56, 56) + (1, 512, 28, 28) + (1, 1024, 14, 14) + (1, 2048, 7, 7) + """ + + arch_settings = { + 50: (SEBottleneck, (3, 4, 6, 3)), + 101: (SEBottleneck, (3, 4, 23, 3)), + 152: (SEBottleneck, (3, 8, 36, 3)) + } + + def __init__(self, depth, se_ratio=16, **kwargs): + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for SEResNet') + self.se_ratio = se_ratio + super().__init__(depth, **kwargs) + + def make_res_layer(self, **kwargs): + return ResLayer(se_ratio=self.se_ratio, **kwargs) diff --git a/mmpose/models/backbones/seresnext.py b/mmpose/models/backbones/seresnext.py new file mode 100644 index 0000000..c5c4e4c --- /dev/null +++ b/mmpose/models/backbones/seresnext.py @@ -0,0 +1,168 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from .resnet import ResLayer +from .seresnet import SEBottleneck as _SEBottleneck +from .seresnet import SEResNet + + +class SEBottleneck(_SEBottleneck): + """SEBottleneck block for SEResNeXt. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + base_channels (int): Middle channels of the first stage. Default: 64. + groups (int): Groups of conv2. + width_per_group (int): Width per group of conv2. 64x4d indicates + ``groups=64, width_per_group=4`` and 32x8d indicates + ``groups=32, width_per_group=8``. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None + se_ratio (int): Squeeze ratio in SELayer. Default: 16 + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + """ + + def __init__(self, + in_channels, + out_channels, + base_channels=64, + groups=32, + width_per_group=4, + se_ratio=16, + **kwargs): + super().__init__(in_channels, out_channels, se_ratio, **kwargs) + self.groups = groups + self.width_per_group = width_per_group + + # We follow the same rational of ResNext to compute mid_channels. + # For SEResNet bottleneck, middle channels are determined by expansion + # and out_channels, but for SEResNeXt bottleneck, it is determined by + # groups and width_per_group and the stage it is located in. 
+ if groups != 1: + assert self.mid_channels % base_channels == 0 + self.mid_channels = ( + groups * width_per_group * self.mid_channels // base_channels) + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.out_channels, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.in_channels, + self.mid_channels, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.mid_channels, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.out_channels, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + +@BACKBONES.register_module() +class SEResNeXt(SEResNet): + """SEResNeXt backbone. + + Please refer to the `paper `__ for + details. + + Args: + depth (int): Network depth, from {50, 101, 152}. + groups (int): Groups of conv2 in Bottleneck. Default: 32. + width_per_group (int): Width per group of conv2 in Bottleneck. + Default: 4. + se_ratio (int): Squeeze ratio in SELayer. Default: 16. + in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Output channels of the stem layer. Default: 64. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + + Example: + >>> from mmpose.models import SEResNeXt + >>> import torch + >>> self = SEResNet(depth=50, out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 224, 224) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 256, 56, 56) + (1, 512, 28, 28) + (1, 1024, 14, 14) + (1, 2048, 7, 7) + """ + + arch_settings = { + 50: (SEBottleneck, (3, 4, 6, 3)), + 101: (SEBottleneck, (3, 4, 23, 3)), + 152: (SEBottleneck, (3, 8, 36, 3)) + } + + def __init__(self, depth, groups=32, width_per_group=4, **kwargs): + self.groups = groups + self.width_per_group = width_per_group + super().__init__(depth, **kwargs) + + def make_res_layer(self, **kwargs): + return ResLayer( + groups=self.groups, + width_per_group=self.width_per_group, + base_channels=self.base_channels, + **kwargs) diff --git a/mmpose/models/backbones/shufflenet_v1.py b/mmpose/models/backbones/shufflenet_v1.py new file mode 100644 index 0000000..9f98cbd --- /dev/null +++ b/mmpose/models/backbones/shufflenet_v1.py @@ -0,0 +1,329 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import (ConvModule, build_activation_layer, constant_init, + normal_init) +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import channel_shuffle, load_checkpoint, make_divisible + + +class ShuffleUnit(nn.Module): + """ShuffleUnit block. + + ShuffleNet unit with pointwise group convolution (GConv) and channel + shuffle. + + Args: + in_channels (int): The input channels of the ShuffleUnit. + out_channels (int): The output channels of the ShuffleUnit. + groups (int, optional): The number of groups to be used in grouped 1x1 + convolutions in each ShuffleUnit. Default: 3 + first_block (bool, optional): Whether it is the first ShuffleUnit of a + sequential ShuffleUnits. Default: True, which means not using the + grouped 1x1 convolution. + combine (str, optional): The ways to combine the input and output + branches. Default: 'add'. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + with_cp (bool, optional): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + + Returns: + Tensor: The output tensor. + """ + + def __init__(self, + in_channels, + out_channels, + groups=3, + first_block=True, + combine='add', + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.first_block = first_block + self.combine = combine + self.groups = groups + self.bottleneck_channels = self.out_channels // 4 + self.with_cp = with_cp + + if self.combine == 'add': + self.depthwise_stride = 1 + self._combine_func = self._add + assert in_channels == out_channels, ( + 'in_channels must be equal to out_channels when combine ' + 'is add') + elif self.combine == 'concat': + self.depthwise_stride = 2 + self._combine_func = self._concat + self.out_channels -= self.in_channels + self.avgpool = nn.AvgPool2d(kernel_size=3, stride=2, padding=1) + else: + raise ValueError(f'Cannot combine tensors with {self.combine}. 
' + 'Only "add" and "concat" are supported') + + self.first_1x1_groups = 1 if first_block else self.groups + self.g_conv_1x1_compress = ConvModule( + in_channels=self.in_channels, + out_channels=self.bottleneck_channels, + kernel_size=1, + groups=self.first_1x1_groups, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.depthwise_conv3x3_bn = ConvModule( + in_channels=self.bottleneck_channels, + out_channels=self.bottleneck_channels, + kernel_size=3, + stride=self.depthwise_stride, + padding=1, + groups=self.bottleneck_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.g_conv_1x1_expand = ConvModule( + in_channels=self.bottleneck_channels, + out_channels=self.out_channels, + kernel_size=1, + groups=self.groups, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.act = build_activation_layer(act_cfg) + + @staticmethod + def _add(x, out): + # residual connection + return x + out + + @staticmethod + def _concat(x, out): + # concatenate along channel axis + return torch.cat((x, out), 1) + + def forward(self, x): + + def _inner_forward(x): + residual = x + + out = self.g_conv_1x1_compress(x) + out = self.depthwise_conv3x3_bn(out) + + if self.groups > 1: + out = channel_shuffle(out, self.groups) + + out = self.g_conv_1x1_expand(out) + + if self.combine == 'concat': + residual = self.avgpool(residual) + out = self.act(out) + out = self._combine_func(residual, out) + else: + out = self._combine_func(residual, out) + out = self.act(out) + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +@BACKBONES.register_module() +class ShuffleNetV1(BaseBackbone): + """ShuffleNetV1 backbone. + + Args: + groups (int, optional): The number of groups to be used in grouped 1x1 + convolutions in each ShuffleUnit. Default: 3. + widen_factor (float, optional): Width multiplier - adjusts the number + of channels in each layer by this amount. Default: 1.0. + out_indices (Sequence[int]): Output from which stages. + Default: (2, ) + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ + + def __init__(self, + groups=3, + widen_factor=1.0, + out_indices=(2, ), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + norm_eval=False, + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.stage_blocks = [4, 8, 4] + self.groups = groups + + for index in out_indices: + if index not in range(0, 3): + raise ValueError('the item in out_indices must in ' + f'range(0, 3). But received {index}') + + if frozen_stages not in range(-1, 3): + raise ValueError('frozen_stages must be in range(-1, 3). 
' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + if groups == 1: + channels = (144, 288, 576) + elif groups == 2: + channels = (200, 400, 800) + elif groups == 3: + channels = (240, 480, 960) + elif groups == 4: + channels = (272, 544, 1088) + elif groups == 8: + channels = (384, 768, 1536) + else: + raise ValueError(f'{groups} groups is not supported for 1x1 ' + 'Grouped Convolutions') + + channels = [make_divisible(ch * widen_factor, 8) for ch in channels] + + self.in_channels = int(24 * widen_factor) + + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.layers = nn.ModuleList() + for i, num_blocks in enumerate(self.stage_blocks): + first_block = (i == 0) + layer = self.make_layer(channels[i], num_blocks, first_block) + self.layers.append(layer) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(self.frozen_stages): + layer = self.layers[i] + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for name, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + if 'conv1' in name: + normal_init(m, mean=0, std=0.01) + else: + normal_init(m, mean=0, std=1.0 / m.weight.shape[1]) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, val=1, bias=0.0001) + if isinstance(m, _BatchNorm): + if m.running_mean is not None: + nn.init.constant_(m.running_mean, 0) + else: + raise TypeError('pretrained must be a str or None. But received ' + f'{type(pretrained)}') + + def make_layer(self, out_channels, num_blocks, first_block=False): + """Stack ShuffleUnit blocks to make a layer. + + Args: + out_channels (int): out_channels of the block. + num_blocks (int): Number of blocks. + first_block (bool, optional): Whether is the first ShuffleUnit of a + sequential ShuffleUnits. Default: False, which means using + the grouped 1x1 convolution. 
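+
+        Returns:
+            nn.Sequential: The stacked ShuffleUnit blocks forming one stage.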
+ """ + layers = [] + for i in range(num_blocks): + first_block = first_block if i == 0 else False + combine_mode = 'concat' if i == 0 else 'add' + layers.append( + ShuffleUnit( + self.in_channels, + out_channels, + groups=self.groups, + first_block=first_block, + combine=combine_mode, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.maxpool(x) + + outs = [] + for i, layer in enumerate(self.layers): + x = layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/backbones/shufflenet_v2.py b/mmpose/models/backbones/shufflenet_v2.py new file mode 100644 index 0000000..e935333 --- /dev/null +++ b/mmpose/models/backbones/shufflenet_v2.py @@ -0,0 +1,302 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule, constant_init, normal_init +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import channel_shuffle, load_checkpoint + + +class InvertedResidual(nn.Module): + """InvertedResidual block for ShuffleNetV2 backbone. + + Args: + in_channels (int): The input channels of the block. + out_channels (int): The output channels of the block. + stride (int): Stride of the 3x3 convolution layer. Default: 1 + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
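+
+    Example (usage sketch; the channel widths below follow the
+        widen_factor=1.0 stage channels of ShuffleNetV2):
+        >>> import torch
+        >>> from mmpose.models.backbones.shufflenet_v2 import InvertedResidual
+        >>> block = InvertedResidual(116, 116, stride=1)
+        >>> x = torch.rand(1, 116, 28, 28)
+        >>> print(tuple(block(x).shape))
+        (1, 116, 28, 28)
+        >>> down = InvertedResidual(116, 232, stride=2)
+        >>> print(tuple(down(x).shape))
+        (1, 232, 14, 14)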
+ """ + + def __init__(self, + in_channels, + out_channels, + stride=1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.stride = stride + self.with_cp = with_cp + + branch_features = out_channels // 2 + if self.stride == 1: + assert in_channels == branch_features * 2, ( + f'in_channels ({in_channels}) should equal to ' + f'branch_features * 2 ({branch_features * 2}) ' + 'when stride is 1') + + if in_channels != branch_features * 2: + assert self.stride != 1, ( + f'stride ({self.stride}) should not equal 1 when ' + f'in_channels != branch_features * 2') + + if self.stride > 1: + self.branch1 = nn.Sequential( + ConvModule( + in_channels, + in_channels, + kernel_size=3, + stride=self.stride, + padding=1, + groups=in_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + in_channels, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ) + + self.branch2 = nn.Sequential( + ConvModule( + in_channels if (self.stride > 1) else branch_features, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + branch_features, + branch_features, + kernel_size=3, + stride=self.stride, + padding=1, + groups=branch_features, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + branch_features, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x): + + def _inner_forward(x): + if self.stride > 1: + out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) + else: + x1, x2 = x.chunk(2, dim=1) + out = torch.cat((x1, self.branch2(x2)), dim=1) + + out = channel_shuffle(out, 2) + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +@BACKBONES.register_module() +class ShuffleNetV2(BaseBackbone): + """ShuffleNetV2 backbone. + + Args: + widen_factor (float): Width multiplier - adjusts the number of + channels in each layer by this amount. Default: 1.0. + out_indices (Sequence[int]): Output from which stages. + Default: (0, 1, 2, 3). + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
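+
+    Example (usage sketch; output shapes assume a 224x224 input and follow
+        from the stage channels and strides described above):
+        >>> from mmpose.models import ShuffleNetV2
+        >>> import torch
+        >>> model = ShuffleNetV2(widen_factor=1.0, out_indices=(0, 1, 2, 3))
+        >>> model.eval()
+        >>> inputs = torch.rand(1, 3, 224, 224)
+        >>> outputs = model.forward(inputs)
+        >>> for out in outputs:
+        ...     print(tuple(out.shape))
+        (1, 116, 28, 28)
+        (1, 232, 14, 14)
+        (1, 464, 7, 7)
+        (1, 1024, 7, 7)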
+ """ + + def __init__(self, + widen_factor=1.0, + out_indices=(3, ), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + norm_eval=False, + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.stage_blocks = [4, 8, 4] + for index in out_indices: + if index not in range(0, 4): + raise ValueError('the item in out_indices must in ' + f'range(0, 4). But received {index}') + + if frozen_stages not in range(-1, 4): + raise ValueError('frozen_stages must be in range(-1, 4). ' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + if widen_factor == 0.5: + channels = [48, 96, 192, 1024] + elif widen_factor == 1.0: + channels = [116, 232, 464, 1024] + elif widen_factor == 1.5: + channels = [176, 352, 704, 1024] + elif widen_factor == 2.0: + channels = [244, 488, 976, 2048] + else: + raise ValueError('widen_factor must be in [0.5, 1.0, 1.5, 2.0]. ' + f'But received {widen_factor}') + + self.in_channels = 24 + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.layers = nn.ModuleList() + for i, num_blocks in enumerate(self.stage_blocks): + layer = self._make_layer(channels[i], num_blocks) + self.layers.append(layer) + + output_channels = channels[-1] + self.layers.append( + ConvModule( + in_channels=self.in_channels, + out_channels=output_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def _make_layer(self, out_channels, num_blocks): + """Stack blocks to make a layer. + + Args: + out_channels (int): out_channels of the block. + num_blocks (int): number of blocks. + """ + layers = [] + for i in range(num_blocks): + stride = 2 if i == 0 else 1 + layers.append( + InvertedResidual( + in_channels=self.in_channels, + out_channels=out_channels, + stride=stride, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + + for i in range(self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for name, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + if 'conv1' in name: + normal_init(m, mean=0, std=0.01) + else: + normal_init(m, mean=0, std=1.0 / m.weight.shape[1]) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m.weight, val=1, bias=0.0001) + if isinstance(m, _BatchNorm): + if m.running_mean is not None: + nn.init.constant_(m.running_mean, 0) + else: + raise TypeError('pretrained must be a str or None. 
But received ' + f'{type(pretrained)}') + + def forward(self, x): + x = self.conv1(x) + x = self.maxpool(x) + + outs = [] + for i, layer in enumerate(self.layers): + x = layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() diff --git a/mmpose/models/backbones/tcn.py b/mmpose/models/backbones/tcn.py new file mode 100644 index 0000000..deca229 --- /dev/null +++ b/mmpose/models/backbones/tcn.py @@ -0,0 +1,267 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +from mmcv.cnn import ConvModule, build_conv_layer, constant_init, kaiming_init +from mmcv.utils.parrots_wrapper import _BatchNorm + +from mmpose.core import WeightNormClipHook +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +class BasicTemporalBlock(nn.Module): + """Basic block for VideoPose3D. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + mid_channels (int): The output channels of conv1. Default: 1024. + kernel_size (int): Size of the convolving kernel. Default: 3. + dilation (int): Spacing between kernel elements. Default: 3. + dropout (float): Dropout rate. Default: 0.25. + causal (bool): Use causal convolutions instead of symmetric + convolutions (for real-time applications). Default: False. + residual (bool): Use residual connection. Default: True. + use_stride_conv (bool): Use optimized TCN that designed + specifically for single-frame batching, i.e. where batches have + input length = receptive field, and output length = 1. This + implementation replaces dilated convolutions with strided + convolutions to avoid generating unused intermediate results. + Default: False. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: dict(type='Conv1d'). + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN1d'). 
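+
+    Example (usage sketch; the output length follows from the default
+        kernel_size=3 and dilation=3, which trim 3 frames on each side):
+        >>> import torch
+        >>> from mmpose.models.backbones.tcn import BasicTemporalBlock
+        >>> block = BasicTemporalBlock(in_channels=1024, out_channels=1024)
+        >>> block.eval()
+        >>> inputs = torch.rand(1, 1024, 243)
+        >>> print(tuple(block(inputs).shape))
+        (1, 1024, 237)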
+ """ + + def __init__(self, + in_channels, + out_channels, + mid_channels=1024, + kernel_size=3, + dilation=3, + dropout=0.25, + causal=False, + residual=True, + use_stride_conv=False, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d')): + # Protect mutable default arguments + conv_cfg = copy.deepcopy(conv_cfg) + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.mid_channels = mid_channels + self.kernel_size = kernel_size + self.dilation = dilation + self.dropout = dropout + self.causal = causal + self.residual = residual + self.use_stride_conv = use_stride_conv + + self.pad = (kernel_size - 1) * dilation // 2 + if use_stride_conv: + self.stride = kernel_size + self.causal_shift = kernel_size // 2 if causal else 0 + self.dilation = 1 + else: + self.stride = 1 + self.causal_shift = kernel_size // 2 * dilation if causal else 0 + + self.conv1 = nn.Sequential( + ConvModule( + in_channels, + mid_channels, + kernel_size=kernel_size, + stride=self.stride, + dilation=self.dilation, + bias='auto', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg)) + self.conv2 = nn.Sequential( + ConvModule( + mid_channels, + out_channels, + kernel_size=1, + bias='auto', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg)) + + if residual and in_channels != out_channels: + self.short_cut = build_conv_layer(conv_cfg, in_channels, + out_channels, 1) + else: + self.short_cut = None + + self.dropout = nn.Dropout(dropout) if dropout > 0 else None + + def forward(self, x): + """Forward function.""" + if self.use_stride_conv: + assert self.causal_shift + self.kernel_size // 2 < x.shape[2] + else: + assert 0 <= self.pad + self.causal_shift < x.shape[2] - \ + self.pad + self.causal_shift <= x.shape[2] + + out = self.conv1(x) + if self.dropout is not None: + out = self.dropout(out) + + out = self.conv2(out) + if self.dropout is not None: + out = self.dropout(out) + + if self.residual: + if self.use_stride_conv: + res = x[:, :, self.causal_shift + + self.kernel_size // 2::self.kernel_size] + else: + res = x[:, :, + (self.pad + self.causal_shift):(x.shape[2] - self.pad + + self.causal_shift)] + + if self.short_cut is not None: + res = self.short_cut(res) + out = out + res + + return out + + +@BACKBONES.register_module() +class TCN(BaseBackbone): + """TCN backbone. + + Temporal Convolutional Networks. + More details can be found in the + `paper `__ . + + Args: + in_channels (int): Number of input channels, which equals to + num_keypoints * num_features. + stem_channels (int): Number of feature channels. Default: 1024. + num_blocks (int): NUmber of basic temporal convolutional blocks. + Default: 2. + kernel_sizes (Sequence[int]): Sizes of the convolving kernel of + each basic block. Default: ``(3, 3, 3)``. + dropout (float): Dropout rate. Default: 0.25. + causal (bool): Use causal convolutions instead of symmetric + convolutions (for real-time applications). + Default: False. + residual (bool): Use residual connection. Default: True. + use_stride_conv (bool): Use TCN backbone optimized for + single-frame batching, i.e. where batches have input length = + receptive field, and output length = 1. This implementation + replaces dilated convolutions with strided convolutions to avoid + generating unused intermediate results. The weights are + interchangeable with the reference implementation. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: dict(type='Conv1d'). 
+ norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN1d'). + max_norm (float|None): if not None, the weight of convolution layers + will be clipped to have a maximum norm of max_norm. + + Example: + >>> from mmpose.models import TCN + >>> import torch + >>> self = TCN(in_channels=34) + >>> self.eval() + >>> inputs = torch.rand(1, 34, 243) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 1024, 235) + (1, 1024, 217) + """ + + def __init__(self, + in_channels, + stem_channels=1024, + num_blocks=2, + kernel_sizes=(3, 3, 3), + dropout=0.25, + causal=False, + residual=True, + use_stride_conv=False, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + max_norm=None): + # Protect mutable default arguments + conv_cfg = copy.deepcopy(conv_cfg) + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self.in_channels = in_channels + self.stem_channels = stem_channels + self.num_blocks = num_blocks + self.kernel_sizes = kernel_sizes + self.dropout = dropout + self.causal = causal + self.residual = residual + self.use_stride_conv = use_stride_conv + self.max_norm = max_norm + + assert num_blocks == len(kernel_sizes) - 1 + for ks in kernel_sizes: + assert ks % 2 == 1, 'Only odd filter widths are supported.' + + self.expand_conv = ConvModule( + in_channels, + stem_channels, + kernel_size=kernel_sizes[0], + stride=kernel_sizes[0] if use_stride_conv else 1, + bias='auto', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + + dilation = kernel_sizes[0] + self.tcn_blocks = nn.ModuleList() + for i in range(1, num_blocks + 1): + self.tcn_blocks.append( + BasicTemporalBlock( + in_channels=stem_channels, + out_channels=stem_channels, + mid_channels=stem_channels, + kernel_size=kernel_sizes[i], + dilation=dilation, + dropout=dropout, + causal=causal, + residual=residual, + use_stride_conv=use_stride_conv, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg)) + dilation *= kernel_sizes[i] + + if self.max_norm is not None: + # Apply weight norm clip to conv layers + weight_clip = WeightNormClipHook(self.max_norm) + for module in self.modules(): + if isinstance(module, nn.modules.conv._ConvNd): + weight_clip.register(module) + + self.dropout = nn.Dropout(dropout) if dropout > 0 else None + + def forward(self, x): + """Forward function.""" + x = self.expand_conv(x) + + if self.dropout is not None: + x = self.dropout(x) + + outs = [] + for i in range(self.num_blocks): + x = self.tcn_blocks[i](x) + outs.append(x) + + return tuple(outs) + + def init_weights(self, pretrained=None): + """Initialize the weights.""" + super().init_weights(pretrained) + if pretrained is None: + for m in self.modules(): + if isinstance(m, nn.modules.conv._ConvNd): + kaiming_init(m, mode='fan_in', nonlinearity='relu') + elif isinstance(m, _BatchNorm): + constant_init(m, 1) diff --git a/mmpose/models/backbones/utils/__init__.py b/mmpose/models/backbones/utils/__init__.py new file mode 100644 index 0000000..52a30ca --- /dev/null +++ b/mmpose/models/backbones/utils/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .channel_shuffle import channel_shuffle +from .inverted_residual import InvertedResidual +from .make_divisible import make_divisible +from .se_layer import SELayer +from .utils import load_checkpoint + +__all__ = [ + 'channel_shuffle', 'make_divisible', 'InvertedResidual', 'SELayer', + 'load_checkpoint' +] diff --git a/mmpose/models/backbones/utils/channel_shuffle.py b/mmpose/models/backbones/utils/channel_shuffle.py new file mode 100644 index 0000000..27006a8 --- /dev/null +++ b/mmpose/models/backbones/utils/channel_shuffle.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def channel_shuffle(x, groups): + """Channel Shuffle operation. + + This function enables cross-group information flow for multiple groups + convolution layers. + + Args: + x (Tensor): The input tensor. + groups (int): The number of groups to divide the input tensor + in the channel dimension. + + Returns: + Tensor: The output tensor after channel shuffle operation. + """ + + batch_size, num_channels, height, width = x.size() + assert (num_channels % groups == 0), ('num_channels should be ' + 'divisible by groups') + channels_per_group = num_channels // groups + + x = x.view(batch_size, groups, channels_per_group, height, width) + x = torch.transpose(x, 1, 2).contiguous() + x = x.view(batch_size, -1, height, width) + + return x diff --git a/mmpose/models/backbones/utils/inverted_residual.py b/mmpose/models/backbones/utils/inverted_residual.py new file mode 100644 index 0000000..dff762c --- /dev/null +++ b/mmpose/models/backbones/utils/inverted_residual.py @@ -0,0 +1,128 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule + +from .se_layer import SELayer + + +class InvertedResidual(nn.Module): + """Inverted Residual Block. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + mid_channels (int): The input channels of the depthwise convolution. + kernel_size (int): The kernel size of the depthwise convolution. + Default: 3. + groups (None or int): The group number of the depthwise convolution. + Default: None, which means group number = mid_channels. + stride (int): The stride of the depthwise convolution. Default: 1. + se_cfg (dict): Config dict for se layer. Default: None, which means no + se layer. + with_expand_conv (bool): Use expand conv or not. If set False, + mid_channels must be the same with in_channels. + Default: True. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + + Returns: + Tensor: The output tensor. 
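+
+    Example (usage sketch; the channel numbers are arbitrary illustrative
+        values satisfying the constraints above):
+        >>> import torch
+        >>> from mmpose.models.backbones.utils import InvertedResidual
+        >>> block = InvertedResidual(16, 16, mid_channels=64)
+        >>> x = torch.rand(1, 16, 56, 56)
+        >>> print(tuple(block(x).shape))
+        (1, 16, 56, 56)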
+ """ + + def __init__(self, + in_channels, + out_channels, + mid_channels, + kernel_size=3, + groups=None, + stride=1, + se_cfg=None, + with_expand_conv=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.with_res_shortcut = (stride == 1 and in_channels == out_channels) + assert stride in [1, 2] + self.with_cp = with_cp + self.with_se = se_cfg is not None + self.with_expand_conv = with_expand_conv + + if groups is None: + groups = mid_channels + + if self.with_se: + assert isinstance(se_cfg, dict) + if not self.with_expand_conv: + assert mid_channels == in_channels + + if self.with_expand_conv: + self.expand_conv = ConvModule( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.depthwise_conv = ConvModule( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + padding=kernel_size // 2, + groups=groups, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + if self.with_se: + self.se = SELayer(**se_cfg) + self.linear_conv = ConvModule( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, x): + + def _inner_forward(x): + out = x + + if self.with_expand_conv: + out = self.expand_conv(out) + + out = self.depthwise_conv(out) + + if self.with_se: + out = self.se(out) + + out = self.linear_conv(out) + + if self.with_res_shortcut: + return x + out + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out diff --git a/mmpose/models/backbones/utils/make_divisible.py b/mmpose/models/backbones/utils/make_divisible.py new file mode 100644 index 0000000..b7666be --- /dev/null +++ b/mmpose/models/backbones/utils/make_divisible.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +def make_divisible(value, divisor, min_value=None, min_ratio=0.9): + """Make divisible function. + + This function rounds the channel number down to the nearest value that can + be divisible by the divisor. + + Args: + value (int): The original channel number. + divisor (int): The divisor to fully divide the channel number. + min_value (int, optional): The minimum value of the output channel. + Default: None, means that the minimum value equal to the divisor. + min_ratio (float, optional): The minimum ratio of the rounded channel + number to the original channel number. Default: 0.9. + Returns: + int: The modified output channel number + """ + + if min_value is None: + min_value = divisor + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than (1-min_ratio). + if new_value < min_ratio * value: + new_value += divisor + return new_value diff --git a/mmpose/models/backbones/utils/se_layer.py b/mmpose/models/backbones/utils/se_layer.py new file mode 100644 index 0000000..07f7080 --- /dev/null +++ b/mmpose/models/backbones/utils/se_layer.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import torch.nn as nn +from mmcv.cnn import ConvModule + + +class SELayer(nn.Module): + """Squeeze-and-Excitation Module. 
+ + Args: + channels (int): The input (and output) channels of the SE layer. + ratio (int): Squeeze ratio in SELayer, the intermediate channel will be + ``int(channels/ratio)``. Default: 16. + conv_cfg (None or dict): Config dict for convolution layer. + Default: None, which means using conv2d. + act_cfg (dict or Sequence[dict]): Config dict for activation layer. + If act_cfg is a dict, two activation layers will be configurated + by this dict. If act_cfg is a sequence of dicts, the first + activation layer will be configurated by the first dict and the + second activation layer will be configurated by the second dict. + Default: (dict(type='ReLU'), dict(type='Sigmoid')) + """ + + def __init__(self, + channels, + ratio=16, + conv_cfg=None, + act_cfg=(dict(type='ReLU'), dict(type='Sigmoid'))): + super().__init__() + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert mmcv.is_tuple_of(act_cfg, dict) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.conv1 = ConvModule( + in_channels=channels, + out_channels=int(channels / ratio), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=int(channels / ratio), + out_channels=channels, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x): + out = self.global_avgpool(x) + out = self.conv1(out) + out = self.conv2(out) + return x * out diff --git a/mmpose/models/backbones/utils/utils.py b/mmpose/models/backbones/utils/utils.py new file mode 100644 index 0000000..a9ac948 --- /dev/null +++ b/mmpose/models/backbones/utils/utils.py @@ -0,0 +1,87 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict + +from mmcv.runner.checkpoint import _load_checkpoint, load_state_dict + + +def load_checkpoint(model, + filename, + map_location='cpu', + strict=False, + logger=None): + """Load checkpoint from a file or URI. + + Args: + model (Module): Module to load checkpoint. + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. + map_location (str): Same as :func:`torch.load`. + strict (bool): Whether to allow different params for the model and + checkpoint. + logger (:mod:`logging.Logger` or None): The logger for error message. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + checkpoint = _load_checkpoint(filename, map_location) + # OrderedDict is a subclass of dict + if not isinstance(checkpoint, dict): + raise RuntimeError( + f'No state_dict found in checkpoint file {filename}') + # get state_dict from checkpoint + if 'state_dict' in checkpoint: + state_dict_tmp = checkpoint['state_dict'] + else: + state_dict_tmp = checkpoint + + state_dict = OrderedDict() + # strip prefix of state_dict + for k, v in state_dict_tmp.items(): + if k.startswith('module.backbone.'): + state_dict[k[16:]] = v + elif k.startswith('module.'): + state_dict[k[7:]] = v + elif k.startswith('backbone.'): + state_dict[k[9:]] = v + else: + state_dict[k] = v + # load state_dict + load_state_dict(model, state_dict, strict, logger) + return checkpoint + + +def get_state_dict(filename, map_location='cpu'): + """Get state_dict from a file or URI. + + Args: + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. + map_location (str): Same as :func:`torch.load`. + + Returns: + OrderedDict: The state_dict. 
+ """ + checkpoint = _load_checkpoint(filename, map_location) + # OrderedDict is a subclass of dict + if not isinstance(checkpoint, dict): + raise RuntimeError( + f'No state_dict found in checkpoint file {filename}') + # get state_dict from checkpoint + if 'state_dict' in checkpoint: + state_dict_tmp = checkpoint['state_dict'] + else: + state_dict_tmp = checkpoint + + state_dict = OrderedDict() + # strip prefix of state_dict + for k, v in state_dict_tmp.items(): + if k.startswith('module.backbone.'): + state_dict[k[16:]] = v + elif k.startswith('module.'): + state_dict[k[7:]] = v + elif k.startswith('backbone.'): + state_dict[k[9:]] = v + else: + state_dict[k] = v + + return state_dict diff --git a/mmpose/models/backbones/v2v_net.py b/mmpose/models/backbones/v2v_net.py new file mode 100644 index 0000000..99462af --- /dev/null +++ b/mmpose/models/backbones/v2v_net.py @@ -0,0 +1,257 @@ +# ------------------------------------------------------------------------------ +# Copyright and License Information +# Adapted from +# https://github.com/microsoft/voxelpose-pytorch/blob/main/lib/models/v2v_net.py +# Original Licence: MIT License +# ------------------------------------------------------------------------------ + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +class Basic3DBlock(nn.Module): + """A basic 3D convolutional block. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + kernel_size (int): Kernel size of the convolution operation + conv_cfg (dict): Dictionary to construct and config conv layer. + Default: dict(type='Conv3d') + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN3d') + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d')): + super(Basic3DBlock, self).__init__() + self.block = ConvModule( + in_channels, + out_channels, + kernel_size, + stride=1, + padding=((kernel_size - 1) // 2), + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + bias=True) + + def forward(self, x): + """Forward function.""" + return self.block(x) + + +class Res3DBlock(nn.Module): + """A residual 3D convolutional block. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + kernel_size (int): Kernel size of the convolution operation + Default: 3 + conv_cfg (dict): Dictionary to construct and config conv layer. + Default: dict(type='Conv3d') + norm_cfg (dict): Dictionary to construct and config norm layer. 
+ Default: dict(type='BN3d') + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d')): + super(Res3DBlock, self).__init__() + self.res_branch = nn.Sequential( + ConvModule( + in_channels, + out_channels, + kernel_size, + stride=1, + padding=((kernel_size - 1) // 2), + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + bias=True), + ConvModule( + out_channels, + out_channels, + kernel_size, + stride=1, + padding=((kernel_size - 1) // 2), + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None, + bias=True)) + + if in_channels == out_channels: + self.skip_con = nn.Sequential() + else: + self.skip_con = ConvModule( + in_channels, + out_channels, + 1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None, + bias=True) + + def forward(self, x): + """Forward function.""" + res = self.res_branch(x) + skip = self.skip_con(x) + return F.relu(res + skip, True) + + +class Pool3DBlock(nn.Module): + """A 3D max-pool block. + + Args: + pool_size (int): Pool size of the 3D max-pool layer + """ + + def __init__(self, pool_size): + super(Pool3DBlock, self).__init__() + self.pool_size = pool_size + + def forward(self, x): + """Forward function.""" + return F.max_pool3d( + x, kernel_size=self.pool_size, stride=self.pool_size) + + +class Upsample3DBlock(nn.Module): + """A 3D upsample block. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + kernel_size (int): Kernel size of the transposed convolution operation. + Default: 2 + stride (int): Kernel size of the transposed convolution operation. + Default: 2 + """ + + def __init__(self, in_channels, out_channels, kernel_size=2, stride=2): + super(Upsample3DBlock, self).__init__() + assert kernel_size == 2 + assert stride == 2 + self.block = nn.Sequential( + nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=0, + output_padding=0), nn.BatchNorm3d(out_channels), nn.ReLU(True)) + + def forward(self, x): + """Forward function.""" + return self.block(x) + + +class EncoderDecorder(nn.Module): + """An encoder-decoder block. + + Args: + in_channels (int): Input channels of this block + """ + + def __init__(self, in_channels=32): + super(EncoderDecorder, self).__init__() + + self.encoder_pool1 = Pool3DBlock(2) + self.encoder_res1 = Res3DBlock(in_channels, in_channels * 2) + self.encoder_pool2 = Pool3DBlock(2) + self.encoder_res2 = Res3DBlock(in_channels * 2, in_channels * 4) + + self.mid_res = Res3DBlock(in_channels * 4, in_channels * 4) + + self.decoder_res2 = Res3DBlock(in_channels * 4, in_channels * 4) + self.decoder_upsample2 = Upsample3DBlock(in_channels * 4, + in_channels * 2, 2, 2) + self.decoder_res1 = Res3DBlock(in_channels * 2, in_channels * 2) + self.decoder_upsample1 = Upsample3DBlock(in_channels * 2, in_channels, + 2, 2) + + self.skip_res1 = Res3DBlock(in_channels, in_channels) + self.skip_res2 = Res3DBlock(in_channels * 2, in_channels * 2) + + def forward(self, x): + """Forward function.""" + skip_x1 = self.skip_res1(x) + x = self.encoder_pool1(x) + x = self.encoder_res1(x) + + skip_x2 = self.skip_res2(x) + x = self.encoder_pool2(x) + x = self.encoder_res2(x) + + x = self.mid_res(x) + + x = self.decoder_res2(x) + x = self.decoder_upsample2(x) + x = x + skip_x2 + + x = self.decoder_res1(x) + x = self.decoder_upsample1(x) + x = x + skip_x1 + + return x + + +@BACKBONES.register_module() +class V2VNet(BaseBackbone): + """V2VNet. 
+ + Please refer to the `paper ` + for details. + + Args: + input_channels (int): + Number of channels of the input feature volume. + output_channels (int): + Number of channels of the output volume. + mid_channels (int): + Input and output channels of the encoder-decoder block. + """ + + def __init__(self, input_channels, output_channels, mid_channels=32): + super(V2VNet, self).__init__() + + self.front_layers = nn.Sequential( + Basic3DBlock(input_channels, mid_channels // 2, 7), + Res3DBlock(mid_channels // 2, mid_channels), + ) + + self.encoder_decoder = EncoderDecorder(in_channels=mid_channels) + + self.output_layer = nn.Conv3d( + mid_channels, output_channels, kernel_size=1, stride=1, padding=0) + + self._initialize_weights() + + def forward(self, x): + """Forward function.""" + x = self.front_layers(x) + x = self.encoder_decoder(x) + x = self.output_layer(x) + + return x + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv3d): + nn.init.normal_(m.weight, 0, 0.001) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.ConvTranspose3d): + nn.init.normal_(m.weight, 0, 0.001) + nn.init.constant_(m.bias, 0) diff --git a/mmpose/models/backbones/vgg.py b/mmpose/models/backbones/vgg.py new file mode 100644 index 0000000..f7d4670 --- /dev/null +++ b/mmpose/models/backbones/vgg.py @@ -0,0 +1,193 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule, constant_init, kaiming_init, normal_init +from mmcv.utils.parrots_wrapper import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +def make_vgg_layer(in_channels, + out_channels, + num_blocks, + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + dilation=1, + with_norm=False, + ceil_mode=False): + layers = [] + for _ in range(num_blocks): + layer = ConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + dilation=dilation, + padding=dilation, + bias=True, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + layers.append(layer) + in_channels = out_channels + layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode)) + + return layers + + +@BACKBONES.register_module() +class VGG(BaseBackbone): + """VGG backbone. + + Args: + depth (int): Depth of vgg, from {11, 13, 16, 19}. + with_norm (bool): Use BatchNorm or not. + num_classes (int): number of classes for classification. + num_stages (int): VGG stages, normally 5. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. When it is None, the default behavior depends on + whether num_classes is specified. If num_classes <= 0, the default + value is (4, ), outputting the last feature map before classifier. + If num_classes > 0, the default value is (5, ), outputting the + classification score. Default: None. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + ceil_mode (bool): Whether to use ceil_mode of MaxPool. Default: False. + with_last_pool (bool): Whether to keep the last pooling before + classifier. Default: True. + """ + + # Parameters to build layers. 
Each element specifies the number of conv in + # each stage. For example, VGG11 contains 11 layers with learnable + # parameters. 11 is computed as 11 = (1 + 1 + 2 + 2 + 2) + 3, + # where 3 indicates the last three fully-connected layers. + arch_settings = { + 11: (1, 1, 2, 2, 2), + 13: (2, 2, 2, 2, 2), + 16: (2, 2, 3, 3, 3), + 19: (2, 2, 4, 4, 4) + } + + def __init__(self, + depth, + num_classes=-1, + num_stages=5, + dilations=(1, 1, 1, 1, 1), + out_indices=None, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + norm_eval=False, + ceil_mode=False, + with_last_pool=True): + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for vgg') + assert num_stages >= 1 and num_stages <= 5 + stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + assert len(dilations) == num_stages + + self.num_classes = num_classes + self.frozen_stages = frozen_stages + self.norm_eval = norm_eval + with_norm = norm_cfg is not None + + if out_indices is None: + out_indices = (5, ) if num_classes > 0 else (4, ) + assert max(out_indices) <= num_stages + self.out_indices = out_indices + + self.in_channels = 3 + start_idx = 0 + vgg_layers = [] + self.range_sub_modules = [] + for i, num_blocks in enumerate(self.stage_blocks): + num_modules = num_blocks + 1 + end_idx = start_idx + num_modules + dilation = dilations[i] + out_channels = 64 * 2**i if i < 4 else 512 + vgg_layer = make_vgg_layer( + self.in_channels, + out_channels, + num_blocks, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + dilation=dilation, + with_norm=with_norm, + ceil_mode=ceil_mode) + vgg_layers.extend(vgg_layer) + self.in_channels = out_channels + self.range_sub_modules.append([start_idx, end_idx]) + start_idx = end_idx + if not with_last_pool: + vgg_layers.pop(-1) + self.range_sub_modules[-1][1] -= 1 + self.module_name = 'features' + self.add_module(self.module_name, nn.Sequential(*vgg_layers)) + + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Linear(512 * 7 * 7, 4096), + nn.ReLU(True), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(True), + nn.Dropout(), + nn.Linear(4096, num_classes), + ) + + def init_weights(self, pretrained=None): + super().init_weights(pretrained) + if pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, _BatchNorm): + constant_init(m, 1) + elif isinstance(m, nn.Linear): + normal_init(m, std=0.01) + + def forward(self, x): + outs = [] + vgg_layers = getattr(self, self.module_name) + for i in range(len(self.stage_blocks)): + for j in range(*self.range_sub_modules[i]): + vgg_layer = vgg_layers[j] + x = vgg_layer(x) + if i in self.out_indices: + outs.append(x) + if self.num_classes > 0: + x = x.view(x.size(0), -1) + x = self.classifier(x) + outs.append(x) + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + def _freeze_stages(self): + vgg_layers = getattr(self, self.module_name) + for i in range(self.frozen_stages): + for j in range(*self.range_sub_modules[i]): + m = vgg_layers[j] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/backbones/vipnas_mbv3.py b/mmpose/models/backbones/vipnas_mbv3.py new file mode 100644 index 
0000000..ed990e3 --- /dev/null +++ b/mmpose/models/backbones/vipnas_mbv3.py @@ -0,0 +1,179 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging + +import torch.nn as nn +from mmcv.cnn import ConvModule +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import InvertedResidual, load_checkpoint + + +@BACKBONES.register_module() +class ViPNAS_MobileNetV3(BaseBackbone): + """ViPNAS_MobileNetV3 backbone. + + "ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search" + More details can be found in the `paper + `__ . + + Args: + wid (list(int)): Searched width config for each stage. + expan (list(int)): Searched expansion ratio config for each stage. + dep (list(int)): Searched depth config for each stage. + ks (list(int)): Searched kernel size config for each stage. + group (list(int)): Searched group number config for each stage. + att (list(bool)): Searched attention config for each stage. + stride (list(int)): Stride config for each stage. + act (list(dict)): Activation config for each stage. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save + some memory while slowing down the training speed. + Default: False. + """ + + def __init__(self, + wid=[16, 16, 24, 40, 80, 112, 160], + expan=[None, 1, 5, 4, 5, 5, 6], + dep=[None, 1, 4, 4, 4, 4, 4], + ks=[3, 3, 7, 7, 5, 7, 5], + group=[None, 8, 120, 20, 100, 280, 240], + att=[None, True, True, False, True, True, True], + stride=[2, 1, 2, 2, 2, 1, 2], + act=[ + 'HSwish', 'ReLU', 'ReLU', 'ReLU', 'HSwish', 'HSwish', + 'HSwish' + ], + conv_cfg=None, + norm_cfg=dict(type='BN'), + frozen_stages=-1, + norm_eval=False, + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self.wid = wid + self.expan = expan + self.dep = dep + self.ks = ks + self.group = group + self.att = att + self.stride = stride + self.act = act + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.frozen_stages = frozen_stages + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.wid[0], + kernel_size=self.ks[0], + stride=self.stride[0], + padding=self.ks[0] // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=dict(type=self.act[0])) + + self.layers = self._make_layer() + + def _make_layer(self): + layers = [] + layer_index = 0 + for i, dep in enumerate(self.dep[1:]): + mid_channels = self.wid[i + 1] * self.expan[i + 1] + + if self.att[i + 1]: + se_cfg = dict( + channels=mid_channels, + ratio=4, + act_cfg=(dict(type='ReLU'), dict(type='HSigmoid'))) + else: + se_cfg = None + + if self.expan[i + 1] == 1: + with_expand_conv = False + else: + with_expand_conv = True + + for j in range(dep): + if j == 0: + stride = self.stride[i + 1] + in_channels = self.wid[i] + else: + stride = 1 + in_channels = self.wid[i + 1] + + layer = InvertedResidual( + in_channels=in_channels, + out_channels=self.wid[i + 1], + mid_channels=mid_channels, + 
kernel_size=self.ks[i + 1], + groups=self.group[i + 1], + stride=stride, + se_cfg=se_cfg, + with_expand_conv=with_expand_conv, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type=self.act[i + 1]), + with_cp=self.with_cp) + layer_index += 1 + layer_name = f'layer{layer_index}' + self.add_module(layer_name, layer) + layers.append(layer_name) + return layers + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.001) + for name, _ in m.named_parameters(): + if name in ['bias']: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + x = self.conv1(x) + + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + + return x + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(1, self.frozen_stages + 1): + layer = getattr(self, f'layer{i}') + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/backbones/vipnas_resnet.py b/mmpose/models/backbones/vipnas_resnet.py new file mode 100644 index 0000000..81b028e --- /dev/null +++ b/mmpose/models/backbones/vipnas_resnet.py @@ -0,0 +1,589 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule, build_conv_layer, build_norm_layer +from mmcv.cnn.bricks import ContextBlock +from mmcv.utils.parrots_wrapper import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +class ViPNAS_Bottleneck(nn.Module): + """Bottleneck block for ViPNAS_ResNet. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + expansion (int): The ratio of ``out_channels/mid_channels`` where + ``mid_channels`` is the input/output channels of conv2. Default: 4. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None. + style (str): ``"pytorch"`` or ``"caffe"``. If set to "pytorch", the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Default: "pytorch". + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + kernel_size (int): kernel size of conv2 searched in ViPANS. + groups (int): group number of conv2 searched in ViPNAS. + attention (bool): whether to use attention module in the end of + the block. 
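+
+    Example (a minimal, hedged shape-check sketch; the channel numbers below are
+    illustrative and are not the searched ViPNAS configuration)::
+
+        >>> import torch
+        >>> block = ViPNAS_Bottleneck(64, 64, expansion=4, kernel_size=3, groups=1)
+        >>> out = block(torch.randn(1, 64, 56, 56))
+        >>> out.shape
+        torch.Size([1, 64, 56, 56])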
+ """ + + def __init__(self, + in_channels, + out_channels, + expansion=4, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + kernel_size=3, + groups=1, + attention=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + assert style in ['pytorch', 'caffe'] + + self.in_channels = in_channels + self.out_channels = out_channels + self.expansion = expansion + assert out_channels % expansion == 0 + self.mid_channels = out_channels // expansion + self.stride = stride + self.dilation = dilation + self.style = style + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + if self.style == 'pytorch': + self.conv1_stride = 1 + self.conv2_stride = stride + else: + self.conv1_stride = stride + self.conv2_stride = 1 + + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + norm_cfg, self.mid_channels, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + norm_cfg, out_channels, postfix=3) + + self.conv1 = build_conv_layer( + conv_cfg, + in_channels, + self.mid_channels, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, + self.mid_channels, + self.mid_channels, + kernel_size=kernel_size, + stride=self.conv2_stride, + padding=kernel_size // 2, + groups=groups, + dilation=dilation, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + conv_cfg, + self.mid_channels, + out_channels, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + if attention: + self.attention = ContextBlock(out_channels, + max(1.0 / 16, 16.0 / out_channels)) + else: + self.attention = None + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + @property + def norm3(self): + """nn.Module: the normalization layer named "norm3" """ + return getattr(self, self.norm3_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.norm3(out) + + if self.attention is not None: + out = self.attention(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +def get_expansion(block, expansion=None): + """Get the expansion of a residual block. + + The block expansion will be obtained by the following order: + + 1. If ``expansion`` is given, just return it. + 2. If ``block`` has the attribute ``expansion``, then return + ``block.expansion``. + 3. Return the default value according the the block type: + 4 for ``ViPNAS_Bottleneck``. + + Args: + block (class): The block class. + expansion (int | None): The given expansion ratio. + + Returns: + int: The expansion of the block. 
+ """ + if isinstance(expansion, int): + assert expansion > 0 + elif expansion is None: + if hasattr(block, 'expansion'): + expansion = block.expansion + elif issubclass(block, ViPNAS_Bottleneck): + expansion = 1 + else: + raise TypeError(f'expansion is not specified for {block.__name__}') + else: + raise TypeError('expansion must be an integer or None') + + return expansion + + +class ViPNAS_ResLayer(nn.Sequential): + """ViPNAS_ResLayer to build ResNet style backbone. + + Args: + block (nn.Module): Residual block used to build ViPNAS ResLayer. + num_blocks (int): Number of blocks. + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + expansion (int, optional): The expansion for BasicBlock/Bottleneck. + If not specified, it will firstly be obtained via + ``block.expansion``. If the block has no attribute "expansion", + the following default values will be used: 1 for BasicBlock and + 4 for Bottleneck. Default: None. + stride (int): stride of the first block. Default: 1. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + downsample_first (bool): Downsample at the first block or last block. + False for Hourglass, True for ResNet. Default: True + kernel_size (int): Kernel Size of the corresponding convolution layer + searched in the block. + groups (int): Group number of the corresponding convolution layer + searched in the block. + attention (bool): Whether to use attention module in the end of the + block. + """ + + def __init__(self, + block, + num_blocks, + in_channels, + out_channels, + expansion=None, + stride=1, + avg_down=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + downsample_first=True, + kernel_size=3, + groups=1, + attention=False, + **kwargs): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + self.block = block + self.expansion = get_expansion(block, expansion) + + downsample = None + if stride != 1 or in_channels != out_channels: + downsample = [] + conv_stride = stride + if avg_down and stride != 1: + conv_stride = 1 + downsample.append( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False)) + downsample.extend([ + build_conv_layer( + conv_cfg, + in_channels, + out_channels, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, out_channels)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + if downsample_first: + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + kernel_size=kernel_size, + groups=groups, + attention=attention, + **kwargs)) + in_channels = out_channels + for _ in range(1, num_blocks): + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + kernel_size=kernel_size, + groups=groups, + attention=attention, + **kwargs)) + else: # downsample_first=False is for HourglassModule + for i in range(0, num_blocks - 1): + layers.append( + block( + in_channels=in_channels, + out_channels=in_channels, + expansion=self.expansion, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + kernel_size=kernel_size, + 
groups=groups, + attention=attention, + **kwargs)) + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + kernel_size=kernel_size, + groups=groups, + attention=attention, + **kwargs)) + + super().__init__(*layers) + + +@BACKBONES.register_module() +class ViPNAS_ResNet(BaseBackbone): + """ViPNAS_ResNet backbone. + + "ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search" + More details can be found in the `paper + `__ . + + Args: + depth (int): Network depth, from {18, 34, 50, 101, 152}. + in_channels (int): Number of input image channels. Default: 3. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + wid (list(int)): Searched width config for each stage. + expan (list(int)): Searched expansion ratio config for each stage. + dep (list(int)): Searched depth config for each stage. + ks (list(int)): Searched kernel size config for each stage. + group (list(int)): Searched group number config for each stage. + att (list(bool)): Searched attention config for each stage. 
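+
+    Example (a hedged usage sketch with the default searched config; the 256x192
+    input is just an illustrative pose-style resolution)::
+
+        >>> import torch
+        >>> model = ViPNAS_ResNet(depth=50)
+        >>> feat = model(torch.randn(1, 3, 256, 192))
+        >>> feat.shape   # stride-32 feature map with wid[-1]=608 channels
+        torch.Size([1, 608, 8, 6])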
+ """ + + arch_settings = { + 50: ViPNAS_Bottleneck, + } + + def __init__(self, + depth, + in_channels=3, + num_stages=4, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(3, ), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + with_cp=False, + zero_init_residual=True, + wid=[48, 80, 160, 304, 608], + expan=[None, 1, 1, 1, 1], + dep=[None, 4, 6, 7, 3], + ks=[7, 3, 5, 5, 5], + group=[None, 16, 16, 16, 16], + att=[None, True, False, True, True]): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + self.depth = depth + self.stem_channels = dep[0] + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.zero_init_residual = zero_init_residual + self.block = self.arch_settings[depth] + self.stage_blocks = dep[1:1 + num_stages] + + self._make_stem_layer(in_channels, wid[0], ks[0]) + + self.res_layers = [] + _in_channels = wid[0] + for i, num_blocks in enumerate(self.stage_blocks): + expansion = get_expansion(self.block, expan[i + 1]) + _out_channels = wid[i + 1] * expansion + stride = strides[i] + dilation = dilations[i] + res_layer = self.make_res_layer( + block=self.block, + num_blocks=num_blocks, + in_channels=_in_channels, + out_channels=_out_channels, + expansion=expansion, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + kernel_size=ks[i + 1], + groups=group[i + 1], + attention=att[i + 1]) + _in_channels = _out_channels + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = res_layer[-1].out_channels + + def make_res_layer(self, **kwargs): + """Make a ViPNAS ResLayer.""" + return ViPNAS_ResLayer(**kwargs) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def _make_stem_layer(self, in_channels, stem_channels, kernel_size): + """Make stem layer.""" + if self.deep_stem: + self.stem = nn.Sequential( + ConvModule( + in_channels, + stem_channels // 2, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True), + ConvModule( + stem_channels // 2, + stem_channels // 2, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True), + ConvModule( + stem_channels // 2, + stem_channels, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True)) + else: + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels, + kernel_size=kernel_size, + stride=2, + padding=kernel_size // 2, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, stem_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + self.maxpool = 
nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + def _freeze_stages(self): + """Freeze parameters.""" + if self.frozen_stages >= 0: + if self.deep_stem: + self.stem.eval() + for param in self.stem.parameters(): + param.requires_grad = False + else: + self.norm1.eval() + for m in [self.conv1, self.norm1]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize model weights.""" + super().init_weights(pretrained) + if pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.001) + for name, _ in m.named_parameters(): + if name in ['bias']: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward(self, x): + """Forward function.""" + if self.deep_stem: + x = self.stem(x) + else: + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/backbones/vit.py b/mmpose/models/backbones/vit.py new file mode 100644 index 0000000..2719d1a --- /dev/null +++ b/mmpose/models/backbones/vit.py @@ -0,0 +1,341 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +from functools import partial +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint + +from timm.models.layers import drop_path, to_2tuple, trunc_normal_ + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + +def get_abs_pos(abs_pos, h, w, ori_h, ori_w, has_cls_token=True): + """ + Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token + dimension for the original embeddings. + Args: + abs_pos (Tensor): absolute positional embeddings with (1, num_position, C). + has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token. + hw (Tuple): size of input image tokens. + + Returns: + Absolute positional embeddings after processing with shape (1, H, W, C) + """ + cls_token = None + B, L, C = abs_pos.shape + if has_cls_token: + cls_token = abs_pos[:, 0:1] + abs_pos = abs_pos[:, 1:] + + if ori_h != h or ori_w != w: + new_abs_pos = F.interpolate( + abs_pos.reshape(1, ori_h, ori_w, -1).permute(0, 3, 1, 2), + size=(h, w), + mode="bicubic", + align_corners=False, + ).permute(0, 2, 3, 1).reshape(B, -1, C) + + else: + new_abs_pos = abs_pos + + if cls_token is not None: + new_abs_pos = torch.cat([cls_token, new_abs_pos], dim=1) + return new_abs_pos + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
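+
+    Example (hedged sketch: stochastic depth is disabled in eval mode)::
+
+        >>> import torch
+        >>> dp = DropPath(drop_prob=0.2).eval()
+        >>> x = torch.ones(2, 4)
+        >>> torch.equal(dp(x), x)
+        True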
+ """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self): + return 'p={}'.format(self.drop_prob) + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.fc2(x) + x = self.drop(x) + return x + +class Attention(nn.Module): + def __init__( + self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., + proj_drop=0., attn_head_dim=None,): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.dim = dim + + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=qkv_bias) + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + + return x + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, + drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, + norm_layer=nn.LayerNorm, attn_head_dim=None + ): + super().__init__() + + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim + ) + + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, ratio=1): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (ratio ** 2) + self.patch_shape = (int(img_size[0] // patch_size[0] * ratio), int(img_size[1] // patch_size[1] * ratio)) + self.origin_patch_shape = (int(img_size[0] // patch_size[0]), int(img_size[1] // patch_size[1])) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=(patch_size[0] // ratio), padding=4 + 2 * (ratio//2-1)) + + def forward(self, x, **kwargs): + B, C, H, W = x.shape + x = self.proj(x) + Hp, Wp = x.shape[2], x.shape[3] + + x = x.flatten(2).transpose(1, 2) + return x, (Hp, Wp) + + +class HybridEmbed(nn.Module): + """ CNN Feature Map Embedding + Extract feature map from CNN, flatten, project to embedding dim. + """ + def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768): + super().__init__() + assert isinstance(backbone, nn.Module) + img_size = to_2tuple(img_size) + self.img_size = img_size + self.backbone = backbone + if feature_size is None: + with torch.no_grad(): + training = backbone.training + if training: + backbone.eval() + o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1] + feature_size = o.shape[-2:] + feature_dim = o.shape[1] + backbone.train(training) + else: + feature_size = to_2tuple(feature_size) + feature_dim = self.backbone.feature_info.channels()[-1] + self.num_patches = feature_size[0] * feature_size[1] + self.proj = nn.Linear(feature_dim, embed_dim) + + def forward(self, x): + x = self.backbone(x)[-1] + x = x.flatten(2).transpose(1, 2) + x = self.proj(x) + return x + + +@BACKBONES.register_module() +class ViT(BaseBackbone): + + def __init__(self, + img_size=224, patch_size=16, in_chans=3, num_classes=80, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., hybrid_backbone=None, norm_layer=None, use_checkpoint=False, + frozen_stages=-1, ratio=1, last_norm=True, + patch_padding='pad', freeze_attn=False, freeze_ffn=False, + ): + # Protect mutable default arguments + super(ViT, self).__init__() + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.frozen_stages = frozen_stages + self.use_checkpoint = use_checkpoint + self.patch_padding = patch_padding + self.freeze_attn = freeze_attn + self.freeze_ffn = freeze_ffn + self.depth = depth + + if hybrid_backbone is not None: + self.patch_embed = HybridEmbed( + hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim) + else: + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, ratio=ratio) + num_patches = self.patch_embed.num_patches + + # since the pretraining model has class token + 
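# the "+ 1" slot below keeps the class-token position so checkpoints pretrained with a
+ # cls token still load; forward_features() adds pos_embed[:, 1:] to the patch tokens and
+ # broadcasts the cls slot pos_embed[:, :1] onto them instead of keeping a separate cls token
+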
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + ) + for i in range(depth)]) + + self.last_norm = norm_layer(embed_dim) if last_norm else nn.Identity() + + if self.pos_embed is not None: + trunc_normal_(self.pos_embed, std=.02) + + self._freeze_stages() + + def _freeze_stages(self): + """Freeze parameters.""" + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = self.blocks[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + if self.freeze_attn: + for i in range(0, self.depth): + m = self.blocks[i] + m.attn.eval() + m.norm1.eval() + for param in m.attn.parameters(): + param.requires_grad = False + for param in m.norm1.parameters(): + param.requires_grad = False + + if self.freeze_ffn: + self.pos_embed.requires_grad = False + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + for i in range(0, self.depth): + m = self.blocks[i] + m.mlp.eval() + m.norm2.eval() + for param in m.mlp.parameters(): + param.requires_grad = False + for param in m.norm2.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + super().init_weights(pretrained, patch_padding=self.patch_padding) + + if pretrained is None: + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + self.apply(_init_weights) + + def get_num_layers(self): + return len(self.blocks) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def forward_features(self, x): + B, C, H, W = x.shape + x, (Hp, Wp) = self.patch_embed(x) + + if self.pos_embed is not None: + # fit for multiple GPU training + # since the first element for pos embed (sin-cos manner) is zero, it will cause no difference + x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1] + + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + + x = self.last_norm(x) + + xp = x.permute(0, 2, 1).reshape(B, -1, Hp, Wp).contiguous() + + return xp + + def forward(self, x): + x = self.forward_features(x) + return x + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self._freeze_stages() diff --git a/mmpose/models/backbones/vit_moe.py b/mmpose/models/backbones/vit_moe.py new file mode 100644 index 0000000..880a58f --- /dev/null +++ b/mmpose/models/backbones/vit_moe.py @@ -0,0 +1,385 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
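+# Note: this module mirrors the plain ViT backbone in vit.py; the functional difference is
+# that each Block uses MoEMlp, whose second FC layer is split into a shared part and a set of
+# per-expert heads selected by the ``indices`` (dataset_source) argument, so samples from
+# different dataset sources can be routed to different FFN experts.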
+import math + +import torch +from functools import partial +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint + +from timm.models.layers import drop_path, to_2tuple, trunc_normal_ + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + +def get_abs_pos(abs_pos, h, w, ori_h, ori_w, has_cls_token=True): + """ + Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token + dimension for the original embeddings. + Args: + abs_pos (Tensor): absolute positional embeddings with (1, num_position, C). + has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token. + hw (Tuple): size of input image tokens. + + Returns: + Absolute positional embeddings after processing with shape (1, H, W, C) + """ + cls_token = None + B, L, C = abs_pos.shape + if has_cls_token: + cls_token = abs_pos[:, 0:1] + abs_pos = abs_pos[:, 1:] + + if ori_h != h or ori_w != w: + new_abs_pos = F.interpolate( + abs_pos.reshape(1, ori_h, ori_w, -1).permute(0, 3, 1, 2), + size=(h, w), + mode="bicubic", + align_corners=False, + ).permute(0, 2, 3, 1).reshape(B, -1, C) + + else: + new_abs_pos = abs_pos + + if cls_token is not None: + new_abs_pos = torch.cat([cls_token, new_abs_pos], dim=1) + return new_abs_pos + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self): + return 'p={}'.format(self.drop_prob) + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.fc2(x) + x = self.drop(x) + return x + +class MoEMlp(nn.Module): + def __init__(self, num_expert=1, in_features=1024, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., part_features=256): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.part_features = part_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features - part_features) + self.drop = nn.Dropout(drop) + + self.num_expert = num_expert + experts = [] + + for i in range(num_expert): + experts.append( + nn.Linear(hidden_features, part_features) + ) + self.experts = nn.ModuleList(experts) + + def forward(self, x, indices): + + expert_x = torch.zeros_like(x[:, :, -self.part_features:], device=x.device, dtype=x.dtype) + + x = self.fc1(x) + x = self.act(x) + shared_x = self.fc2(x) + indices = indices.view(-1, 1, 1) + + # to support ddp training + for i in range(self.num_expert): + selectedIndex = (indices == i) + current_x = self.experts[i](x) * selectedIndex + expert_x = expert_x + current_x + + x = torch.cat([shared_x, expert_x], dim=-1) + + return x + +class Attention(nn.Module): + def __init__( + self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., + proj_drop=0., attn_head_dim=None,): + super().__init__() + self.num_heads = 
num_heads + head_dim = dim // num_heads + self.dim = dim + + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=qkv_bias) + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + + return x + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, + drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, + norm_layer=nn.LayerNorm, attn_head_dim=None, num_expert=1, part_features=None + ): + super().__init__() + + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim + ) + + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = MoEMlp(num_expert=num_expert, in_features=dim, hidden_features=mlp_hidden_dim, + act_layer=act_layer, drop=drop, part_features=part_features) + + def forward(self, x, indices=None): + + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x), indices)) + return x + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, ratio=1): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (ratio ** 2) + self.patch_shape = (int(img_size[0] // patch_size[0] * ratio), int(img_size[1] // patch_size[1] * ratio)) + self.origin_patch_shape = (int(img_size[0] // patch_size[0]), int(img_size[1] // patch_size[1])) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=(patch_size[0] // ratio), padding=4 + 2 * (ratio//2-1)) + + def forward(self, x, **kwargs): + B, C, H, W = x.shape + x = self.proj(x) + Hp, Wp = x.shape[2], x.shape[3] + + x = x.flatten(2).transpose(1, 2) + return x, (Hp, Wp) + + +class HybridEmbed(nn.Module): + """ CNN Feature Map Embedding + Extract feature map from CNN, flatten, project to embedding dim. 
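+
+    Example (a hedged sketch with a toy CNN; ``ToyCNN`` is hypothetical and only needs to
+    return a list/tuple of feature maps, since ``forward`` uses the last one)::
+
+        >>> import torch
+        >>> import torch.nn as nn
+        >>> class ToyCNN(nn.Module):
+        ...     def __init__(self):
+        ...         super().__init__()
+        ...         self.conv = nn.Conv2d(3, 32, 3, stride=16, padding=1)
+        ...     def forward(self, x):
+        ...         return [self.conv(x)]
+        >>> embed = HybridEmbed(ToyCNN(), img_size=224, embed_dim=768)
+        >>> embed(torch.randn(1, 3, 224, 224)).shape
+        torch.Size([1, 196, 768])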
+ """ + def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768): + super().__init__() + assert isinstance(backbone, nn.Module) + img_size = to_2tuple(img_size) + self.img_size = img_size + self.backbone = backbone + if feature_size is None: + with torch.no_grad(): + training = backbone.training + if training: + backbone.eval() + o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1] + feature_size = o.shape[-2:] + feature_dim = o.shape[1] + backbone.train(training) + else: + feature_size = to_2tuple(feature_size) + feature_dim = self.backbone.feature_info.channels()[-1] + self.num_patches = feature_size[0] * feature_size[1] + self.proj = nn.Linear(feature_dim, embed_dim) + + def forward(self, x): + x = self.backbone(x)[-1] + x = x.flatten(2).transpose(1, 2) + x = self.proj(x) + return x + + +@BACKBONES.register_module() +class ViTMoE(BaseBackbone): + + def __init__(self, + img_size=224, patch_size=16, in_chans=3, num_classes=80, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., hybrid_backbone=None, norm_layer=None, use_checkpoint=False, + frozen_stages=-1, ratio=1, last_norm=True, + patch_padding='pad', freeze_attn=False, freeze_ffn=False, + num_expert=1, part_features=None + ): + # Protect mutable default arguments + super(ViTMoE, self).__init__() + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.frozen_stages = frozen_stages + self.use_checkpoint = use_checkpoint + self.patch_padding = patch_padding + self.freeze_attn = freeze_attn + self.freeze_ffn = freeze_ffn + self.depth = depth + + if hybrid_backbone is not None: + self.patch_embed = HybridEmbed( + hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim) + else: + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, ratio=ratio) + num_patches = self.patch_embed.num_patches + + self.part_features = part_features + + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + num_expert=num_expert, part_features=part_features + ) + for i in range(depth)]) + + self.last_norm = norm_layer(embed_dim) if last_norm else nn.Identity() + + if self.pos_embed is not None: + trunc_normal_(self.pos_embed, std=.02) + + self._freeze_stages() + + def _freeze_stages(self): + """Freeze parameters.""" + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = self.blocks[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + if self.freeze_attn: + for i in range(0, self.depth): + m = self.blocks[i] + m.attn.eval() + m.norm1.eval() + for param in m.attn.parameters(): + param.requires_grad = False + for param in m.norm1.parameters(): + param.requires_grad = False + + if self.freeze_ffn: + self.pos_embed.requires_grad = False + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + 
param.requires_grad = False + for i in range(0, self.depth): + m = self.blocks[i] + m.mlp.eval() + m.norm2.eval() + for param in m.mlp.parameters(): + param.requires_grad = False + for param in m.norm2.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + super().init_weights(pretrained, patch_padding=self.patch_padding, part_features=self.part_features) + + if pretrained is None: + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + self.apply(_init_weights) + + def get_num_layers(self): + return len(self.blocks) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def forward_features(self, x, dataset_source=None): + B, C, H, W = x.shape + x, (Hp, Wp) = self.patch_embed(x) + + if self.pos_embed is not None: + # fit for multiple GPU training + # since the first element for pos embed (sin-cos manner) is zero, it will cause no difference + x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1] + + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, dataset_source) + else: + x = blk(x, dataset_source) + + x = self.last_norm(x) + + xp = x.permute(0, 2, 1).reshape(B, -1, Hp, Wp).contiguous() + + return xp + + def forward(self, x, dataset_source=None): + x = self.forward_features(x, dataset_source) + return x + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self._freeze_stages() diff --git a/mmpose/models/builder.py b/mmpose/models/builder.py new file mode 100644 index 0000000..220839d --- /dev/null +++ b/mmpose/models/builder.py @@ -0,0 +1,44 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import MODELS as MMCV_MODELS +from mmcv.cnn import build_model_from_cfg +from mmcv.utils import Registry + +MODELS = Registry( + 'models', build_func=build_model_from_cfg, parent=MMCV_MODELS) + +BACKBONES = MODELS +NECKS = MODELS +HEADS = MODELS +LOSSES = MODELS +POSENETS = MODELS +MESH_MODELS = MODELS + + +def build_backbone(cfg): + """Build backbone.""" + return BACKBONES.build(cfg) + + +def build_neck(cfg): + """Build neck.""" + return NECKS.build(cfg) + + +def build_head(cfg): + """Build head.""" + return HEADS.build(cfg) + + +def build_loss(cfg): + """Build loss.""" + return LOSSES.build(cfg) + + +def build_posenet(cfg): + """Build posenet.""" + return POSENETS.build(cfg) + + +def build_mesh_model(cfg): + """Build mesh model.""" + return MESH_MODELS.build(cfg) diff --git a/mmpose/models/detectors/__init__.py b/mmpose/models/detectors/__init__.py new file mode 100644 index 0000000..e098209 --- /dev/null +++ b/mmpose/models/detectors/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .associative_embedding import AssociativeEmbedding +from .interhand_3d import Interhand3D +from .mesh import ParametricMesh +from .multi_task import MultiTask +from .multiview_pose import (DetectAndRegress, VoxelCenterDetector, + VoxelSinglePose) +from .pose_lifter import PoseLifter +from .posewarper import PoseWarper +from .top_down import TopDown +from .top_down_moe import TopDownMoE + +__all__ = [ + 'TopDown', 'AssociativeEmbedding', 'ParametricMesh', 'MultiTask', + 'PoseLifter', 'Interhand3D', 'PoseWarper', 'DetectAndRegress', + 'VoxelCenterDetector', 'VoxelSinglePose', 'TopDownMoE' +] diff --git a/mmpose/models/detectors/associative_embedding.py b/mmpose/models/detectors/associative_embedding.py new file mode 100644 index 0000000..100c780 --- /dev/null +++ b/mmpose/models/detectors/associative_embedding.py @@ -0,0 +1,420 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import mmcv +import torch +from mmcv.image import imwrite +from mmcv.utils.misc import deprecated_api_warning +from mmcv.visualization.image import imshow + +from mmpose.core.evaluation import (aggregate_scale, aggregate_stage_flip, + flip_feature_maps, get_group_preds, + split_ae_outputs) +from mmpose.core.post_processing.group import HeatmapParser +from mmpose.core.visualization import imshow_keypoints +from .. import builder +from ..builder import POSENETS +from .base import BasePose + +try: + from mmcv.runner import auto_fp16 +except ImportError: + warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0' + 'Please install mmcv>=1.1.4') + from mmpose.core import auto_fp16 + + +@POSENETS.register_module() +class AssociativeEmbedding(BasePose): + """Associative embedding pose detectors. + + Args: + backbone (dict): Backbone modules to extract feature. + keypoint_head (dict): Keypoint head to process feature. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + pretrained (str): Path to the pretrained models. + loss_pose (None): Deprecated arguments. Please use + ``loss_keypoint`` for heads instead. + """ + + def __init__(self, + backbone, + keypoint_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + loss_pose=None): + super().__init__() + self.fp16_enabled = False + + self.backbone = builder.build_backbone(backbone) + + if keypoint_head is not None: + if 'loss_keypoint' not in keypoint_head and loss_pose is not None: + warnings.warn( + '`loss_pose` for BottomUp is deprecated, ' + 'use `loss_keypoint` for heads instead. See ' + 'https://github.com/open-mmlab/mmpose/pull/382' + ' for more information.', DeprecationWarning) + keypoint_head['loss_keypoint'] = loss_pose + + self.keypoint_head = builder.build_head(keypoint_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.use_udp = test_cfg.get('use_udp', False) + self.parser = HeatmapParser(self.test_cfg) + self.init_weights(pretrained=pretrained) + + @property + def with_keypoint(self): + """Check if has keypoint_head.""" + return hasattr(self, 'keypoint_head') + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + self.backbone.init_weights(pretrained) + if self.with_keypoint: + self.keypoint_head.init_weights() + + @auto_fp16(apply_to=('img', )) + def forward(self, + img=None, + targets=None, + masks=None, + joints=None, + img_metas=None, + return_loss=True, + return_heatmap=False, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss is True. 
+ + Note: + - batch_size: N + - num_keypoints: K + - num_img_channel: C + - img_width: imgW + - img_height: imgH + - heatmaps weight: W + - heatmaps height: H + - max_num_people: M + + Args: + img (torch.Tensor[N,C,imgH,imgW]): Input image. + targets (list(torch.Tensor[N,K,H,W])): Multi-scale target heatmaps. + masks (list(torch.Tensor[N,H,W])): Masks of multi-scale target + heatmaps + joints (list(torch.Tensor[N,M,K,2])): Joints of multi-scale target + heatmaps for ae loss + img_metas (dict): Information about val & test. + By default it includes: + + - "image_file": image path + - "aug_data": input + - "test_scale_factor": test scale factor + - "base_size": base size of input + - "center": center of image + - "scale": scale of image + - "flip_index": flip index of keypoints + return loss (bool): ``return_loss=True`` for training, + ``return_loss=False`` for validation & test. + return_heatmap (bool) : Option to return heatmap. + + Returns: + dict|tuple: if 'return_loss' is true, then return losses. \ + Otherwise, return predicted poses, scores, image \ + paths and heatmaps. + """ + + if return_loss: + return self.forward_train(img, targets, masks, joints, img_metas, + **kwargs) + return self.forward_test( + img, img_metas, return_heatmap=return_heatmap, **kwargs) + + def forward_train(self, img, targets, masks, joints, img_metas, **kwargs): + """Forward the bottom-up model and calculate the loss. + + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + heatmaps weight: W + heatmaps height: H + max_num_people: M + + Args: + img (torch.Tensor[N,C,imgH,imgW]): Input image. + targets (List(torch.Tensor[N,K,H,W])): Multi-scale target heatmaps. + masks (List(torch.Tensor[N,H,W])): Masks of multi-scale target + heatmaps + joints (List(torch.Tensor[N,M,K,2])): Joints of multi-scale target + heatmaps for ae loss + img_metas (dict):Information about val&test + By default this includes: + - "image_file": image path + - "aug_data": input + - "test_scale_factor": test scale factor + - "base_size": base size of input + - "center": center of image + - "scale": scale of image + - "flip_index": flip index of keypoints + + Returns: + dict: The total loss for bottom-up + """ + + output = self.backbone(img) + + if self.with_keypoint: + output = self.keypoint_head(output) + + # if return loss + losses = dict() + if self.with_keypoint: + keypoint_losses = self.keypoint_head.get_loss( + output, targets, masks, joints) + losses.update(keypoint_losses) + + return losses + + def forward_dummy(self, img): + """Used for computing network FLOPs. + + See ``tools/get_flops.py``. + + Args: + img (torch.Tensor): Input image. + + Returns: + Tensor: Outputs. + """ + output = self.backbone(img) + if self.with_keypoint: + output = self.keypoint_head(output) + return output + + def forward_test(self, img, img_metas, return_heatmap=False, **kwargs): + """Inference the bottom-up model. 
+ + Note: + - Batchsize: N (currently support batchsize = 1) + - num_img_channel: C + - img_width: imgW + - img_height: imgH + + Args: + flip_index (List(int)): + aug_data (List(Tensor[NxCximgHximgW])): Multi-scale image + test_scale_factor (List(float)): Multi-scale factor + base_size (Tuple(int)): Base size of image when scale is 1 + center (np.ndarray): center of image + scale (np.ndarray): the scale of image + """ + assert img.size(0) == 1 + assert len(img_metas) == 1 + + img_metas = img_metas[0] + + aug_data = img_metas['aug_data'] + + test_scale_factor = img_metas['test_scale_factor'] + base_size = img_metas['base_size'] + center = img_metas['center'] + scale = img_metas['scale'] + + result = {} + + scale_heatmaps_list = [] + scale_tags_list = [] + + for idx, s in enumerate(sorted(test_scale_factor, reverse=True)): + image_resized = aug_data[idx].to(img.device) + + features = self.backbone(image_resized) + if self.with_keypoint: + outputs = self.keypoint_head(features) + + heatmaps, tags = split_ae_outputs( + outputs, self.test_cfg['num_joints'], + self.test_cfg['with_heatmaps'], self.test_cfg['with_ae'], + self.test_cfg.get('select_output_index', range(len(outputs)))) + + if self.test_cfg.get('flip_test', True): + # use flip test + features_flipped = self.backbone( + torch.flip(image_resized, [3])) + if self.with_keypoint: + outputs_flipped = self.keypoint_head(features_flipped) + + heatmaps_flipped, tags_flipped = split_ae_outputs( + outputs_flipped, self.test_cfg['num_joints'], + self.test_cfg['with_heatmaps'], self.test_cfg['with_ae'], + self.test_cfg.get('select_output_index', + range(len(outputs)))) + + heatmaps_flipped = flip_feature_maps( + heatmaps_flipped, flip_index=img_metas['flip_index']) + if self.test_cfg['tag_per_joint']: + tags_flipped = flip_feature_maps( + tags_flipped, flip_index=img_metas['flip_index']) + else: + tags_flipped = flip_feature_maps( + tags_flipped, flip_index=None, flip_output=True) + + else: + heatmaps_flipped = None + tags_flipped = None + + aggregated_heatmaps = aggregate_stage_flip( + heatmaps, + heatmaps_flipped, + index=-1, + project2image=self.test_cfg['project2image'], + size_projected=base_size, + align_corners=self.test_cfg.get('align_corners', True), + aggregate_stage='average', + aggregate_flip='average') + + aggregated_tags = aggregate_stage_flip( + tags, + tags_flipped, + index=-1, + project2image=self.test_cfg['project2image'], + size_projected=base_size, + align_corners=self.test_cfg.get('align_corners', True), + aggregate_stage='concat', + aggregate_flip='concat') + + if s == 1 or len(test_scale_factor) == 1: + if isinstance(aggregated_tags, list): + scale_tags_list.extend(aggregated_tags) + else: + scale_tags_list.append(aggregated_tags) + + if isinstance(aggregated_heatmaps, list): + scale_heatmaps_list.extend(aggregated_heatmaps) + else: + scale_heatmaps_list.append(aggregated_heatmaps) + + aggregated_heatmaps = aggregate_scale( + scale_heatmaps_list, + align_corners=self.test_cfg.get('align_corners', True), + aggregate_scale='average') + + aggregated_tags = aggregate_scale( + scale_tags_list, + align_corners=self.test_cfg.get('align_corners', True), + aggregate_scale='unsqueeze_concat') + + heatmap_size = aggregated_heatmaps.shape[2:4] + tag_size = aggregated_tags.shape[2:4] + if heatmap_size != tag_size: + tmp = [] + for idx in range(aggregated_tags.shape[-1]): + tmp.append( + torch.nn.functional.interpolate( + aggregated_tags[..., idx], + size=heatmap_size, + mode='bilinear', + align_corners=self.test_cfg.get('align_corners', 
+ True)).unsqueeze(-1)) + aggregated_tags = torch.cat(tmp, dim=-1) + + # perform grouping + grouped, scores = self.parser.parse(aggregated_heatmaps, + aggregated_tags, + self.test_cfg['adjust'], + self.test_cfg['refine']) + + preds = get_group_preds( + grouped, + center, + scale, [aggregated_heatmaps.size(3), + aggregated_heatmaps.size(2)], + use_udp=self.use_udp) + + image_paths = [] + image_paths.append(img_metas['image_file']) + + if return_heatmap: + output_heatmap = aggregated_heatmaps.detach().cpu().numpy() + else: + output_heatmap = None + + result['preds'] = preds + result['scores'] = scores + result['image_paths'] = image_paths + result['output_heatmap'] = output_heatmap + + return result + + @deprecated_api_warning({'pose_limb_color': 'pose_link_color'}, + cls_name='AssociativeEmbedding') + def show_result(self, + img, + result, + skeleton=None, + kpt_score_thr=0.3, + bbox_color=None, + pose_kpt_color=None, + pose_link_color=None, + radius=4, + thickness=1, + font_scale=0.5, + win_name='', + show=False, + show_keypoint_weight=False, + wait_time=0, + out_file=None): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (list[dict]): The results to draw over `img` + (bbox_result, pose_result). + skeleton (list[list]): The connection of keypoints. + skeleton is 0-based indexing. + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. + If None, do not draw keypoints. + pose_link_color (np.array[Mx3]): Color of M links. + If None, do not draw links. + radius (int): Radius of circles. + thickness (int): Thickness of lines. + font_scale (float): Font scales of texts. + win_name (str): The window name. + show (bool): Whether to show the image. Default: False. + show_keypoint_weight (bool): Whether to change the transparency + using the predicted confidence scores of keypoints. + wait_time (int): Value of waitKey param. + Default: 0. + out_file (str or None): The filename to write the image. + Default: None. + + Returns: + Tensor: Visualized image only if not `show` or `out_file` + """ + img = mmcv.imread(img) + img = img.copy() + img_h, img_w, _ = img.shape + + pose_result = [] + for res in result: + pose_result.append(res['keypoints']) + + imshow_keypoints(img, pose_result, skeleton, kpt_score_thr, + pose_kpt_color, pose_link_color, radius, thickness) + + if show: + imshow(img, win_name, wait_time) + + if out_file is not None: + imwrite(img, out_file) + + return img diff --git a/mmpose/models/detectors/base.py b/mmpose/models/detectors/base.py new file mode 100644 index 0000000..5d459b4 --- /dev/null +++ b/mmpose/models/detectors/base.py @@ -0,0 +1,131 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from collections import OrderedDict + +import torch +import torch.distributed as dist +import torch.nn as nn + + +class BasePose(nn.Module, metaclass=ABCMeta): + """Base class for pose detectors. + + All recognizers should subclass it. + All subclass should overwrite: + Methods:`forward_train`, supporting to forward when training. + Methods:`forward_test`, supporting to forward when testing. + + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Head modules to give output. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. 
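+
+    Example (a minimal, hedged sketch of a subclass; ``DummyPose`` is hypothetical and only
+    illustrates the methods a detector must implement and how ``train_step`` consumes them)::
+
+        >>> import torch
+        >>> class DummyPose(BasePose):
+        ...     def forward_train(self, img, img_metas, **kwargs):
+        ...         return dict(loss_dummy=img.float().mean() * 0)
+        ...     def forward_test(self, img, img_metas, **kwargs):
+        ...         return dict(preds=[])
+        ...     def forward(self, img, img_metas=None, return_loss=True, **kwargs):
+        ...         if return_loss:
+        ...             return self.forward_train(img, img_metas, **kwargs)
+        ...         return self.forward_test(img, img_metas, **kwargs)
+        ...     def show_result(self, **kwargs):
+        ...         pass
+        >>> out = DummyPose().train_step(
+        ...     dict(img=torch.zeros(2, 3, 8, 8), img_metas=[{}, {}]), optimizer=None)
+        >>> out['num_samples']
+        2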
+ """ + + @abstractmethod + def forward_train(self, img, img_metas, **kwargs): + """Defines the computation performed at training.""" + + @abstractmethod + def forward_test(self, img, img_metas, **kwargs): + """Defines the computation performed at testing.""" + + @abstractmethod + def forward(self, img, img_metas, return_loss=True, **kwargs): + """Forward function.""" + + @staticmethod + def _parse_losses(losses): + """Parse the raw outputs (losses) of the network. + + Args: + losses (dict): Raw output of the network, which usually contain + losses and other necessary information. + + Returns: + tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor \ + which may be a weighted sum of all losses, log_vars \ + contains all the variables to be sent to the logger. + """ + log_vars = OrderedDict() + for loss_name, loss_value in losses.items(): + if isinstance(loss_value, torch.Tensor): + log_vars[loss_name] = loss_value.mean() + elif isinstance(loss_value, float): + log_vars[loss_name] = loss_value + elif isinstance(loss_value, list): + log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) + else: + raise TypeError( + f'{loss_name} is not a tensor or list of tensors or float') + + loss = sum(_value for _key, _value in log_vars.items() + if 'loss' in _key) + + log_vars['loss'] = loss + for loss_name, loss_value in log_vars.items(): + # reduce loss when distributed training + if not isinstance(loss_value, float): + if dist.is_available() and dist.is_initialized(): + loss_value = loss_value.data.clone() + dist.all_reduce(loss_value.div_(dist.get_world_size())) + log_vars[loss_name] = loss_value.item() + else: + log_vars[loss_name] = loss_value + + return loss, log_vars + + def train_step(self, data_batch, optimizer, **kwargs): + """The iteration step during training. + + This method defines an iteration step during training, except for the + back propagation and optimizer updating, which are done in an optimizer + hook. Note that in some complicated cases or models, the whole process + including back propagation and optimizer updating is also defined in + this method, such as GAN. + + Args: + data_batch (dict): The output of dataloader. + optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of + runner is passed to ``train_step()``. This argument is unused + and reserved. + + Returns: + dict: It should contain at least 3 keys: ``loss``, ``log_vars``, + ``num_samples``. + ``loss`` is a tensor for back propagation, which can be a + weighted sum of multiple losses. + ``log_vars`` contains all the variables to be sent to the + logger. + ``num_samples`` indicates the batch size (when the model is + DDP, it means the batch size on each GPU), which is used for + averaging the logs. + """ + losses = self.forward(**data_batch) + + loss, log_vars = self._parse_losses(losses) + + outputs = dict( + loss=loss, + log_vars=log_vars, + num_samples=len(next(iter(data_batch.values())))) + + return outputs + + def val_step(self, data_batch, optimizer, **kwargs): + """The iteration step during validation. + + This method shares the same signature as :func:`train_step`, but used + during val epochs. Note that the evaluation after training epochs is + not implemented with this method, but an evaluation hook. 
+ """ + results = self.forward(return_loss=False, **data_batch) + + outputs = dict(results=results) + + return outputs + + @abstractmethod + def show_result(self, **kwargs): + """Visualize the results.""" + raise NotImplementedError diff --git a/mmpose/models/detectors/interhand_3d.py b/mmpose/models/detectors/interhand_3d.py new file mode 100644 index 0000000..5a4d6bd --- /dev/null +++ b/mmpose/models/detectors/interhand_3d.py @@ -0,0 +1,227 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +from mmcv.utils.misc import deprecated_api_warning + +from mmpose.core import imshow_keypoints, imshow_keypoints_3d +from ..builder import POSENETS +from .top_down import TopDown + + +@POSENETS.register_module() +class Interhand3D(TopDown): + """Top-down interhand 3D pose detector of paper ref: Gyeongsik Moon. + + "InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose + Estimation from a Single RGB Image". A child class of TopDown detector. + """ + + def forward(self, + img, + target=None, + target_weight=None, + img_metas=None, + return_loss=True, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. Note this setting will change the expected inputs. + When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. list[Tensor], list[list[dict]]), with + the outer list indicating test time augmentations. + + Note: + - batch_size: N + - num_keypoints: K + - num_img_channel: C (Default: 3) + - img height: imgH + - img width: imgW + - heatmaps height: H + - heatmaps weight: W + + Args: + img (torch.Tensor[NxCximgHximgW]): Input images. + target (list[torch.Tensor]): Target heatmaps, relative hand + root depth and hand type. + target_weight (list[torch.Tensor]): Weights for target + heatmaps, relative hand root depth and hand type. + img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + - "heatmap3d_depth_bound": depth bound of hand keypoint 3D + heatmap + - "root_depth_bound": depth bound of relative root depth 1D + heatmap + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + + Returns: + dict|tuple: if `return loss` is true, then return losses. \ + Otherwise, return predicted poses, boxes, image paths, \ + heatmaps, relative hand root depth and hand type. 
+ """ + if return_loss: + return self.forward_train(img, target, target_weight, img_metas, + **kwargs) + return self.forward_test(img, img_metas, **kwargs) + + def forward_test(self, img, img_metas, **kwargs): + """Defines the computation performed at every call when testing.""" + assert img.size(0) == len(img_metas) + batch_size, _, img_height, img_width = img.shape + if batch_size > 1: + assert 'bbox_id' in img_metas[0] + + features = self.backbone(img) + if self.with_neck: + features = self.neck(features) + if self.with_keypoint: + output = self.keypoint_head.inference_model( + features, flip_pairs=None) + + if self.test_cfg.get('flip_test', True): + img_flipped = img.flip(3) + features_flipped = self.backbone(img_flipped) + if self.with_neck: + features_flipped = self.neck(features_flipped) + if self.with_keypoint: + output_flipped = self.keypoint_head.inference_model( + features_flipped, img_metas[0]['flip_pairs']) + output = [(out + out_flipped) * 0.5 + for out, out_flipped in zip(output, output_flipped)] + + if self.with_keypoint: + result = self.keypoint_head.decode( + img_metas, output, img_size=[img_width, img_height]) + else: + result = {} + return result + + @deprecated_api_warning({'pose_limb_color': 'pose_link_color'}, + cls_name='Interhand3D') + def show_result(self, + result, + img=None, + skeleton=None, + kpt_score_thr=0.3, + radius=8, + bbox_color='green', + thickness=2, + pose_kpt_color=None, + pose_link_color=None, + vis_height=400, + num_instances=-1, + win_name='', + show=False, + wait_time=0, + out_file=None): + """Visualize 3D pose estimation results. + + Args: + result (list[dict]): The pose estimation results containing: + + - "keypoints_3d" ([K,4]): 3D keypoints + - "keypoints" ([K,3] or [T,K,3]): Optional for visualizing + 2D inputs. If a sequence is given, only the last frame + will be used for visualization + - "bbox" ([4,] or [T,4]): Optional for visualizing 2D inputs + - "title" (str): title for the subplot + img (str or Tensor): Optional. The image to visualize 2D inputs on. + skeleton (list of [idx_i,idx_j]): Skeleton described by a list of + links, each is a pair of joint indices. + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + radius (int): Radius of circles. + bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. + thickness (int): Thickness of lines. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. + If None, do not draw keypoints. + pose_link_color (np.array[Mx3]): Color of M limbs. + If None, do not draw limbs. + vis_height (int): The image height of the visualization. The width + will be N*vis_height depending on the number of visualized + items. + num_instances (int): Number of instances to be shown in 3D. If + smaller than 0, all the instances in the pose_result will be + shown. Otherwise, pad or truncate the pose_result to a length + of num_instances. + win_name (str): The window name. + show (bool): Whether to show the image. Default: False. + wait_time (int): Value of waitKey param. + Default: 0. + out_file (str or None): The filename to write the image. + Default: None. + + Returns: + Tensor: Visualized img, only if not `show` or `out_file`. 
+ """ + if num_instances < 0: + assert len(result) > 0 + result = sorted(result, key=lambda x: x.get('track_id', 0)) + + # draw image and 2d poses + if img is not None: + img = mmcv.imread(img) + + bbox_result = [] + pose_2d = [] + for res in result: + if 'bbox' in res: + bbox = np.array(res['bbox']) + if bbox.ndim != 1: + assert bbox.ndim == 2 + bbox = bbox[-1] # Get bbox from the last frame + bbox_result.append(bbox) + if 'keypoints' in res: + kpts = np.array(res['keypoints']) + if kpts.ndim != 2: + assert kpts.ndim == 3 + kpts = kpts[-1] # Get 2D keypoints from the last frame + pose_2d.append(kpts) + + if len(bbox_result) > 0: + bboxes = np.vstack(bbox_result) + mmcv.imshow_bboxes( + img, + bboxes, + colors=bbox_color, + top_k=-1, + thickness=2, + show=False) + if len(pose_2d) > 0: + imshow_keypoints( + img, + pose_2d, + skeleton, + kpt_score_thr=kpt_score_thr, + pose_kpt_color=pose_kpt_color, + pose_link_color=pose_link_color, + radius=radius, + thickness=thickness) + img = mmcv.imrescale(img, scale=vis_height / img.shape[0]) + + img_vis = imshow_keypoints_3d( + result, + img, + skeleton, + pose_kpt_color, + pose_link_color, + vis_height, + axis_limit=300, + axis_azimuth=-115, + axis_elev=15, + kpt_score_thr=kpt_score_thr, + num_instances=num_instances) + + if show: + mmcv.visualization.imshow(img_vis, win_name, wait_time) + + if out_file is not None: + mmcv.imwrite(img_vis, out_file) + + return img_vis diff --git a/mmpose/models/detectors/mesh.py b/mmpose/models/detectors/mesh.py new file mode 100644 index 0000000..0af18e3 --- /dev/null +++ b/mmpose/models/detectors/mesh.py @@ -0,0 +1,438 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import cv2 +import mmcv +import numpy as np +import torch + +from mmpose.core.visualization.image import imshow_mesh_3d +from mmpose.models.misc.discriminator import SMPLDiscriminator +from .. import builder +from ..builder import POSENETS +from .base import BasePose + + +def set_requires_grad(nets, requires_grad=False): + """Set requies_grad for all the networks. + + Args: + nets (nn.Module | list[nn.Module]): A list of networks or a single + network. + requires_grad (bool): Whether the networks require gradients or not + """ + if not isinstance(nets, list): + nets = [nets] + for net in nets: + if net is not None: + for param in net.parameters(): + param.requires_grad = requires_grad + + +@POSENETS.register_module() +class ParametricMesh(BasePose): + """Model-based 3D human mesh detector. Take a single color image as input + and output 3D joints, SMPL parameters and camera parameters. + + Args: + backbone (dict): Backbone modules to extract feature. + mesh_head (dict): Mesh head to process feature. + smpl (dict): Config for SMPL model. + disc (dict): Discriminator for SMPL parameters. Default: None. + loss_gan (dict): Config for adversarial loss. Default: None. + loss_mesh (dict): Config for mesh loss. Default: None. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + pretrained (str): Path to the pretrained models. 
+ """ + + def __init__(self, + backbone, + mesh_head, + smpl, + disc=None, + loss_gan=None, + loss_mesh=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super().__init__() + + self.backbone = builder.build_backbone(backbone) + self.mesh_head = builder.build_head(mesh_head) + self.generator = torch.nn.Sequential(self.backbone, self.mesh_head) + + self.smpl = builder.build_mesh_model(smpl) + + self.with_gan = disc is not None and loss_gan is not None + if self.with_gan: + self.discriminator = SMPLDiscriminator(**disc) + self.loss_gan = builder.build_loss(loss_gan) + self.disc_step_count = 0 + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + self.loss_mesh = builder.build_loss(loss_mesh) + self.init_weights(pretrained=pretrained) + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + self.backbone.init_weights(pretrained) + self.mesh_head.init_weights() + if self.with_gan: + self.discriminator.init_weights() + + def train_step(self, data_batch, optimizer, **kwargs): + """Train step function. + + In this function, the detector will finish the train step following + the pipeline: + + 1. get fake and real SMPL parameters + 2. optimize discriminator (if have) + 3. optimize generator + + If `self.train_cfg.disc_step > 1`, the train step will contain multiple + iterations for optimizing discriminator with different input data and + only one iteration for optimizing generator after `disc_step` + iterations for discriminator. + + Args: + data_batch (torch.Tensor): Batch of data as input. + optimizer (dict[torch.optim.Optimizer]): Dict with optimizers for + generator and discriminator (if have). + + Returns: + outputs (dict): Dict with loss, information for logger, + the number of samples. + """ + + img = data_batch['img'] + pred_smpl = self.generator(img) + pred_pose, pred_beta, pred_camera = pred_smpl + + # optimize discriminator (if have) + if self.train_cfg['disc_step'] > 0 and self.with_gan: + set_requires_grad(self.discriminator, True) + fake_data = (pred_camera.detach(), pred_pose.detach(), + pred_beta.detach()) + mosh_theta = data_batch['mosh_theta'] + real_data = (mosh_theta[:, :3], mosh_theta[:, + 3:75], mosh_theta[:, + 75:]) + fake_score = self.discriminator(fake_data) + real_score = self.discriminator(real_data) + + disc_losses = {} + disc_losses['real_loss'] = self.loss_gan( + real_score, target_is_real=True, is_disc=True) + disc_losses['fake_loss'] = self.loss_gan( + fake_score, target_is_real=False, is_disc=True) + loss_disc, log_vars_d = self._parse_losses(disc_losses) + + optimizer['discriminator'].zero_grad() + loss_disc.backward() + optimizer['discriminator'].step() + self.disc_step_count = \ + (self.disc_step_count + 1) % self.train_cfg['disc_step'] + + if self.disc_step_count != 0: + outputs = dict( + loss=loss_disc, + log_vars=log_vars_d, + num_samples=len(next(iter(data_batch.values())))) + return outputs + + # optimize generator + pred_out = self.smpl( + betas=pred_beta, + body_pose=pred_pose[:, 1:], + global_orient=pred_pose[:, :1]) + pred_vertices, pred_joints_3d = pred_out['vertices'], pred_out[ + 'joints'] + + gt_beta = data_batch['beta'] + gt_pose = data_batch['pose'] + gt_vertices = self.smpl( + betas=gt_beta, + body_pose=gt_pose[:, 3:], + global_orient=gt_pose[:, :3])['vertices'] + + pred = dict( + pose=pred_pose, + beta=pred_beta, + camera=pred_camera, + vertices=pred_vertices, + joints_3d=pred_joints_3d) + + target = { + key: data_batch[key] + for key in [ + 'pose', 'beta', 'has_smpl', 'joints_3d', 'joints_2d', 
+ 'joints_3d_visible', 'joints_2d_visible' + ] + } + target['vertices'] = gt_vertices + + losses = self.loss_mesh(pred, target) + + if self.with_gan: + set_requires_grad(self.discriminator, False) + pred_theta = (pred_camera, pred_pose, pred_beta) + pred_score = self.discriminator(pred_theta) + loss_adv = self.loss_gan( + pred_score, target_is_real=True, is_disc=False) + losses['adv_loss'] = loss_adv + + loss, log_vars = self._parse_losses(losses) + optimizer['generator'].zero_grad() + loss.backward() + optimizer['generator'].step() + + outputs = dict( + loss=loss, + log_vars=log_vars, + num_samples=len(next(iter(data_batch.values())))) + + return outputs + + def forward_train(self, *args, **kwargs): + """Forward function for training. + + For ParametricMesh, we do not use this interface. + """ + raise NotImplementedError('This interface should not be used in ' + 'current training schedule. Please use ' + '`train_step` for training.') + + def val_step(self, data_batch, **kwargs): + """Forward function for evaluation. + + Args: + data_batch (dict): Contain data for forward. + + Returns: + dict: Contain the results from model. + """ + output = self.forward_test(**data_batch, **kwargs) + return output + + def forward_dummy(self, img): + """Used for computing network FLOPs. + + See ``tools/get_flops.py``. + + Args: + img (torch.Tensor): Input image. + + Returns: + Tensor: Outputs. + """ + output = self.generator(img) + return output + + def forward_test(self, + img, + img_metas, + return_vertices=False, + return_faces=False, + **kwargs): + """Defines the computation performed at every call when testing.""" + + pred_smpl = self.generator(img) + pred_pose, pred_beta, pred_camera = pred_smpl + pred_out = self.smpl( + betas=pred_beta, + body_pose=pred_pose[:, 1:], + global_orient=pred_pose[:, :1]) + pred_vertices, pred_joints_3d = pred_out['vertices'], pred_out[ + 'joints'] + + all_preds = {} + all_preds['keypoints_3d'] = pred_joints_3d.detach().cpu().numpy() + all_preds['smpl_pose'] = pred_pose.detach().cpu().numpy() + all_preds['smpl_beta'] = pred_beta.detach().cpu().numpy() + all_preds['camera'] = pred_camera.detach().cpu().numpy() + + if return_vertices: + all_preds['vertices'] = pred_vertices.detach().cpu().numpy() + if return_faces: + all_preds['faces'] = self.smpl.get_faces() + + all_boxes = [] + image_path = [] + for img_meta in img_metas: + box = np.zeros(6, dtype=np.float32) + c = img_meta['center'] + s = img_meta['scale'] + if 'bbox_score' in img_metas: + score = np.array(img_metas['bbox_score']).reshape(-1) + else: + score = 1.0 + box[0:2] = c + box[2:4] = s + box[4] = np.prod(s * 200.0, axis=0) + box[5] = score + all_boxes.append(box) + image_path.append(img_meta['image_file']) + + all_preds['bboxes'] = np.stack(all_boxes, axis=0) + all_preds['image_path'] = image_path + return all_preds + + def get_3d_joints_from_mesh(self, vertices): + """Get 3D joints from 3D mesh using predefined joints regressor.""" + return torch.matmul( + self.joints_regressor.to(vertices.device), vertices) + + def forward(self, img, img_metas=None, return_loss=False, **kwargs): + """Forward function. + + Calls either forward_train or forward_test depending on whether + return_loss=True. + + Note: + - batch_size: N + - num_img_channel: C (Default: 3) + - img height: imgH + - img width: imgW + + Args: + img (torch.Tensor[N x C x imgH x imgW]): Input images. 
+ img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + + Returns: + Return predicted 3D joints, SMPL parameters, boxes and image paths. + """ + + if return_loss: + return self.forward_train(img, img_metas, **kwargs) + return self.forward_test(img, img_metas, **kwargs) + + def show_result(self, + result, + img, + show=False, + out_file=None, + win_name='', + wait_time=0, + bbox_color='green', + mesh_color=(76, 76, 204), + **kwargs): + """Visualize 3D mesh estimation results. + + Args: + result (list[dict]): The mesh estimation results containing: + + - "bbox" (ndarray[4]): instance bounding bbox + - "center" (ndarray[2]): bbox center + - "scale" (ndarray[2]): bbox scale + - "keypoints_3d" (ndarray[K,3]): predicted 3D keypoints + - "camera" (ndarray[3]): camera parameters + - "vertices" (ndarray[V, 3]): predicted 3D vertices + - "faces" (ndarray[F, 3]): mesh faces + img (str or Tensor): Optional. The image to visualize 2D inputs on. + win_name (str): The window name. + show (bool): Whether to show the image. Default: False. + wait_time (int): Value of waitKey param. Default: 0. + out_file (str or None): The filename to write the image. + Default: None. + bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. + mesh_color (str or tuple or :obj:`Color`): Color of mesh surface. + + Returns: + ndarray: Visualized img, only if not `show` or `out_file`. + """ + + if img is not None: + img = mmcv.imread(img) + + focal_length = self.loss_mesh.focal_length + H, W, C = img.shape + img_center = np.array([[0.5 * W], [0.5 * H]]) + + # show bounding boxes + bboxes = [res['bbox'] for res in result] + bboxes = np.vstack(bboxes) + mmcv.imshow_bboxes( + img, bboxes, colors=bbox_color, top_k=-1, thickness=2, show=False) + + vertex_list = [] + face_list = [] + for res in result: + vertices = res['vertices'] + faces = res['faces'] + camera = res['camera'] + camera_center = res['center'] + scale = res['scale'] + + # predicted vertices are in root-relative space, + # we need to translate them to camera space. 
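+            # the weak-perspective camera is [s, tx, ty]: the depth is
+            # recovered as t_z = 2 * f / (bbox_size_px * s), with
+            # bbox_size_px = scale * 200, and the x/y translation is then
+            # corrected for the offset between bbox centre and image centre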
+ translation = np.array([ + camera[1], camera[2], + 2 * focal_length / (scale[0] * 200.0 * camera[0] + 1e-9) + ]) + mean_depth = vertices[:, -1].mean() + translation[-1] + translation[:2] += (camera_center - + img_center[:, 0]) / focal_length * mean_depth + vertices += translation[None, :] + + vertex_list.append(vertices) + face_list.append(faces) + + # render from front view + img_vis = imshow_mesh_3d( + img, + vertex_list, + face_list, + img_center, [focal_length, focal_length], + colors=mesh_color) + + # render from side view + # rotate mesh vertices + R = cv2.Rodrigues(np.array([0, np.radians(90.), 0]))[0] + rot_vertex_list = [np.dot(vert, R) for vert in vertex_list] + + # get the 3D bbox containing all meshes + rot_vertices = np.concatenate(rot_vertex_list, axis=0) + min_corner = rot_vertices.min(0) + max_corner = rot_vertices.max(0) + + center_3d = 0.5 * (min_corner + max_corner) + ratio = 0.8 + bbox3d_size = max_corner - min_corner + + # set appropriate translation to make all meshes appear in the image + z_x = bbox3d_size[0] * focal_length / (ratio * W) - min_corner[2] + z_y = bbox3d_size[1] * focal_length / (ratio * H) - min_corner[2] + z = max(z_x, z_y) + translation = -center_3d + translation[2] = z + translation = translation[None, :] + rot_vertex_list = [ + rot_vert + translation for rot_vert in rot_vertex_list + ] + + # render from side view + img_side = imshow_mesh_3d( + np.ones_like(img) * 255, rot_vertex_list, face_list, img_center, + [focal_length, focal_length]) + + # merger images from front view and side view + img_vis = np.concatenate([img_vis, img_side], axis=1) + + if show: + mmcv.visualization.imshow(img_vis, win_name, wait_time) + + if out_file is not None: + mmcv.imwrite(img_vis, out_file) + + return img_vis diff --git a/mmpose/models/detectors/multi_task.py b/mmpose/models/detectors/multi_task.py new file mode 100644 index 0000000..1b6f317 --- /dev/null +++ b/mmpose/models/detectors/multi_task.py @@ -0,0 +1,187 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn + +from .. import builder +from ..builder import POSENETS + + +@POSENETS.register_module() +class MultiTask(nn.Module): + """Multi-task detectors. + + Args: + backbone (dict): Backbone modules to extract feature. + heads (list[dict]): heads to output predictions. + necks (list[dict] | None): necks to process feature. + head2neck (dict{int:int}): head index to neck index. + pretrained (str): Path to the pretrained models. 
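+
+    Example:
+        Head-to-neck wiring sketch (the ``type`` names below are
+        placeholders, not real registry entries)::
+
+            model = MultiTask(
+                backbone=dict(type='SomeBackbone'),
+                heads=[dict(type='HeadA'), dict(type='HeadB')],
+                necks=[dict(type='NeckA')],
+                # head 0 -> necks[0]; head 1 is unmapped, so it falls back
+                # to the trailing nn.Identity() (index -1)
+                head2neck={0: 0})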
+ """ + + def __init__(self, + backbone, + heads, + necks=None, + head2neck=None, + pretrained=None): + super().__init__() + + self.backbone = builder.build_backbone(backbone) + + if head2neck is None: + assert necks is None + head2neck = {} + + self.head2neck = {} + for i in range(len(heads)): + self.head2neck[i] = head2neck[i] if i in head2neck else -1 + + self.necks = nn.ModuleList([]) + if necks is not None: + for neck in necks: + self.necks.append(builder.build_neck(neck)) + self.necks.append(nn.Identity()) + + self.heads = nn.ModuleList([]) + assert heads is not None + for head in heads: + assert head is not None + self.heads.append(builder.build_head(head)) + + self.init_weights(pretrained=pretrained) + + @property + def with_necks(self): + """Check if has keypoint_head.""" + return hasattr(self, 'necks') + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + self.backbone.init_weights(pretrained) + if self.with_necks: + for neck in self.necks: + if hasattr(neck, 'init_weights'): + neck.init_weights() + + for head in self.heads: + if hasattr(head, 'init_weights'): + head.init_weights() + + def forward(self, + img, + target=None, + target_weight=None, + img_metas=None, + return_loss=True, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. Note this setting will change the expected inputs. + When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + + Note: + - batch_size: N + - num_keypoints: K + - num_img_channel: C (Default: 3) + - img height: imgH + - img weight: imgW + - heatmaps height: H + - heatmaps weight: W + + Args: + img (torch.Tensor[N,C,imgH,imgW]): Input images. + target (list[torch.Tensor]): Targets. + target_weight (List[torch.Tensor]): Weights. + img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + + Returns: + dict|tuple: if `return loss` is true, then return losses. \ + Otherwise, return predicted poses, boxes, image paths \ + and heatmaps. 
+ """ + if return_loss: + return self.forward_train(img, target, target_weight, img_metas, + **kwargs) + return self.forward_test(img, img_metas, **kwargs) + + def forward_train(self, img, target, target_weight, img_metas, **kwargs): + """Defines the computation performed at every call when training.""" + features = self.backbone(img) + outputs = [] + + for head_id, head in enumerate(self.heads): + neck_id = self.head2neck[head_id] + outputs.append(head(self.necks[neck_id](features))) + + # if return loss + losses = dict() + + for head, output, gt, gt_weight in zip(self.heads, outputs, target, + target_weight): + loss = head.get_loss(output, gt, gt_weight) + assert len(set(losses.keys()).intersection(set(loss.keys()))) == 0 + losses.update(loss) + + if hasattr(head, 'get_accuracy'): + acc = head.get_accuracy(output, gt, gt_weight) + assert len(set(losses.keys()).intersection(set( + acc.keys()))) == 0 + losses.update(acc) + + return losses + + def forward_test(self, img, img_metas, **kwargs): + """Defines the computation performed at every call when testing.""" + assert img.size(0) == len(img_metas) + batch_size, _, img_height, img_width = img.shape + if batch_size > 1: + assert 'bbox_id' in img_metas[0] + + results = {} + + features = self.backbone(img) + outputs = [] + + for head_id, head in enumerate(self.heads): + neck_id = self.head2neck[head_id] + if hasattr(head, 'inference_model'): + head_output = head.inference_model( + self.necks[neck_id](features), flip_pairs=None) + else: + head_output = head( + self.necks[neck_id](features)).detach().cpu().numpy() + outputs.append(head_output) + + for head, output in zip(self.heads, outputs): + result = head.decode( + img_metas, output, img_size=[img_width, img_height]) + results.update(result) + return results + + def forward_dummy(self, img): + """Used for computing network FLOPs. + + See ``tools/get_flops.py``. + + Args: + img (torch.Tensor): Input image. + + Returns: + list[Tensor]: Outputs. + """ + features = self.backbone(img) + outputs = [] + for head_id, head in enumerate(self.heads): + neck_id = self.head2neck[head_id] + outputs.append(head(self.necks[neck_id](features))) + return outputs diff --git a/mmpose/models/detectors/multiview_pose.py b/mmpose/models/detectors/multiview_pose.py new file mode 100644 index 0000000..c3d2221 --- /dev/null +++ b/mmpose/models/detectors/multiview_pose.py @@ -0,0 +1,889 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.runner import load_checkpoint + +from mmpose.core.camera import SimpleCameraTorch +from mmpose.core.post_processing.post_transforms import ( + affine_transform_torch, get_affine_transform) +from .. import builder +from ..builder import POSENETS +from .base import BasePose + + +class ProjectLayer(nn.Module): + + def __init__(self, image_size, heatmap_size): + """Project layer to get voxel feature. Adapted from + https://github.com/microsoft/voxelpose- + pytorch/blob/main/lib/models/project_layer.py. 
+ + Args: + image_size (int or list): input size of the 2D model + heatmap_size (int or list): output size of the 2D model + """ + super(ProjectLayer, self).__init__() + self.image_size = image_size + self.heatmap_size = heatmap_size + if isinstance(self.image_size, int): + self.image_size = [self.image_size, self.image_size] + if isinstance(self.heatmap_size, int): + self.heatmap_size = [self.heatmap_size, self.heatmap_size] + + def compute_grid(self, box_size, box_center, num_bins, device=None): + if isinstance(box_size, int) or isinstance(box_size, float): + box_size = [box_size, box_size, box_size] + if isinstance(num_bins, int): + num_bins = [num_bins, num_bins, num_bins] + + grid_1D_x = torch.linspace( + -box_size[0] / 2, box_size[0] / 2, num_bins[0], device=device) + grid_1D_y = torch.linspace( + -box_size[1] / 2, box_size[1] / 2, num_bins[1], device=device) + grid_1D_z = torch.linspace( + -box_size[2] / 2, box_size[2] / 2, num_bins[2], device=device) + grid_x, grid_y, grid_z = torch.meshgrid( + grid_1D_x + box_center[0], + grid_1D_y + box_center[1], + grid_1D_z + box_center[2], + ) + grid_x = grid_x.contiguous().view(-1, 1) + grid_y = grid_y.contiguous().view(-1, 1) + grid_z = grid_z.contiguous().view(-1, 1) + grid = torch.cat([grid_x, grid_y, grid_z], dim=1) + + return grid + + def get_voxel(self, feature_maps, meta, grid_size, grid_center, cube_size): + device = feature_maps[0].device + batch_size = feature_maps[0].shape[0] + num_channels = feature_maps[0].shape[1] + num_bins = cube_size[0] * cube_size[1] * cube_size[2] + n = len(feature_maps) + cubes = torch.zeros( + batch_size, num_channels, 1, num_bins, n, device=device) + w, h = self.heatmap_size + grids = torch.zeros(batch_size, num_bins, 3, device=device) + bounding = torch.zeros(batch_size, 1, 1, num_bins, n, device=device) + for i in range(batch_size): + if len(grid_center[0]) == 3 or grid_center[i][3] >= 0: + if len(grid_center) == 1: + grid = self.compute_grid( + grid_size, grid_center[0], cube_size, device=device) + else: + grid = self.compute_grid( + grid_size, grid_center[i], cube_size, device=device) + grids[i:i + 1] = grid + for c in range(n): + center = meta[i]['center'][c] + scale = meta[i]['scale'][c] + + width, height = center * 2 + trans = torch.as_tensor( + get_affine_transform(center, scale / 200.0, 0, + self.image_size), + dtype=torch.float, + device=device) + + cam_param = meta[i]['camera'][c].copy() + + single_view_camera = SimpleCameraTorch( + param=cam_param, device=device) + xy = single_view_camera.world_to_pixel(grid) + + bounding[i, 0, 0, :, c] = (xy[:, 0] >= 0) & ( + xy[:, 1] >= 0) & (xy[:, 0] < width) & ( + xy[:, 1] < height) + xy = torch.clamp(xy, -1.0, max(width, height)) + xy = affine_transform_torch(xy, trans) + xy = xy * torch.tensor( + [w, h], dtype=torch.float, + device=device) / torch.tensor( + self.image_size, dtype=torch.float, device=device) + sample_grid = xy / torch.tensor([w - 1, h - 1], + dtype=torch.float, + device=device) * 2.0 - 1.0 + sample_grid = torch.clamp( + sample_grid.view(1, 1, num_bins, 2), -1.1, 1.1) + + cubes[i:i + 1, :, :, :, c] += F.grid_sample( + feature_maps[c][i:i + 1, :, :, :], + sample_grid, + align_corners=True) + + cubes = torch.sum( + torch.mul(cubes, bounding), dim=-1) / ( + torch.sum(bounding, dim=-1) + 1e-6) + cubes[cubes != cubes] = 0.0 + cubes = cubes.clamp(0.0, 1.0) + + cubes = cubes.view(batch_size, num_channels, cube_size[0], + cube_size[1], cube_size[2]) + return cubes, grids + + def forward(self, feature_maps, meta, grid_size, grid_center, 
cube_size): + cubes, grids = self.get_voxel(feature_maps, meta, grid_size, + grid_center, cube_size) + return cubes, grids + + +@POSENETS.register_module() +class DetectAndRegress(BasePose): + """DetectAndRegress approach for multiview human pose detection. + + Args: + backbone (ConfigDict): Dictionary to construct the 2D pose detector + human_detector (ConfigDict): dictionary to construct human detector + pose_regressor (ConfigDict): dictionary to construct pose regressor + train_cfg (ConfigDict): Config for training. Default: None. + test_cfg (ConfigDict): Config for testing. Default: None. + pretrained (str): Path to the pretrained 2D model. Default: None. + freeze_2d (bool): Whether to freeze the 2D model in training. + Default: True. + """ + + def __init__(self, + backbone, + human_detector, + pose_regressor, + train_cfg=None, + test_cfg=None, + pretrained=None, + freeze_2d=True): + super(DetectAndRegress, self).__init__() + if backbone is not None: + self.backbone = builder.build_posenet(backbone) + if self.training and pretrained is not None: + load_checkpoint(self.backbone, pretrained) + else: + self.backbone = None + + self.freeze_2d = freeze_2d + self.human_detector = builder.MODELS.build(human_detector) + self.pose_regressor = builder.MODELS.build(pose_regressor) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + @staticmethod + def _freeze(model): + """Freeze parameters.""" + model.eval() + for param in model.parameters(): + param.requires_grad = False + + def train(self, mode=True): + """Sets the module in training mode. + Args: + mode (bool): whether to set training mode (``True``) + or evaluation mode (``False``). Default: ``True``. + + Returns: + Module: self + """ + super().train(mode) + if mode and self.freeze_2d and self.backbone is not None: + self._freeze(self.backbone) + + return self + + def forward(self, + img=None, + img_metas=None, + return_loss=True, + targets=None, + masks=None, + targets_3d=None, + input_heatmaps=None, + **kwargs): + """ + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + feature_maps width: W + feature_maps height: H + volume_length: cubeL + volume_width: cubeW + volume_height: cubeH + + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. + img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + return_loss: Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + targets (list(torch.Tensor[NxKxHxW])): + Multi-camera target feature_maps of the 2D model. + masks (list(torch.Tensor[NxHxW])): + Multi-camera masks of the input to the 2D model. + targets_3d (torch.Tensor[NxcubeLxcubeWxcubeH]): + Ground-truth 3D heatmap of human centers. + input_heatmaps (list(torch.Tensor[NxKxHxW])): + Multi-camera feature_maps when the 2D model is not available. + Default: None. + **kwargs: + + Returns: + dict: if 'return_loss' is true, then return losses. + Otherwise, return predicted poses, human centers and sample_id + + """ + if return_loss: + return self.forward_train(img, img_metas, targets, masks, + targets_3d, input_heatmaps) + else: + return self.forward_test(img, img_metas, input_heatmaps) + + def train_step(self, data_batch, optimizer, **kwargs): + """The iteration step during training. + + This method defines an iteration step during training, except for the + back propagation and optimizer updating, which are done in an optimizer + hook. 
Note that in some complicated cases or models, the whole process + including back propagation and optimizer updating is also defined in + this method, such as GAN. + + Args: + data_batch (dict): The output of dataloader. + optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of + runner is passed to ``train_step()``. This argument is unused + and reserved. + + Returns: + dict: It should contain at least 3 keys: ``loss``, ``log_vars``, + ``num_samples``. + ``loss`` is a tensor for back propagation, which can be a + weighted sum of multiple losses. + ``log_vars`` contains all the variables to be sent to the + logger. + ``num_samples`` indicates the batch size (when the model is + DDP, it means the batch size on each GPU), which is used for + averaging the logs. + """ + losses = self.forward(**data_batch) + + loss, log_vars = self._parse_losses(losses) + if 'img' in data_batch: + batch_size = data_batch['img'][0].shape[0] + else: + assert 'input_heatmaps' in data_batch + batch_size = data_batch['input_heatmaps'][0][0].shape[0] + + outputs = dict(loss=loss, log_vars=log_vars, num_samples=batch_size) + + return outputs + + def forward_train(self, + img, + img_metas, + targets=None, + masks=None, + targets_3d=None, + input_heatmaps=None): + """ + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + feature_maps width: W + feature_maps height: H + volume_length: cubeL + volume_width: cubeW + volume_height: cubeH + + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. + img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + targets (list(torch.Tensor[NxKxHxW])): + Multi-camera target feature_maps of the 2D model. + masks (list(torch.Tensor[NxHxW])): + Multi-camera masks of the input to the 2D model. + targets_3d (torch.Tensor[NxcubeLxcubeWxcubeH]): + Ground-truth 3D heatmap of human centers. + input_heatmaps (list(torch.Tensor[NxKxHxW])): + Multi-camera feature_maps when the 2D model is not available. + Default: None. + + Returns: + dict: losses. + + """ + if self.backbone is None: + assert input_heatmaps is not None + feature_maps = [] + for input_heatmap in input_heatmaps: + feature_maps.append(input_heatmap[0]) + else: + feature_maps = [] + assert isinstance(img, list) + for img_ in img: + feature_maps.append(self.backbone.forward_dummy(img_)[0]) + + losses = dict() + human_candidates, human_loss = self.human_detector.forward_train( + None, img_metas, feature_maps, targets_3d, return_preds=True) + losses.update(human_loss) + + pose_loss = self.pose_regressor( + None, + img_metas, + return_loss=True, + feature_maps=feature_maps, + human_candidates=human_candidates) + losses.update(pose_loss) + + if not self.freeze_2d: + losses_2d = {} + heatmaps_tensor = torch.cat(feature_maps, dim=0) + targets_tensor = torch.cat(targets, dim=0) + masks_tensor = torch.cat(masks, dim=0) + losses_2d_ = self.backbone.get_loss(heatmaps_tensor, + targets_tensor, masks_tensor) + for k, v in losses_2d_.items(): + losses_2d[k + '_2d'] = v + losses.update(losses_2d) + + return losses + + def forward_test( + self, + img, + img_metas, + input_heatmaps=None, + ): + """ + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + feature_maps width: W + feature_maps height: H + volume_length: cubeL + volume_width: cubeW + volume_height: cubeH + + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. 
+ img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + input_heatmaps (list(torch.Tensor[NxKxHxW])): + Multi-camera feature_maps when the 2D model is not available. + Default: None. + + Returns: + dict: predicted poses, human centers and sample_id + + """ + if self.backbone is None: + assert input_heatmaps is not None + feature_maps = [] + for input_heatmap in input_heatmaps: + feature_maps.append(input_heatmap[0]) + else: + feature_maps = [] + assert isinstance(img, list) + for img_ in img: + feature_maps.append(self.backbone.forward_dummy(img_)[0]) + + human_candidates = self.human_detector.forward_test( + None, img_metas, feature_maps) + + human_poses = self.pose_regressor( + None, + img_metas, + return_loss=False, + feature_maps=feature_maps, + human_candidates=human_candidates) + + result = {} + result['pose_3d'] = human_poses.cpu().numpy() + result['human_detection_3d'] = human_candidates.cpu().numpy() + result['sample_id'] = [img_meta['sample_id'] for img_meta in img_metas] + + return result + + def show_result(self, **kwargs): + """Visualize the results.""" + raise NotImplementedError + + def forward_dummy(self, img, input_heatmaps=None, num_candidates=5): + """Used for computing network FLOPs.""" + if self.backbone is None: + assert input_heatmaps is not None + feature_maps = [] + for input_heatmap in input_heatmaps: + feature_maps.append(input_heatmap[0]) + else: + feature_maps = [] + assert isinstance(img, list) + for img_ in img: + feature_maps.append(self.backbone.forward_dummy(img_)[0]) + + _ = self.human_detector.forward_dummy(feature_maps) + + _ = self.pose_regressor.forward_dummy(feature_maps, num_candidates) + + +@POSENETS.register_module() +class VoxelSinglePose(BasePose): + """VoxelPose Please refer to the `paper ` + for details. + + Args: + image_size (list): input size of the 2D model. + heatmap_size (list): output size of the 2D model. + sub_space_size (list): Size of the cuboid human proposal. + sub_cube_size (list): Size of the input volume to the pose net. + pose_net (ConfigDict): Dictionary to construct the pose net. + pose_head (ConfigDict): Dictionary to construct the pose head. + train_cfg (ConfigDict): Config for training. Default: None. + test_cfg (ConfigDict): Config for testing. Default: None. + """ + + def __init__( + self, + image_size, + heatmap_size, + sub_space_size, + sub_cube_size, + num_joints, + pose_net, + pose_head, + train_cfg=None, + test_cfg=None, + ): + super(VoxelSinglePose, self).__init__() + self.project_layer = ProjectLayer(image_size, heatmap_size) + self.pose_net = builder.build_backbone(pose_net) + self.pose_head = builder.build_head(pose_head) + + self.sub_space_size = sub_space_size + self.sub_cube_size = sub_cube_size + + self.num_joints = num_joints + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def forward(self, + img, + img_metas, + return_loss=True, + feature_maps=None, + human_candidates=None, + **kwargs): + """ + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + feature_maps width: W + feature_maps height: H + volume_length: cubeL + volume_width: cubeW + volume_height: cubeH + + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. + feature_maps (list(torch.Tensor[NxCxHxW])): + Multi-camera input feature_maps. + img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + human_candidates (torch.Tensor[NxPx5]): + Human candidates. 
+ return_loss: Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + + """ + if return_loss: + return self.forward_train(img, img_metas, feature_maps, + human_candidates) + else: + return self.forward_test(img, img_metas, feature_maps, + human_candidates) + + def forward_train(self, + img, + img_metas, + feature_maps=None, + human_candidates=None, + return_preds=False, + **kwargs): + """Defines the computation performed at training. + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + feature_maps width: W + feature_maps height: H + volume_length: cubeL + volume_width: cubeW + volume_height: cubeH + + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. + feature_maps (list(torch.Tensor[NxCxHxW])): + Multi-camera input feature_maps. + img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + human_candidates (torch.Tensor[NxPx5]): + Human candidates. + return_preds (bool): Whether to return prediction results + + Returns: + dict: losses. + + """ + batch_size, num_candidates, _ = human_candidates.shape + pred = human_candidates.new_zeros(batch_size, num_candidates, + self.num_joints, 5) + pred[:, :, :, 3:] = human_candidates[:, :, None, 3:] + + device = feature_maps[0].device + gt_3d = torch.stack([ + torch.tensor(img_meta['joints_3d'], device=device) + for img_meta in img_metas + ]) + gt_3d_vis = torch.stack([ + torch.tensor(img_meta['joints_3d_visible'], device=device) + for img_meta in img_metas + ]) + valid_preds = [] + valid_targets = [] + valid_weights = [] + + for n in range(num_candidates): + index = pred[:, n, 0, 3] >= 0 + num_valid = index.sum() + if num_valid > 0: + pose_input_cube, coordinates \ + = self.project_layer(feature_maps, + img_metas, + self.sub_space_size, + human_candidates[:, n, :3], + self.sub_cube_size) + pose_heatmaps_3d = self.pose_net(pose_input_cube) + pose_3d = self.pose_head(pose_heatmaps_3d[index], + coordinates[index]) + + pred[index, n, :, 0:3] = pose_3d.detach() + valid_targets.append(gt_3d[index, pred[index, n, 0, 3].long()]) + valid_weights.append(gt_3d_vis[index, pred[index, n, 0, + 3].long(), :, + 0:1].float()) + valid_preds.append(pose_3d) + + losses = dict() + if len(valid_preds) > 0: + valid_targets = torch.cat(valid_targets, dim=0) + valid_weights = torch.cat(valid_weights, dim=0) + valid_preds = torch.cat(valid_preds, dim=0) + losses.update( + self.pose_head.get_loss(valid_preds, valid_targets, + valid_weights)) + else: + pose_input_cube = feature_maps[0].new_zeros( + batch_size, self.num_joints, *self.sub_cube_size) + coordinates = feature_maps[0].new_zeros(batch_size, + *self.sub_cube_size, + 3).view(batch_size, -1, 3) + pseudo_targets = feature_maps[0].new_zeros(batch_size, + self.num_joints, 3) + pseudo_weights = feature_maps[0].new_zeros(batch_size, + self.num_joints, 1) + pose_heatmaps_3d = self.pose_net(pose_input_cube) + pose_3d = self.pose_head(pose_heatmaps_3d, coordinates) + losses.update( + self.pose_head.get_loss(pose_3d, pseudo_targets, + pseudo_weights)) + if return_preds: + return pred, losses + else: + return losses + + def forward_test(self, + img, + img_metas, + feature_maps=None, + human_candidates=None, + **kwargs): + """Defines the computation performed at training. 
+ Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + feature_maps width: W + feature_maps height: H + volume_length: cubeL + volume_width: cubeW + volume_height: cubeH + + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. + feature_maps (list(torch.Tensor[NxCxHxW])): + Multi-camera input feature_maps. + img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + human_candidates (torch.Tensor[NxPx5]): + Human candidates. + + Returns: + dict: predicted poses, human centers and sample_id + + """ + batch_size, num_candidates, _ = human_candidates.shape + pred = human_candidates.new_zeros(batch_size, num_candidates, + self.num_joints, 5) + pred[:, :, :, 3:] = human_candidates[:, :, None, 3:] + + for n in range(num_candidates): + index = pred[:, n, 0, 3] >= 0 + num_valid = index.sum() + if num_valid > 0: + pose_input_cube, coordinates \ + = self.project_layer(feature_maps, + img_metas, + self.sub_space_size, + human_candidates[:, n, :3], + self.sub_cube_size) + pose_heatmaps_3d = self.pose_net(pose_input_cube) + pose_3d = self.pose_head(pose_heatmaps_3d[index], + coordinates[index]) + + pred[index, n, :, 0:3] = pose_3d.detach() + + return pred + + def show_result(self, **kwargs): + """Visualize the results.""" + raise NotImplementedError + + def forward_dummy(self, feature_maps, num_candidates=5): + """Used for computing network FLOPs.""" + batch_size, num_channels = feature_maps[0].shape + pose_input_cube = feature_maps[0].new_zeros(batch_size, num_channels, + *self.sub_cube_size) + for n in range(num_candidates): + _ = self.pose_net(pose_input_cube) + + +@POSENETS.register_module() +class VoxelCenterDetector(BasePose): + """Detect human center by 3D CNN on voxels. + + Please refer to the + `paper ` for details. + Args: + image_size (list): input size of the 2D model. + heatmap_size (list): output size of the 2D model. + space_size (list): Size of the 3D space. + cube_size (list): Size of the input volume to the 3D CNN. + space_center (list): Coordinate of the center of the 3D space. + center_net (ConfigDict): Dictionary to construct the center net. + center_head (ConfigDict): Dictionary to construct the center head. + train_cfg (ConfigDict): Config for training. Default: None. + test_cfg (ConfigDict): Config for testing. Default: None. 
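+
+    Example:
+        Geometry arguments sketch (millimetres for the 3D space, voxel
+        counts for the cube; the numbers below are only indicative of the
+        CMU Panoptic setup and should be taken from the dataset config)::
+
+            space_size = [8000, 8000, 2000]   # x/y/z extent of the scene
+            space_center = [0, -500, 800]     # scene centre in world coords
+            cube_size = [80, 80, 20]          # voxel grid fed to center_net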
+ """ + + def __init__( + self, + image_size, + heatmap_size, + space_size, + cube_size, + space_center, + center_net, + center_head, + train_cfg=None, + test_cfg=None, + ): + super(VoxelCenterDetector, self).__init__() + self.project_layer = ProjectLayer(image_size, heatmap_size) + self.center_net = builder.build_backbone(center_net) + self.center_head = builder.build_head(center_head) + + self.space_size = space_size + self.cube_size = cube_size + self.space_center = space_center + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def assign2gt(self, center_candidates, gt_centers, gt_num_persons): + """"Assign gt id to each valid human center candidate.""" + det_centers = center_candidates[..., :3] + batch_size = center_candidates.shape[0] + cand_num = center_candidates.shape[1] + cand2gt = torch.zeros(batch_size, cand_num) + + for i in range(batch_size): + cand = det_centers[i].view(cand_num, 1, -1) + gt = gt_centers[None, i, :gt_num_persons[i]] + + dist = torch.sqrt(torch.sum((cand - gt)**2, dim=-1)) + min_dist, min_gt = torch.min(dist, dim=-1) + + cand2gt[i] = min_gt + cand2gt[i][min_dist > self.train_cfg['dist_threshold']] = -1.0 + + center_candidates[:, :, 3] = cand2gt + + return center_candidates + + def forward(self, + img, + img_metas, + return_loss=True, + feature_maps=None, + targets_3d=None): + """ + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + heatmaps width: W + heatmaps height: H + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. + img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + return_loss: Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + targets_3d (torch.Tensor[NxcubeLxcubeWxcubeH]): + Ground-truth 3D heatmap of human centers. + feature_maps (list(torch.Tensor[NxKxHxW])): + Multi-camera feature_maps. + Returns: + dict: if 'return_loss' is true, then return losses. + Otherwise, return predicted poses + """ + if return_loss: + return self.forward_train(img, img_metas, feature_maps, targets_3d) + else: + return self.forward_test(img, img_metas, feature_maps) + + def forward_train(self, + img, + img_metas, + feature_maps=None, + targets_3d=None, + return_preds=False): + """ + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + heatmaps width: W + heatmaps height: H + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. + img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + targets_3d (torch.Tensor[NxcubeLxcubeWxcubeH]): + Ground-truth 3D heatmap of human centers. + feature_maps (list(torch.Tensor[NxKxHxW])): + Multi-camera feature_maps. + return_preds (bool): Whether to return prediction results + Returns: + dict: if 'return_pred' is true, then return losses + and human centers. 
Otherwise, return losses only + """ + initial_cubes, _ = self.project_layer(feature_maps, img_metas, + self.space_size, + [self.space_center], + self.cube_size) + center_heatmaps_3d = self.center_net(initial_cubes) + center_heatmaps_3d = center_heatmaps_3d.squeeze(1) + center_candidates = self.center_head(center_heatmaps_3d) + + device = center_candidates.device + + gt_centers = torch.stack([ + torch.tensor(img_meta['roots_3d'], device=device) + for img_meta in img_metas + ]) + gt_num_persons = torch.stack([ + torch.tensor(img_meta['num_persons'], device=device) + for img_meta in img_metas + ]) + center_candidates = self.assign2gt(center_candidates, gt_centers, + gt_num_persons) + + losses = dict() + losses.update( + self.center_head.get_loss(center_heatmaps_3d, targets_3d)) + + if return_preds: + return center_candidates, losses + else: + return losses + + def forward_test(self, img, img_metas, feature_maps=None): + """ + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + heatmaps width: W + heatmaps height: H + Args: + img (list(torch.Tensor[NxCximgHximgW])): + Multi-camera input images to the 2D model. + img_metas (list(dict)): + Information about image, 3D groundtruth and camera parameters. + feature_maps (list(torch.Tensor[NxKxHxW])): + Multi-camera feature_maps. + Returns: + human centers + """ + initial_cubes, _ = self.project_layer(feature_maps, img_metas, + self.space_size, + [self.space_center], + self.cube_size) + center_heatmaps_3d = self.center_net(initial_cubes) + center_heatmaps_3d = center_heatmaps_3d.squeeze(1) + center_candidates = self.center_head(center_heatmaps_3d) + center_candidates[..., 3] = \ + (center_candidates[..., 4] > + self.test_cfg['center_threshold']).float() - 1.0 + + return center_candidates + + def show_result(self, **kwargs): + """Visualize the results.""" + raise NotImplementedError + + def forward_dummy(self, feature_maps): + """Used for computing network FLOPs.""" + batch_size, num_channels, _, _ = feature_maps[0].shape + initial_cubes = feature_maps[0].new_zeros(batch_size, num_channels, + *self.cube_size) + _ = self.center_net(initial_cubes) diff --git a/mmpose/models/detectors/pose_lifter.py b/mmpose/models/detectors/pose_lifter.py new file mode 100644 index 0000000..ace6b9f --- /dev/null +++ b/mmpose/models/detectors/pose_lifter.py @@ -0,0 +1,392 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import mmcv +import numpy as np +from mmcv.utils.misc import deprecated_api_warning + +from mmpose.core import imshow_bboxes, imshow_keypoints, imshow_keypoints_3d +from .. import builder +from ..builder import POSENETS +from .base import BasePose + +try: + from mmcv.runner import auto_fp16 +except ImportError: + warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0' + 'Please install mmcv>=1.1.4') + from mmpose.core import auto_fp16 + + +@POSENETS.register_module() +class PoseLifter(BasePose): + """Pose lifter that lifts 2D pose to 3D pose. + + The basic model is a pose model that predicts root-relative pose. If + traj_head is not None, a trajectory model that predicts absolute root joint + position is also built. + + Args: + backbone (dict): Config for the backbone of pose model. + neck (dict|None): Config for the neck of pose model. + keypoint_head (dict|None): Config for the head of pose model. + traj_backbone (dict|None): Config for the backbone of trajectory model. 
+ If traj_backbone is None and traj_head is not None, trajectory + model will share backbone with pose model. + traj_neck (dict|None): Config for the neck of trajectory model. + traj_head (dict|None): Config for the head of trajectory model. + loss_semi (dict|None): Config for semi-supervision loss. + train_cfg (dict|None): Config for keypoint head during training. + test_cfg (dict|None): Config for keypoint head during testing. + pretrained (str|None): Path to pretrained weights. + """ + + def __init__(self, + backbone, + neck=None, + keypoint_head=None, + traj_backbone=None, + traj_neck=None, + traj_head=None, + loss_semi=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super().__init__() + self.fp16_enabled = False + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + # pose model + self.backbone = builder.build_backbone(backbone) + + if neck is not None: + self.neck = builder.build_neck(neck) + + if keypoint_head is not None: + keypoint_head['train_cfg'] = train_cfg + keypoint_head['test_cfg'] = test_cfg + self.keypoint_head = builder.build_head(keypoint_head) + + # trajectory model + if traj_head is not None: + self.traj_head = builder.build_head(traj_head) + + if traj_backbone is not None: + self.traj_backbone = builder.build_backbone(traj_backbone) + else: + self.traj_backbone = self.backbone + + if traj_neck is not None: + self.traj_neck = builder.build_neck(traj_neck) + + # semi-supervised learning + self.semi = loss_semi is not None + if self.semi: + assert keypoint_head is not None and traj_head is not None + self.loss_semi = builder.build_loss(loss_semi) + + self.init_weights(pretrained=pretrained) + + @property + def with_neck(self): + """Check if has keypoint_neck.""" + return hasattr(self, 'neck') + + @property + def with_keypoint(self): + """Check if has keypoint_head.""" + return hasattr(self, 'keypoint_head') + + @property + def with_traj_backbone(self): + """Check if has trajectory_backbone.""" + return hasattr(self, 'traj_backbone') + + @property + def with_traj_neck(self): + """Check if has trajectory_neck.""" + return hasattr(self, 'traj_neck') + + @property + def with_traj(self): + """Check if has trajectory_head.""" + return hasattr(self, 'traj_head') + + @property + def causal(self): + if hasattr(self.backbone, 'causal'): + return self.backbone.causal + else: + raise AttributeError('A PoseLifter\'s backbone should have ' + 'the bool attribute "causal" to indicate if' + 'it performs causal inference.') + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + self.backbone.init_weights(pretrained) + if self.with_neck: + self.neck.init_weights() + if self.with_keypoint: + self.keypoint_head.init_weights() + if self.with_traj_backbone: + self.traj_backbone.init_weights(pretrained) + if self.with_traj_neck: + self.traj_neck.init_weights() + if self.with_traj: + self.traj_head.init_weights() + + @auto_fp16(apply_to=('input', )) + def forward(self, + input, + target=None, + target_weight=None, + metas=None, + return_loss=True, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. + + Note: + - batch_size: N + - num_input_keypoints: Ki + - input_keypoint_dim: Ci + - input_sequence_len: Ti + - num_output_keypoints: Ko + - output_keypoint_dim: Co + - input_sequence_len: To + + Args: + input (torch.Tensor[NxKixCixTi]): Input keypoint coordinates. + target (torch.Tensor[NxKoxCoxTo]): Output keypoint coordinates. + Defaults to None. 
+ target_weight (torch.Tensor[NxKox1]): Weights across different + joint types. Defaults to None. + metas (list(dict)): Information about data augmentation + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + + Returns: + dict|Tensor: If `reutrn_loss` is true, return losses. \ + Otherwise return predicted poses. + """ + if return_loss: + return self.forward_train(input, target, target_weight, metas, + **kwargs) + else: + return self.forward_test(input, metas, **kwargs) + + def forward_train(self, input, target, target_weight, metas, **kwargs): + """Defines the computation performed at every call when training.""" + assert input.size(0) == len(metas) + + # supervised learning + # pose model + features = self.backbone(input) + if self.with_neck: + features = self.neck(features) + if self.with_keypoint: + output = self.keypoint_head(features) + + losses = dict() + if self.with_keypoint: + keypoint_losses = self.keypoint_head.get_loss( + output, target, target_weight) + keypoint_accuracy = self.keypoint_head.get_accuracy( + output, target, target_weight, metas) + losses.update(keypoint_losses) + losses.update(keypoint_accuracy) + + # trajectory model + if self.with_traj: + traj_features = self.traj_backbone(input) + if self.with_traj_neck: + traj_features = self.traj_neck(traj_features) + traj_output = self.traj_head(traj_features) + + traj_losses = self.traj_head.get_loss(traj_output, + kwargs['traj_target'], None) + losses.update(traj_losses) + + # semi-supervised learning + if self.semi: + ul_input = kwargs['unlabeled_input'] + ul_features = self.backbone(ul_input) + if self.with_neck: + ul_features = self.neck(ul_features) + ul_output = self.keypoint_head(ul_features) + + ul_traj_features = self.traj_backbone(ul_input) + if self.with_traj_neck: + ul_traj_features = self.traj_neck(ul_traj_features) + ul_traj_output = self.traj_head(ul_traj_features) + + output_semi = dict( + labeled_pose=output, + unlabeled_pose=ul_output, + unlabeled_traj=ul_traj_output) + target_semi = dict( + unlabeled_target_2d=kwargs['unlabeled_target_2d'], + intrinsics=kwargs['intrinsics']) + + semi_losses = self.loss_semi(output_semi, target_semi) + losses.update(semi_losses) + + return losses + + def forward_test(self, input, metas, **kwargs): + """Defines the computation performed at every call when training.""" + assert input.size(0) == len(metas) + + results = {} + + features = self.backbone(input) + if self.with_neck: + features = self.neck(features) + if self.with_keypoint: + output = self.keypoint_head.inference_model(features) + keypoint_result = self.keypoint_head.decode(metas, output) + results.update(keypoint_result) + + if self.with_traj: + traj_features = self.traj_backbone(input) + if self.with_traj_neck: + traj_features = self.traj_neck(traj_features) + traj_output = self.traj_head.inference_model(traj_features) + results['traj_preds'] = traj_output + + return results + + def forward_dummy(self, input): + """Used for computing network FLOPs. See ``tools/get_flops.py``. 
+
+        Args:
+            input (torch.Tensor): Input pose
+
+        Returns:
+            Tensor: Model output
+        """
+        output = self.backbone(input)
+        if self.with_neck:
+            output = self.neck(output)
+        if self.with_keypoint:
+            output = self.keypoint_head(output)
+
+        if self.with_traj:
+            traj_features = self.traj_backbone(input)
+            if self.with_traj_neck:
+                traj_features = self.traj_neck(traj_features)
+            traj_output = self.traj_head(traj_features)
+            output = output + traj_output
+
+        return output
+
+    @deprecated_api_warning({'pose_limb_color': 'pose_link_color'},
+                            cls_name='PoseLifter')
+    def show_result(self,
+                    result,
+                    img=None,
+                    skeleton=None,
+                    pose_kpt_color=None,
+                    pose_link_color=None,
+                    radius=8,
+                    thickness=2,
+                    vis_height=400,
+                    num_instances=-1,
+                    win_name='',
+                    show=False,
+                    wait_time=0,
+                    out_file=None):
+        """Visualize 3D pose estimation results.
+
+        Args:
+            result (list[dict]): The pose estimation results containing:
+
+                - "keypoints_3d" ([K,4]): 3D keypoints
+                - "keypoints" ([K,3] or [T,K,3]): Optional for visualizing
+                    2D inputs. If a sequence is given, only the last frame
+                    will be used for visualization
+                - "bbox" ([4,] or [T,4]): Optional for visualizing 2D inputs
+                - "title" (str): title for the subplot
+            img (str or Tensor): Optional. The image to visualize 2D inputs on.
+            skeleton (list of [idx_i,idx_j]): Skeleton described by a list of
+                links, each is a pair of joint indices.
+            pose_kpt_color (np.array[Nx3]): Color of N keypoints.
+                If None, do not draw keypoints.
+            pose_link_color (np.array[Mx3]): Color of M links.
+                If None, do not draw links.
+            radius (int): Radius of circles.
+            thickness (int): Thickness of lines.
+            vis_height (int): The image height of the visualization. The width
+                will be N*vis_height depending on the number of visualized
+                items.
+            num_instances (int): Number of instances to be shown in 3D. If
+                smaller than 0, all detected instances will be shown.
+                Otherwise, the results are padded or truncated to this length.
+            win_name (str): The window name.
+            show (bool): Whether to show the image. Default: False.
+            wait_time (int): Value of waitKey param.
+                Default: 0.
+            out_file (str or None): The filename to write the image.
+                Default: None.
+
+        Returns:
+            Tensor: Visualized img, only if not `show` or `out_file`.
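+
+        A minimal usage sketch follows; the keypoint values, the ``model``
+        name and the output path are illustrative placeholders rather than
+        anything produced by this repository.
+
+        Example::
+
+            import numpy as np
+            # one instance with 17 3D joints; the 4th column is a score
+            result = [dict(
+                keypoints_3d=np.random.rand(17, 4).astype(np.float32),
+                keypoints=np.random.rand(17, 3).astype(np.float32),
+                title='example instance')]
+            # ``model`` is assumed to be an already built PoseLifter
+            # model.show_result(result, num_instances=1, out_file='vis.jpg')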
+ """ + if num_instances < 0: + assert len(result) > 0 + result = sorted(result, key=lambda x: x.get('track_id', 1e4)) + + # draw image and input 2d poses + if img is not None: + img = mmcv.imread(img) + + bbox_result = [] + pose_input_2d = [] + for res in result: + if 'bbox' in res: + bbox = np.array(res['bbox']) + if bbox.ndim != 1: + assert bbox.ndim == 2 + bbox = bbox[-1] # Get bbox from the last frame + bbox_result.append(bbox) + if 'keypoints' in res: + kpts = np.array(res['keypoints']) + if kpts.ndim != 2: + assert kpts.ndim == 3 + kpts = kpts[-1] # Get 2D keypoints from the last frame + pose_input_2d.append(kpts) + + if len(bbox_result) > 0: + bboxes = np.vstack(bbox_result) + imshow_bboxes( + img, + bboxes, + colors='green', + thickness=thickness, + show=False) + if len(pose_input_2d) > 0: + imshow_keypoints( + img, + pose_input_2d, + skeleton, + kpt_score_thr=0.3, + pose_kpt_color=pose_kpt_color, + pose_link_color=pose_link_color, + radius=radius, + thickness=thickness) + img = mmcv.imrescale(img, scale=vis_height / img.shape[0]) + + img_vis = imshow_keypoints_3d( + result, + img, + skeleton, + pose_kpt_color, + pose_link_color, + vis_height, + num_instances=num_instances) + + if show: + mmcv.visualization.imshow(img_vis, win_name, wait_time) + + if out_file is not None: + mmcv.imwrite(img_vis, out_file) + + return img_vis diff --git a/mmpose/models/detectors/posewarper.py b/mmpose/models/detectors/posewarper.py new file mode 100644 index 0000000..aa1d05f --- /dev/null +++ b/mmpose/models/detectors/posewarper.py @@ -0,0 +1,244 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np +import torch + +from ..builder import POSENETS +from .top_down import TopDown + +try: + from mmcv.runner import auto_fp16 +except ImportError: + warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0' + 'Please install mmcv>=1.1.4') + from mmpose.core import auto_fp16 + + +@POSENETS.register_module() +class PoseWarper(TopDown): + """Top-down pose detectors for multi-frame settings for video inputs. + + `"Learning temporal pose estimation from sparsely-labeled videos" + `_. + + A child class of TopDown detector. The main difference between PoseWarper + and TopDown lies in that the former takes a list of tensors as input image + while the latter takes a single tensor as input image in forward method. + + Args: + backbone (dict): Backbone modules to extract features. + neck (dict): intermediate modules to transform features. + keypoint_head (dict): Keypoint head to process feature. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + pretrained (str): Path to the pretrained models. + loss_pose (None): Deprecated arguments. Please use + `loss_keypoint` for heads instead. 
+ concat_tensors (bool): Whether to concat the tensors on the batch dim, + which can speed up, Default: True + """ + + def __init__(self, + backbone, + neck=None, + keypoint_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + loss_pose=None, + concat_tensors=True): + super().__init__( + backbone=backbone, + neck=neck, + keypoint_head=keypoint_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + loss_pose=loss_pose) + self.concat_tensors = concat_tensors + + @auto_fp16(apply_to=('img', )) + def forward(self, + img, + target=None, + target_weight=None, + img_metas=None, + return_loss=True, + return_heatmap=False, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. Note this setting will change the expected inputs. + When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + + Note: + - number of frames: F + - batch_size: N + - num_keypoints: K + - num_img_channel: C (Default: 3) + - img height: imgH + - img width: imgW + - heatmaps height: H + - heatmaps weight: W + + Args: + imgs (list[F,torch.Tensor[N,C,imgH,imgW]]): multiple input frames + target (torch.Tensor[N,K,H,W]): Target heatmaps for one frame. + target_weight (torch.Tensor[N,K,1]): Weights across + different joint types. + img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: paths to multiple video frames + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + return_heatmap (bool) : Option to return heatmap. + + Returns: + dict|tuple: if `return loss` is true, then return losses. \ + Otherwise, return predicted poses, boxes, image paths \ + and heatmaps. 
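+
+        A small sketch of the multi-frame input convention follows; the
+        sizes are illustrative and only show how ``concat_tensors=True``
+        stacks the frames on the batch dimension.
+
+        Example::
+
+            import torch
+            num_frames, N, C, H, W = 3, 2, 3, 256, 192
+            imgs = [torch.randn(N, C, H, W) for _ in range(num_frames)]
+            # frames are concatenated before the shared backbone is applied
+            batched = torch.cat(imgs, 0)
+            assert batched.shape == (num_frames * N, C, H, W)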
+ """ + if return_loss: + return self.forward_train(img, target, target_weight, img_metas, + **kwargs) + return self.forward_test( + img, img_metas, return_heatmap=return_heatmap, **kwargs) + + def forward_train(self, imgs, target, target_weight, img_metas, **kwargs): + """Defines the computation performed at every call when training.""" + # imgs (list[Fxtorch.Tensor[NxCximgHximgW]]): multiple input frames + assert imgs[0].size(0) == len(img_metas) + num_frames = len(imgs) + frame_weight = img_metas[0]['frame_weight'] + + assert num_frames == len(frame_weight), f'The number of frames ' \ + f'({num_frames}) and the length of weights for each frame ' \ + f'({len(frame_weight)}) must match' + + if self.concat_tensors: + features = [self.backbone(torch.cat(imgs, 0))] + else: + features = [self.backbone(img) for img in imgs] + + if self.with_neck: + features = self.neck(features, frame_weight=frame_weight) + + if self.with_keypoint: + output = self.keypoint_head(features) + + # if return loss + losses = dict() + if self.with_keypoint: + keypoint_losses = self.keypoint_head.get_loss( + output, target, target_weight) + losses.update(keypoint_losses) + keypoint_accuracy = self.keypoint_head.get_accuracy( + output, target, target_weight) + losses.update(keypoint_accuracy) + + return losses + + def forward_test(self, imgs, img_metas, return_heatmap=False, **kwargs): + """Defines the computation performed at every call when testing.""" + # imgs (list[Fxtorch.Tensor[NxCximgHximgW]]): multiple input frames + assert imgs[0].size(0) == len(img_metas) + num_frames = len(imgs) + frame_weight = img_metas[0]['frame_weight'] + + assert num_frames == len(frame_weight), f'The number of frames ' \ + f'({num_frames}) and the length of weights for each frame ' \ + f'({len(frame_weight)}) must match' + + batch_size, _, img_height, img_width = imgs[0].shape + + if batch_size > 1: + assert 'bbox_id' in img_metas[0] + + result = {} + + if self.concat_tensors: + features = [self.backbone(torch.cat(imgs, 0))] + else: + features = [self.backbone(img) for img in imgs] + + if self.with_neck: + features = self.neck(features, frame_weight=frame_weight) + + if self.with_keypoint: + output_heatmap = self.keypoint_head.inference_model( + features, flip_pairs=None) + + if self.test_cfg.get('flip_test', True): + imgs_flipped = [img.flip(3) for img in imgs] + + if self.concat_tensors: + features_flipped = [self.backbone(torch.cat(imgs_flipped, 0))] + else: + features_flipped = [ + self.backbone(img_flipped) for img_flipped in imgs_flipped + ] + + if self.with_neck: + features_flipped = self.neck( + features_flipped, frame_weight=frame_weight) + + if self.with_keypoint: + output_flipped_heatmap = self.keypoint_head.inference_model( + features_flipped, img_metas[0]['flip_pairs']) + output_heatmap = (output_heatmap + + output_flipped_heatmap) * 0.5 + + if self.with_keypoint: + keypoint_result = self.keypoint_head.decode( + img_metas, output_heatmap, img_size=[img_width, img_height]) + result.update(keypoint_result) + + if not return_heatmap: + output_heatmap = None + + result['output_heatmap'] = output_heatmap + + return result + + def forward_dummy(self, img): + """Used for computing network FLOPs. + + See ``tools/get_flops.py``. + + Args: + img (torch.Tensor[N,C,imgH,imgW], or list|tuple of tensors): + multiple input frames, N >= 2. + + Returns: + Tensor: Output heatmaps. 
+ """ + # concat tensors if they are in a list + if isinstance(img, (list, tuple)): + img = torch.cat(img, 0) + + batch_size = img.size(0) + assert batch_size > 1, 'Input batch size to PoseWarper ' \ + 'should be larger than 1.' + if batch_size == 2: + warnings.warn('Current batch size: 2, for pytorch2onnx and ' + 'getting flops both.') + else: + warnings.warn( + f'Current batch size: {batch_size}, for getting flops only.') + + frame_weight = np.random.uniform(0, 1, batch_size) + output = [self.backbone(img)] + + if self.with_neck: + output = self.neck(output, frame_weight=frame_weight) + if self.with_keypoint: + output = self.keypoint_head(output) + return output diff --git a/mmpose/models/detectors/top_down.py b/mmpose/models/detectors/top_down.py new file mode 100644 index 0000000..af0ab51 --- /dev/null +++ b/mmpose/models/detectors/top_down.py @@ -0,0 +1,307 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import mmcv +import numpy as np +from mmcv.image import imwrite +from mmcv.utils.misc import deprecated_api_warning +from mmcv.visualization.image import imshow + +from mmpose.core import imshow_bboxes, imshow_keypoints +from .. import builder +from ..builder import POSENETS +from .base import BasePose + +try: + from mmcv.runner import auto_fp16 +except ImportError: + warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0' + 'Please install mmcv>=1.1.4') + from mmpose.core import auto_fp16 + + +@POSENETS.register_module() +class TopDown(BasePose): + """Top-down pose detectors. + + Args: + backbone (dict): Backbone modules to extract feature. + keypoint_head (dict): Keypoint head to process feature. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + pretrained (str): Path to the pretrained models. + loss_pose (None): Deprecated arguments. Please use + `loss_keypoint` for heads instead. + """ + + def __init__(self, + backbone, + neck=None, + keypoint_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + loss_pose=None): + super().__init__() + self.fp16_enabled = False + + self.backbone = builder.build_backbone(backbone) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if neck is not None: + self.neck = builder.build_neck(neck) + + if keypoint_head is not None: + keypoint_head['train_cfg'] = train_cfg + keypoint_head['test_cfg'] = test_cfg + + if 'loss_keypoint' not in keypoint_head and loss_pose is not None: + warnings.warn( + '`loss_pose` for TopDown is deprecated, ' + 'use `loss_keypoint` for heads instead. See ' + 'https://github.com/open-mmlab/mmpose/pull/382' + ' for more information.', DeprecationWarning) + keypoint_head['loss_keypoint'] = loss_pose + + self.keypoint_head = builder.build_head(keypoint_head) + + self.init_weights(pretrained=pretrained) + + @property + def with_neck(self): + """Check if has neck.""" + return hasattr(self, 'neck') + + @property + def with_keypoint(self): + """Check if has keypoint_head.""" + return hasattr(self, 'keypoint_head') + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + self.backbone.init_weights(pretrained) + if self.with_neck: + self.neck.init_weights() + if self.with_keypoint: + self.keypoint_head.init_weights() + + @auto_fp16(apply_to=('img', )) + def forward(self, + img, + target=None, + target_weight=None, + img_metas=None, + return_loss=True, + return_heatmap=False, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. 
Note this setting will change the expected inputs. + When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + + Note: + - batch_size: N + - num_keypoints: K + - num_img_channel: C (Default: 3) + - img height: imgH + - img width: imgW + - heatmaps height: H + - heatmaps weight: W + + Args: + img (torch.Tensor[NxCximgHximgW]): Input images. + target (torch.Tensor[NxKxHxW]): Target heatmaps. + target_weight (torch.Tensor[NxKx1]): Weights across + different joint types. + img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + return_heatmap (bool) : Option to return heatmap. + + Returns: + dict|tuple: if `return loss` is true, then return losses. \ + Otherwise, return predicted poses, boxes, image paths \ + and heatmaps. + """ + if return_loss: + return self.forward_train(img, target, target_weight, img_metas, + **kwargs) + return self.forward_test( + img, img_metas, return_heatmap=return_heatmap, **kwargs) + + def forward_train(self, img, target, target_weight, img_metas, **kwargs): + """Defines the computation performed at every call when training.""" + output = self.backbone(img) + if self.with_neck: + output = self.neck(output) + if self.with_keypoint: + output = self.keypoint_head(output) + + # if return loss + losses = dict() + if self.with_keypoint: + keypoint_losses = self.keypoint_head.get_loss( + output, target, target_weight) + losses.update(keypoint_losses) + keypoint_accuracy = self.keypoint_head.get_accuracy( + output, target, target_weight) + losses.update(keypoint_accuracy) + + return losses + + def forward_test(self, img, img_metas, return_heatmap=False, **kwargs): + """Defines the computation performed at every call when testing.""" + assert img.size(0) == len(img_metas) + batch_size, _, img_height, img_width = img.shape + if batch_size > 1: + assert 'bbox_id' in img_metas[0] + + result = {} + + features = self.backbone(img) + if self.with_neck: + features = self.neck(features) + if self.with_keypoint: + output_heatmap = self.keypoint_head.inference_model( + features, flip_pairs=None) + + if self.test_cfg.get('flip_test', True): + img_flipped = img.flip(3) + features_flipped = self.backbone(img_flipped) + if self.with_neck: + features_flipped = self.neck(features_flipped) + if self.with_keypoint: + output_flipped_heatmap = self.keypoint_head.inference_model( + features_flipped, img_metas[0]['flip_pairs']) + output_heatmap = (output_heatmap + + output_flipped_heatmap) * 0.5 + + if self.with_keypoint: + keypoint_result = self.keypoint_head.decode( + img_metas, output_heatmap, img_size=[img_width, img_height]) + result.update(keypoint_result) + + if not return_heatmap: + output_heatmap = None + + result['output_heatmap'] = output_heatmap + + return result + + def forward_dummy(self, img): + """Used for computing network FLOPs. + + See ``tools/get_flops.py``. + + Args: + img (torch.Tensor): Input image. + + Returns: + Tensor: Output heatmaps. 
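+
+        A minimal FLOPs-style sketch, assuming a plain ResNet-50 top-down
+        configuration; the config values below are illustrative and not
+        taken from this repository, and no pretrained weights are loaded.
+
+        Example::
+
+            import torch
+            from mmpose.models import build_posenet
+            model = build_posenet(dict(
+                type='TopDown',
+                backbone=dict(type='ResNet', depth=50),
+                keypoint_head=dict(
+                    type='TopdownHeatmapSimpleHead',
+                    in_channels=2048,
+                    out_channels=17,
+                    loss_keypoint=dict(
+                        type='JointsMSELoss', use_target_weight=True)),
+                test_cfg=dict(flip_test=True)))
+            heatmaps = model.forward_dummy(torch.randn(1, 3, 256, 192))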
+ """ + output = self.backbone(img) + if self.with_neck: + output = self.neck(output) + if self.with_keypoint: + output = self.keypoint_head(output) + return output + + @deprecated_api_warning({'pose_limb_color': 'pose_link_color'}, + cls_name='TopDown') + def show_result(self, + img, + result, + skeleton=None, + kpt_score_thr=0.3, + bbox_color='green', + pose_kpt_color=None, + pose_link_color=None, + text_color='white', + radius=4, + thickness=1, + font_scale=0.5, + bbox_thickness=1, + win_name='', + show=False, + show_keypoint_weight=False, + wait_time=0, + out_file=None): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (list[dict]): The results to draw over `img` + (bbox_result, pose_result). + skeleton (list[list]): The connection of keypoints. + skeleton is 0-based indexing. + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. + If None, do not draw keypoints. + pose_link_color (np.array[Mx3]): Color of M links. + If None, do not draw links. + text_color (str or tuple or :obj:`Color`): Color of texts. + radius (int): Radius of circles. + thickness (int): Thickness of lines. + font_scale (float): Font scales of texts. + win_name (str): The window name. + show (bool): Whether to show the image. Default: False. + show_keypoint_weight (bool): Whether to change the transparency + using the predicted confidence scores of keypoints. + wait_time (int): Value of waitKey param. + Default: 0. + out_file (str or None): The filename to write the image. + Default: None. + + Returns: + Tensor: Visualized img, only if not `show` or `out_file`. + """ + img = mmcv.imread(img) + img = img.copy() + + bbox_result = [] + bbox_labels = [] + pose_result = [] + for res in result: + if 'bbox' in res: + bbox_result.append(res['bbox']) + bbox_labels.append(res.get('label', None)) + pose_result.append(res['keypoints']) + + if bbox_result: + bboxes = np.vstack(bbox_result) + # draw bounding boxes + imshow_bboxes( + img, + bboxes, + labels=bbox_labels, + colors=bbox_color, + text_color=text_color, + thickness=bbox_thickness, + font_scale=font_scale, + show=False) + + if pose_result: + imshow_keypoints(img, pose_result, skeleton, kpt_score_thr, + pose_kpt_color, pose_link_color, radius, + thickness) + + if show: + imshow(img, win_name, wait_time) + + if out_file is not None: + imwrite(img, out_file) + + return img diff --git a/mmpose/models/detectors/top_down_moe.py b/mmpose/models/detectors/top_down_moe.py new file mode 100644 index 0000000..7d499b7 --- /dev/null +++ b/mmpose/models/detectors/top_down_moe.py @@ -0,0 +1,351 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch +import torch.nn as nn + +import mmcv +import numpy as np +from mmcv.image import imwrite +from mmcv.utils.misc import deprecated_api_warning +from mmcv.visualization.image import imshow + +from mmpose.core import imshow_bboxes, imshow_keypoints +from .. import builder +from ..builder import POSENETS +from .base import BasePose + +try: + from mmcv.runner import auto_fp16 +except ImportError: + warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0' + 'Please install mmcv>=1.1.4') + from mmpose.core import auto_fp16 + + +@POSENETS.register_module() +class TopDownMoE(BasePose): + """Top-down pose detectors. + + Args: + backbone (dict): Backbone modules to extract feature. 
+ keypoint_head (dict): Keypoint head to process feature. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + pretrained (str): Path to the pretrained models. + loss_pose (None): Deprecated arguments. Please use + `loss_keypoint` for heads instead. + """ + + def __init__(self, + backbone, + neck=None, + keypoint_head=None, + associate_keypoint_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + loss_pose=None): + super().__init__() + self.fp16_enabled = False + + self.backbone = builder.build_backbone(backbone) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if neck is not None: + self.neck = builder.build_neck(neck) + + if keypoint_head is not None: + keypoint_head['train_cfg'] = train_cfg + keypoint_head['test_cfg'] = test_cfg + + if 'loss_keypoint' not in keypoint_head and loss_pose is not None: + warnings.warn( + '`loss_pose` for TopDown is deprecated, ' + 'use `loss_keypoint` for heads instead. See ' + 'https://github.com/open-mmlab/mmpose/pull/382' + ' for more information.', DeprecationWarning) + keypoint_head['loss_keypoint'] = loss_pose + + self.keypoint_head = builder.build_head(keypoint_head) + + + associate_keypoint_heads = [] + keypoint_heads_cnt = 1 + + if associate_keypoint_head is not None: + if not isinstance(associate_keypoint_head, list): + associate_keypoint_head = [associate_keypoint_head] + for single_keypoint_head in associate_keypoint_head: + single_keypoint_head['train_cfg'] = train_cfg + single_keypoint_head['test_cfg'] = test_cfg + associate_keypoint_heads.append(builder.build_head(single_keypoint_head)) + keypoint_heads_cnt += 1 + + self.associate_keypoint_heads = nn.ModuleList(associate_keypoint_heads) + + self.keypoint_heads_cnt = keypoint_heads_cnt + + self.init_weights(pretrained=pretrained) + + @property + def with_neck(self): + """Check if has neck.""" + return hasattr(self, 'neck') + + @property + def with_keypoint(self): + """Check if has keypoint_head.""" + return hasattr(self, 'keypoint_head') + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + self.backbone.init_weights(pretrained) + if self.with_neck: + self.neck.init_weights() + if self.with_keypoint: + self.keypoint_head.init_weights() + for item in self.associate_keypoint_heads: + item.init_weights() + + @auto_fp16(apply_to=('img', )) + def forward(self, + img, + target=None, + target_weight=None, + img_metas=None, + return_loss=True, + return_heatmap=False, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. Note this setting will change the expected inputs. + When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + + Note: + - batch_size: N + - num_keypoints: K + - num_img_channel: C (Default: 3) + - img height: imgH + - img width: imgW + - heatmaps height: H + - heatmaps weight: W + + Args: + img (torch.Tensor[NxCximgHximgW]): Input images. + target (torch.Tensor[NxKxHxW]): Target heatmaps. + target_weight (torch.Tensor[NxKx1]): Weights across + different joint types. 
+ img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + return_heatmap (bool) : Option to return heatmap. + + Returns: + dict|tuple: if `return loss` is true, then return losses. \ + Otherwise, return predicted poses, boxes, image paths \ + and heatmaps. + """ + if return_loss: + return self.forward_train(img, target, target_weight, img_metas, + **kwargs) + return self.forward_test( + img, img_metas, return_heatmap=return_heatmap, **kwargs) + + def forward_train(self, img, target, target_weight, img_metas, **kwargs): + """Defines the computation performed at every call when training.""" + + img_sources = torch.from_numpy(np.array([ele['dataset_idx'] for ele in img_metas])).to(img.device) + + output = self.backbone(img, img_sources) + if self.with_neck: + output = self.neck(output) + # if return loss + losses = dict() + + main_stream_select = (img_sources == 0) + # if torch.sum(main_stream_select) > 0: + output_select = self.keypoint_head(output) + + target_select = target * main_stream_select.view(-1, 1, 1, 1) + target_weight_select = target_weight * main_stream_select.view(-1, 1, 1) + + keypoint_losses = self.keypoint_head.get_loss( + output_select, target_select, target_weight_select) + losses['main_stream_loss'] = keypoint_losses['heatmap_loss'] + keypoint_accuracy = self.keypoint_head.get_accuracy( + output_select, target_select, target_weight_select) + losses['main_stream_acc'] = keypoint_accuracy['acc_pose'] + + for idx in range(1, self.keypoint_heads_cnt): + idx_select = (img_sources == idx) + target_select = target * idx_select.view(-1, 1, 1, 1) + target_weight_select = target_weight * idx_select.view(-1, 1, 1) + output_select = self.associate_keypoint_heads[idx - 1](output) + keypoint_losses = self.associate_keypoint_heads[idx - 1].get_loss( + output_select, target_select, target_weight_select) + losses[f'{idx}_loss'] = keypoint_losses['heatmap_loss'] + keypoint_accuracy = self.associate_keypoint_heads[idx - 1].get_accuracy( + output_select, target_select, target_weight_select) + losses[f'{idx}_acc'] = keypoint_accuracy['acc_pose'] + + return losses + + def forward_test(self, img, img_metas, return_heatmap=False, **kwargs): + """Defines the computation performed at every call when testing.""" + assert img.size(0) == len(img_metas) + batch_size, _, img_height, img_width = img.shape + if batch_size > 1: + assert 'bbox_id' in img_metas[0] + + result = {} + img_sources = torch.from_numpy(np.array([ele['dataset_idx'] for ele in img_metas])).to(img.device) + + features = self.backbone(img, img_sources) + + if self.with_neck: + features = self.neck(features) + if self.with_keypoint: + output_heatmap = self.keypoint_head.inference_model( + features, flip_pairs=None) + + if self.test_cfg.get('flip_test', True): + img_flipped = img.flip(3) + features_flipped = self.backbone(img_flipped, img_sources) + if self.with_neck: + features_flipped = self.neck(features_flipped) + if self.with_keypoint: + output_flipped_heatmap = self.keypoint_head.inference_model( + features_flipped, img_metas[0]['flip_pairs']) + output_heatmap = (output_heatmap + + output_flipped_heatmap) * 0.5 + + if self.with_keypoint: + keypoint_result = self.keypoint_head.decode( + img_metas, 
output_heatmap, img_size=[img_width, img_height]) + result.update(keypoint_result) + + if not return_heatmap: + output_heatmap = None + + result['output_heatmap'] = output_heatmap + + return result + + def forward_dummy(self, img): + """Used for computing network FLOPs. + + See ``tools/get_flops.py``. + + Args: + img (torch.Tensor): Input image. + + Returns: + Tensor: Output heatmaps. + """ + output = self.backbone(img) + if self.with_neck: + output = self.neck(output) + if self.with_keypoint: + output = self.keypoint_head(output) + return output + + @deprecated_api_warning({'pose_limb_color': 'pose_link_color'}, + cls_name='TopDown') + def show_result(self, + img, + result, + skeleton=None, + kpt_score_thr=0.3, + bbox_color='green', + pose_kpt_color=None, + pose_link_color=None, + text_color='white', + radius=4, + thickness=1, + font_scale=0.5, + bbox_thickness=1, + win_name='', + show=False, + show_keypoint_weight=False, + wait_time=0, + out_file=None): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (list[dict]): The results to draw over `img` + (bbox_result, pose_result). + skeleton (list[list]): The connection of keypoints. + skeleton is 0-based indexing. + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. + If None, do not draw keypoints. + pose_link_color (np.array[Mx3]): Color of M links. + If None, do not draw links. + text_color (str or tuple or :obj:`Color`): Color of texts. + radius (int): Radius of circles. + thickness (int): Thickness of lines. + font_scale (float): Font scales of texts. + win_name (str): The window name. + show (bool): Whether to show the image. Default: False. + show_keypoint_weight (bool): Whether to change the transparency + using the predicted confidence scores of keypoints. + wait_time (int): Value of waitKey param. + Default: 0. + out_file (str or None): The filename to write the image. + Default: None. + + Returns: + Tensor: Visualized img, only if not `show` or `out_file`. + """ + img = mmcv.imread(img) + img = img.copy() + + bbox_result = [] + bbox_labels = [] + pose_result = [] + for res in result: + if 'bbox' in res: + bbox_result.append(res['bbox']) + bbox_labels.append(res.get('label', None)) + pose_result.append(res['keypoints']) + + if bbox_result: + bboxes = np.vstack(bbox_result) + # draw bounding boxes + imshow_bboxes( + img, + bboxes, + labels=bbox_labels, + colors=bbox_color, + text_color=text_color, + thickness=bbox_thickness, + font_scale=font_scale, + show=False) + + if pose_result: + imshow_keypoints(img, pose_result, skeleton, kpt_score_thr, + pose_kpt_color, pose_link_color, radius, + thickness) + + if show: + imshow(img, win_name, wait_time) + + if out_file is not None: + imwrite(img, out_file) + + return img diff --git a/mmpose/models/heads/__init__.py b/mmpose/models/heads/__init__.py new file mode 100644 index 0000000..a98e911 --- /dev/null +++ b/mmpose/models/heads/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
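+# The heads below are registered in the ``HEADS`` registry and are normally
+# built from config dicts rather than constructed directly. A small
+# illustrative sketch (the channel numbers and the loss settings are
+# placeholders, not values used by this repository):
+#
+#     from mmpose.models import build_head
+#     head = build_head(dict(
+#         type='TopdownHeatmapSimpleHead',
+#         in_channels=32,
+#         out_channels=17,
+#         loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)))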
+from .ae_higher_resolution_head import AEHigherResolutionHead +from .ae_multi_stage_head import AEMultiStageHead +from .ae_simple_head import AESimpleHead +from .deconv_head import DeconvHead +from .deeppose_regression_head import DeepposeRegressionHead +from .hmr_head import HMRMeshHead +from .interhand_3d_head import Interhand3DHead +from .temporal_regression_head import TemporalRegressionHead +from .topdown_heatmap_base_head import TopdownHeatmapBaseHead +from .topdown_heatmap_multi_stage_head import (TopdownHeatmapMSMUHead, + TopdownHeatmapMultiStageHead) +from .topdown_heatmap_simple_head import TopdownHeatmapSimpleHead +from .vipnas_heatmap_simple_head import ViPNASHeatmapSimpleHead +from .voxelpose_head import CuboidCenterHead, CuboidPoseHead + +__all__ = [ + 'TopdownHeatmapSimpleHead', 'TopdownHeatmapMultiStageHead', + 'TopdownHeatmapMSMUHead', 'TopdownHeatmapBaseHead', + 'AEHigherResolutionHead', 'AESimpleHead', 'AEMultiStageHead', + 'DeepposeRegressionHead', 'TemporalRegressionHead', 'Interhand3DHead', + 'HMRMeshHead', 'DeconvHead', 'ViPNASHeatmapSimpleHead', 'CuboidCenterHead', + 'CuboidPoseHead' +] diff --git a/mmpose/models/heads/ae_higher_resolution_head.py b/mmpose/models/heads/ae_higher_resolution_head.py new file mode 100644 index 0000000..9bf3399 --- /dev/null +++ b/mmpose/models/heads/ae_higher_resolution_head.py @@ -0,0 +1,249 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_upsample_layer, constant_init, + normal_init) + +from mmpose.models.builder import build_loss +from ..backbones.resnet import BasicBlock +from ..builder import HEADS + + +@HEADS.register_module() +class AEHigherResolutionHead(nn.Module): + """Associative embedding with higher resolution head. paper ref: Bowen + Cheng et al. "HigherHRNet: Scale-Aware Representation Learning for Bottom- + Up Human Pose Estimation". + + Args: + in_channels (int): Number of input channels. + num_joints (int): Number of joints + tag_per_joint (bool): If tag_per_joint is True, + the dimension of tags equals to num_joints, + else the dimension of tags is 1. Default: True + extra (dict): Configs for extra conv layers. Default: None + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + If num_deconv_layers > 0, the length of + num_deconv_kernels (list|tuple): Kernel sizes. + cat_output (list[bool]): Option to concat outputs. + with_ae_loss (list[bool]): Option to use ae loss. + loss_keypoint (dict): Config for loss. Default: None. 
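+
+    The construction below is an illustrative sketch loosely following the
+    usual HigherHRNet-W32 settings; the channel numbers and the
+    ``MultiLossFactory`` hyper-parameters are placeholders.
+
+    Example::
+
+        import torch
+        from mmpose.models import AEHigherResolutionHead
+        head = AEHigherResolutionHead(
+            in_channels=32,
+            num_joints=17,
+            num_deconv_layers=1,
+            num_deconv_filters=(32, ),
+            num_deconv_kernels=(4, ),
+            cat_output=[True],
+            with_ae_loss=[True, False],
+            loss_keypoint=dict(
+                type='MultiLossFactory',
+                num_joints=17,
+                num_stages=2,
+                ae_loss_type='exp',
+                with_ae_loss=[True, False],
+                push_loss_factor=[0.001, 0.001],
+                pull_loss_factor=[0.001, 0.001],
+                with_heatmaps_loss=[True, True],
+                heatmaps_loss_factor=[1.0, 1.0]))
+        outs = head(torch.randn(1, 32, 128, 128))
+        # outs[0]: 17 heatmaps + 17 tag maps at the input resolution,
+        # outs[1]: 17 heatmaps at twice the input resolution
+        assert outs[0].shape == (1, 34, 128, 128)
+        assert outs[1].shape == (1, 17, 256, 256)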
+ """ + + def __init__(self, + in_channels, + num_joints, + tag_per_joint=True, + extra=None, + num_deconv_layers=1, + num_deconv_filters=(32, ), + num_deconv_kernels=(4, ), + num_basic_blocks=4, + cat_output=None, + with_ae_loss=None, + loss_keypoint=None): + super().__init__() + + self.loss = build_loss(loss_keypoint) + dim_tag = num_joints if tag_per_joint else 1 + + self.num_deconvs = num_deconv_layers + self.cat_output = cat_output + + final_layer_output_channels = [] + + if with_ae_loss[0]: + out_channels = num_joints + dim_tag + else: + out_channels = num_joints + + final_layer_output_channels.append(out_channels) + for i in range(num_deconv_layers): + if with_ae_loss[i + 1]: + out_channels = num_joints + dim_tag + else: + out_channels = num_joints + final_layer_output_channels.append(out_channels) + + deconv_layer_output_channels = [] + for i in range(num_deconv_layers): + if with_ae_loss[i]: + out_channels = num_joints + dim_tag + else: + out_channels = num_joints + deconv_layer_output_channels.append(out_channels) + + self.final_layers = self._make_final_layers( + in_channels, final_layer_output_channels, extra, num_deconv_layers, + num_deconv_filters) + self.deconv_layers = self._make_deconv_layers( + in_channels, deconv_layer_output_channels, num_deconv_layers, + num_deconv_filters, num_deconv_kernels, num_basic_blocks, + cat_output) + + @staticmethod + def _make_final_layers(in_channels, final_layer_output_channels, extra, + num_deconv_layers, num_deconv_filters): + """Make final layers.""" + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + else: + padding = 0 + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + final_layers = [] + final_layers.append( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=in_channels, + out_channels=final_layer_output_channels[0], + kernel_size=kernel_size, + stride=1, + padding=padding)) + + for i in range(num_deconv_layers): + in_channels = num_deconv_filters[i] + final_layers.append( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=in_channels, + out_channels=final_layer_output_channels[i + 1], + kernel_size=kernel_size, + stride=1, + padding=padding)) + + return nn.ModuleList(final_layers) + + def _make_deconv_layers(self, in_channels, deconv_layer_output_channels, + num_deconv_layers, num_deconv_filters, + num_deconv_kernels, num_basic_blocks, cat_output): + """Make deconv layers.""" + deconv_layers = [] + for i in range(num_deconv_layers): + if cat_output[i]: + in_channels += deconv_layer_output_channels[i] + + planes = num_deconv_filters[i] + deconv_kernel, padding, output_padding = \ + self._get_deconv_cfg(num_deconv_kernels[i]) + + layers = [] + layers.append( + nn.Sequential( + build_upsample_layer( + dict(type='deconv'), + in_channels=in_channels, + out_channels=planes, + kernel_size=deconv_kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False), nn.BatchNorm2d(planes, momentum=0.1), + nn.ReLU(inplace=True))) + for _ in range(num_basic_blocks): + layers.append(nn.Sequential(BasicBlock(planes, planes), )) + deconv_layers.append(nn.Sequential(*layers)) + in_channels = planes + + return nn.ModuleList(deconv_layers) + + @staticmethod + def _get_deconv_cfg(deconv_kernel): + """Get configurations for deconv layers.""" + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif 
deconv_kernel == 2: + padding = 0 + output_padding = 0 + else: + raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') + + return deconv_kernel, padding, output_padding + + def get_loss(self, outputs, targets, masks, joints): + """Calculate bottom-up keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - num_outputs: O + - heatmaps height: H + - heatmaps weight: W + + Args: + outputs (list(torch.Tensor[N,K,H,W])): Multi-scale output heatmaps. + targets (List(torch.Tensor[N,K,H,W])): Multi-scale target heatmaps. + masks (List(torch.Tensor[N,H,W])): Masks of multi-scale target + heatmaps + joints (List(torch.Tensor[N,M,K,2])): Joints of multi-scale target + heatmaps for ae loss + """ + + losses = dict() + + heatmaps_losses, push_losses, pull_losses = self.loss( + outputs, targets, masks, joints) + + for idx in range(len(targets)): + if heatmaps_losses[idx] is not None: + heatmaps_loss = heatmaps_losses[idx].mean(dim=0) + if 'heatmap_loss' not in losses: + losses['heatmap_loss'] = heatmaps_loss + else: + losses['heatmap_loss'] += heatmaps_loss + if push_losses[idx] is not None: + push_loss = push_losses[idx].mean(dim=0) + if 'push_loss' not in losses: + losses['push_loss'] = push_loss + else: + losses['push_loss'] += push_loss + if pull_losses[idx] is not None: + pull_loss = pull_losses[idx].mean(dim=0) + if 'pull_loss' not in losses: + losses['pull_loss'] = pull_loss + else: + losses['pull_loss'] += pull_loss + + return losses + + def forward(self, x): + """Forward function.""" + if isinstance(x, list): + x = x[0] + + final_outputs = [] + y = self.final_layers[0](x) + final_outputs.append(y) + + for i in range(self.num_deconvs): + if self.cat_output[i]: + x = torch.cat((x, y), 1) + + x = self.deconv_layers[i](x) + y = self.final_layers[i + 1](x) + final_outputs.append(y) + + return final_outputs + + def init_weights(self): + """Initialize model weights.""" + for _, m in self.deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for _, m in self.final_layers.named_modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) diff --git a/mmpose/models/heads/ae_multi_stage_head.py b/mmpose/models/heads/ae_multi_stage_head.py new file mode 100644 index 0000000..195666b --- /dev/null +++ b/mmpose/models/heads/ae_multi_stage_head.py @@ -0,0 +1,222 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_upsample_layer, constant_init, + normal_init) + +from mmpose.models.builder import build_loss +from ..builder import HEADS + + +@HEADS.register_module() +class AEMultiStageHead(nn.Module): + """Associative embedding multi-stage head. + paper ref: Alejandro Newell et al. "Associative + Embedding: End-to-end Learning for Joint Detection + and Grouping" + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + If num_deconv_layers > 0, the length of + num_deconv_kernels (list|tuple): Kernel sizes. + loss_keypoint (dict): Config for loss. Default: None. 
+ """ + + def __init__(self, + in_channels, + out_channels, + num_stages=1, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + extra=None, + loss_keypoint=None): + super().__init__() + + self.loss = build_loss(loss_keypoint) + + self.in_channels = in_channels + self.num_stages = num_stages + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + # build multi-stage deconv layers + self.multi_deconv_layers = nn.ModuleList([]) + for _ in range(self.num_stages): + if num_deconv_layers > 0: + deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + elif num_deconv_layers == 0: + deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should >= 0.') + self.multi_deconv_layers.append(deconv_layers) + + identity_final_layer = False + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [0, 1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + elif extra['final_conv_kernel'] == 1: + padding = 0 + else: + # 0 for Identity mapping. + identity_final_layer = True + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + # build multi-stage final layers + self.multi_final_layers = nn.ModuleList([]) + for i in range(self.num_stages): + if identity_final_layer: + final_layer = nn.Identity() + else: + final_layer = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=num_deconv_filters[-1] + if num_deconv_layers > 0 else in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding) + self.multi_final_layers.append(final_layer) + + def get_loss(self, output, targets, masks, joints): + """Calculate bottom-up keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (List(torch.Tensor[NxKxHxW])): Output heatmaps. + targets(List(List(torch.Tensor[NxKxHxW]))): + Multi-stage and multi-scale target heatmaps. + masks(List(List(torch.Tensor[NxHxW]))): + Masks of multi-stage and multi-scale target heatmaps + joints(List(List(torch.Tensor[NxMxKx2]))): + Joints of multi-stage multi-scale target heatmaps for ae loss + """ + + losses = dict() + + # Flatten list: + # [stage_1_scale_1, stage_1_scale_2, ... , stage_1_scale_m, + # ... + # stage_n_scale_1, stage_n_scale_2, ... , stage_n_scale_m] + targets = [target for _targets in targets for target in _targets] + masks = [mask for _masks in masks for mask in _masks] + joints = [joint for _joints in joints for joint in _joints] + + heatmaps_losses, push_losses, pull_losses = self.loss( + output, targets, masks, joints) + + for idx in range(len(targets)): + if heatmaps_losses[idx] is not None: + heatmaps_loss = heatmaps_losses[idx].mean(dim=0) + if 'heatmap_loss' not in losses: + losses['heatmap_loss'] = heatmaps_loss + else: + losses['heatmap_loss'] += heatmaps_loss + if push_losses[idx] is not None: + push_loss = push_losses[idx].mean(dim=0) + if 'push_loss' not in losses: + losses['push_loss'] = push_loss + else: + losses['push_loss'] += push_loss + if pull_losses[idx] is not None: + pull_loss = pull_losses[idx].mean(dim=0) + if 'pull_loss' not in losses: + losses['pull_loss'] = pull_loss + else: + losses['pull_loss'] += pull_loss + + return losses + + def forward(self, x): + """Forward function. + + Returns: + out (list[Tensor]): a list of heatmaps from multiple stages. 
+ """ + out = [] + assert isinstance(x, list) + for i in range(self.num_stages): + y = self.multi_deconv_layers[i](x[i]) + y = self.multi_final_layers[i](y) + out.append(y) + return out + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + """Make deconv layers.""" + if num_layers != len(num_filters): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_filters({len(num_filters)})' + raise ValueError(error_msg) + if num_layers != len(num_kernels): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_kernels({len(num_kernels)})' + raise ValueError(error_msg) + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=self.in_channels, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False)) + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + self.in_channels = planes + + return nn.Sequential(*layers) + + @staticmethod + def _get_deconv_cfg(deconv_kernel): + """Get configurations for deconv layers.""" + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + else: + raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') + + return deconv_kernel, padding, output_padding + + def init_weights(self): + """Initialize model weights.""" + for _, m in self.multi_deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for m in self.multi_final_layers.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) diff --git a/mmpose/models/heads/ae_simple_head.py b/mmpose/models/heads/ae_simple_head.py new file mode 100644 index 0000000..9297f71 --- /dev/null +++ b/mmpose/models/heads/ae_simple_head.py @@ -0,0 +1,99 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ..builder import HEADS +from .deconv_head import DeconvHead + + +@HEADS.register_module() +class AESimpleHead(DeconvHead): + """Associative embedding simple head. + paper ref: Alejandro Newell et al. "Associative + Embedding: End-to-end Learning for Joint Detection + and Grouping" + + Args: + in_channels (int): Number of input channels. + num_joints (int): Number of joints. + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + If num_deconv_layers > 0, the length of + num_deconv_kernels (list|tuple): Kernel sizes. + tag_per_joint (bool): If tag_per_joint is True, + the dimension of tags equals to num_joints, + else the dimension of tags is 1. Default: True + with_ae_loss (list[bool]): Option to use ae loss or not. + loss_keypoint (dict): Config for loss. Default: None. 
+ """ + + def __init__(self, + in_channels, + num_joints, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + tag_per_joint=True, + with_ae_loss=None, + extra=None, + loss_keypoint=None): + + dim_tag = num_joints if tag_per_joint else 1 + if with_ae_loss[0]: + out_channels = num_joints + dim_tag + else: + out_channels = num_joints + + super().__init__( + in_channels, + out_channels, + num_deconv_layers=num_deconv_layers, + num_deconv_filters=num_deconv_filters, + num_deconv_kernels=num_deconv_kernels, + extra=extra, + loss_keypoint=loss_keypoint) + + def get_loss(self, outputs, targets, masks, joints): + """Calculate bottom-up keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - num_outputs: O + - heatmaps height: H + - heatmaps weight: W + + Args: + outputs (list(torch.Tensor[N,K,H,W])): Multi-scale output heatmaps. + targets (List(torch.Tensor[N,K,H,W])): Multi-scale target heatmaps. + masks (List(torch.Tensor[N,H,W])): Masks of multi-scale target + heatmaps + joints(List(torch.Tensor[N,M,K,2])): Joints of multi-scale target + heatmaps for ae loss + """ + + losses = dict() + + heatmaps_losses, push_losses, pull_losses = self.loss( + outputs, targets, masks, joints) + + for idx in range(len(targets)): + if heatmaps_losses[idx] is not None: + heatmaps_loss = heatmaps_losses[idx].mean(dim=0) + if 'heatmap_loss' not in losses: + losses['heatmap_loss'] = heatmaps_loss + else: + losses['heatmap_loss'] += heatmaps_loss + if push_losses[idx] is not None: + push_loss = push_losses[idx].mean(dim=0) + if 'push_loss' not in losses: + losses['push_loss'] = push_loss + else: + losses['push_loss'] += push_loss + if pull_losses[idx] is not None: + pull_loss = pull_losses[idx].mean(dim=0) + if 'pull_loss' not in losses: + losses['pull_loss'] = pull_loss + else: + losses['pull_loss'] += pull_loss + + return losses diff --git a/mmpose/models/heads/deconv_head.py b/mmpose/models/heads/deconv_head.py new file mode 100644 index 0000000..90846d2 --- /dev/null +++ b/mmpose/models/heads/deconv_head.py @@ -0,0 +1,295 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer, + constant_init, normal_init) + +from mmpose.models.builder import HEADS, build_loss +from mmpose.models.utils.ops import resize + + +@HEADS.register_module() +class DeconvHead(nn.Module): + """Simple deconv head. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + If num_deconv_layers > 0, the length of + num_deconv_kernels (list|tuple): Kernel sizes. + in_index (int|Sequence[int]): Input feature index. Default: 0 + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + Default: None. + + - 'resize_concat': Multiple feature maps will be resized to the + same size as the first one and then concat together. + Usually used in FCN head of HRNet. + - 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + - None: Only one select feature map is allowed. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + loss_keypoint (dict): Config for loss. Default: None. 
+ """ + + def __init__(self, + in_channels=3, + out_channels=17, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + extra=None, + in_index=0, + input_transform=None, + align_corners=False, + loss_keypoint=None): + super().__init__() + + self.in_channels = in_channels + self.loss = build_loss(loss_keypoint) + + self._init_inputs(in_channels, in_index, input_transform) + self.in_index = in_index + self.align_corners = align_corners + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + if num_deconv_layers > 0: + self.deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + elif num_deconv_layers == 0: + self.deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should >= 0.') + + identity_final_layer = False + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [0, 1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + elif extra['final_conv_kernel'] == 1: + padding = 0 + else: + # 0 for Identity mapping. + identity_final_layer = True + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + if identity_final_layer: + self.final_layer = nn.Identity() + else: + conv_channels = num_deconv_filters[ + -1] if num_deconv_layers > 0 else self.in_channels + + layers = [] + if extra is not None: + num_conv_layers = extra.get('num_conv_layers', 0) + num_conv_kernels = extra.get('num_conv_kernels', + [1] * num_conv_layers) + + for i in range(num_conv_layers): + layers.append( + build_conv_layer( + dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=conv_channels, + kernel_size=num_conv_kernels[i], + stride=1, + padding=(num_conv_kernels[i] - 1) // 2)) + layers.append( + build_norm_layer(dict(type='BN'), conv_channels)[1]) + layers.append(nn.ReLU(inplace=True)) + + layers.append( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding)) + + if len(layers) > 1: + self.final_layer = nn.Sequential(*layers) + else: + self.final_layer = layers[0] + + def _init_inputs(self, in_channels, in_index, input_transform): + """Check and initialize input transforms. + + The in_channels, in_index and input_transform must match. + Specifically, when input_transform is None, only single feature map + will be selected. So in_channels and in_index must be of type int. + When input_transform is not None, in_channels and in_index must be + list or tuple, with the same length. + + Args: + in_channels (int|Sequence[int]): Input channels. + in_index (int|Sequence[int]): Input feature index. + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + + - 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + - 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + - None: Only one select feature map is allowed. 
+ """ + + if input_transform is not None: + assert input_transform in ['resize_concat', 'multiple_select'] + self.input_transform = input_transform + self.in_index = in_index + if input_transform is not None: + assert isinstance(in_channels, (list, tuple)) + assert isinstance(in_index, (list, tuple)) + assert len(in_channels) == len(in_index) + if input_transform == 'resize_concat': + self.in_channels = sum(in_channels) + else: + self.in_channels = in_channels + else: + assert isinstance(in_channels, int) + assert isinstance(in_index, int) + self.in_channels = in_channels + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + + Args: + inputs (list[Tensor] | Tensor): multi-level img features. + + Returns: + Tensor: The transformed inputs + """ + if not isinstance(inputs, list): + return inputs + + if self.input_transform == 'resize_concat': + inputs = [inputs[i] for i in self.in_index] + upsampled_inputs = [ + resize( + input=x, + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) for x in inputs + ] + inputs = torch.cat(upsampled_inputs, dim=1) + elif self.input_transform == 'multiple_select': + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index] + + return inputs + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + """Make deconv layers.""" + if num_layers != len(num_filters): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_filters({len(num_filters)})' + raise ValueError(error_msg) + if num_layers != len(num_kernels): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_kernels({len(num_kernels)})' + raise ValueError(error_msg) + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=self.in_channels, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False)) + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + self.in_channels = planes + + return nn.Sequential(*layers) + + @staticmethod + def _get_deconv_cfg(deconv_kernel): + """Get configurations for deconv layers.""" + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + else: + raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') + + return deconv_kernel, padding, output_padding + + def get_loss(self, outputs, targets, masks): + """Calculate bottom-up masked mse loss. + + Note: + - batch_size: N + - num_channels: C + - heatmaps height: H + - heatmaps weight: W + + Args: + outputs (List(torch.Tensor[N,C,H,W])): Multi-scale outputs. + targets (List(torch.Tensor[N,C,H,W])): Multi-scale targets. + masks (List(torch.Tensor[N,H,W])): Masks of multi-scale targets. 
+ """ + + losses = dict() + + for idx in range(len(targets)): + if 'loss' not in losses: + losses['loss'] = self.loss(outputs[idx], targets[idx], + masks[idx]) + else: + losses['loss'] += self.loss(outputs[idx], targets[idx], + masks[idx]) + + return losses + + def forward(self, x): + """Forward function.""" + x = self._transform_inputs(x) + final_outputs = [] + x = self.deconv_layers(x) + y = self.final_layer(x) + final_outputs.append(y) + return final_outputs + + def init_weights(self): + """Initialize model weights.""" + for _, m in self.deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for m in self.final_layer.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) diff --git a/mmpose/models/heads/deeppose_regression_head.py b/mmpose/models/heads/deeppose_regression_head.py new file mode 100644 index 0000000..f326e26 --- /dev/null +++ b/mmpose/models/heads/deeppose_regression_head.py @@ -0,0 +1,176 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch.nn as nn +from mmcv.cnn import normal_init + +from mmpose.core.evaluation import (keypoint_pck_accuracy, + keypoints_from_regression) +from mmpose.core.post_processing import fliplr_regression +from mmpose.models.builder import HEADS, build_loss + + +@HEADS.register_module() +class DeepposeRegressionHead(nn.Module): + """Deeppose regression head with fully connected layers. + + "DeepPose: Human Pose Estimation via Deep Neural Networks". + + Args: + in_channels (int): Number of input channels + num_joints (int): Number of joints + loss_keypoint (dict): Config for keypoint loss. Default: None. + """ + + def __init__(self, + in_channels, + num_joints, + loss_keypoint=None, + train_cfg=None, + test_cfg=None): + super().__init__() + + self.in_channels = in_channels + self.num_joints = num_joints + + self.loss = build_loss(loss_keypoint) + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + + self.fc = nn.Linear(self.in_channels, self.num_joints * 2) + + def forward(self, x): + """Forward function.""" + output = self.fc(x) + N, C = output.shape + return output.reshape([N, C // 2, 2]) + + def get_loss(self, output, target, target_weight): + """Calculate top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + output (torch.Tensor[N, K, 2]): Output keypoints. + target (torch.Tensor[N, K, 2]): Target keypoints. + target_weight (torch.Tensor[N, K, 2]): + Weights across different joint types. + """ + + losses = dict() + assert not isinstance(self.loss, nn.Sequential) + assert target.dim() == 3 and target_weight.dim() == 3 + losses['reg_loss'] = self.loss(output, target, target_weight) + + return losses + + def get_accuracy(self, output, target, target_weight): + """Calculate accuracy for top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + output (torch.Tensor[N, K, 2]): Output keypoints. + target (torch.Tensor[N, K, 2]): Target keypoints. + target_weight (torch.Tensor[N, K, 2]): + Weights across different joint types. 
+ """ + + accuracy = dict() + + N = output.shape[0] + + _, avg_acc, cnt = keypoint_pck_accuracy( + output.detach().cpu().numpy(), + target.detach().cpu().numpy(), + target_weight[:, :, 0].detach().cpu().numpy() > 0, + thr=0.05, + normalize=np.ones((N, 2), dtype=np.float32)) + accuracy['acc_pose'] = avg_acc + + return accuracy + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_regression (np.ndarray): Output regression. + + Args: + x (torch.Tensor[N, K, 2]): Input features. + flip_pairs (None | list[tuple()): + Pairs of keypoints which are mirrored. + """ + output = self.forward(x) + + if flip_pairs is not None: + output_regression = fliplr_regression( + output.detach().cpu().numpy(), flip_pairs) + else: + output_regression = output.detach().cpu().numpy() + return output_regression + + def decode(self, img_metas, output, **kwargs): + """Decode the keypoints from output regression. + + Args: + img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + output (np.ndarray[N, K, 2]): predicted regression vector. + kwargs: dict contains 'img_size'. + img_size (tuple(img_width, img_height)): input image size. + """ + batch_size = len(img_metas) + + if 'bbox_id' in img_metas[0]: + bbox_ids = [] + else: + bbox_ids = None + + c = np.zeros((batch_size, 2), dtype=np.float32) + s = np.zeros((batch_size, 2), dtype=np.float32) + image_paths = [] + score = np.ones(batch_size) + for i in range(batch_size): + c[i, :] = img_metas[i]['center'] + s[i, :] = img_metas[i]['scale'] + image_paths.append(img_metas[i]['image_file']) + + if 'bbox_score' in img_metas[i]: + score[i] = np.array(img_metas[i]['bbox_score']).reshape(-1) + if bbox_ids is not None: + bbox_ids.append(img_metas[i]['bbox_id']) + + preds, maxvals = keypoints_from_regression(output, c, s, + kwargs['img_size']) + + all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) + all_boxes = np.zeros((batch_size, 6), dtype=np.float32) + all_preds[:, :, 0:2] = preds[:, :, 0:2] + all_preds[:, :, 2:3] = maxvals + all_boxes[:, 0:2] = c[:, 0:2] + all_boxes[:, 2:4] = s[:, 0:2] + all_boxes[:, 4] = np.prod(s * 200.0, axis=1) + all_boxes[:, 5] = score + + result = {} + + result['preds'] = all_preds + result['boxes'] = all_boxes + result['image_paths'] = image_paths + result['bbox_ids'] = bbox_ids + + return result + + def init_weights(self): + normal_init(self.fc, mean=0, std=0.01, bias=0) diff --git a/mmpose/models/heads/hmr_head.py b/mmpose/models/heads/hmr_head.py new file mode 100644 index 0000000..015a307 --- /dev/null +++ b/mmpose/models/heads/hmr_head.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import xavier_init + +from ..builder import HEADS +from ..utils.geometry import rot6d_to_rotmat + + +@HEADS.register_module() +class HMRMeshHead(nn.Module): + """SMPL parameters regressor head of simple baseline. "End-to-end Recovery + of Human Shape and Pose", CVPR'2018. 
+ + Args: + in_channels (int): Number of input channels + smpl_mean_params (str): The file name of the mean SMPL parameters + n_iter (int): The iterations of estimating delta parameters + """ + + def __init__(self, in_channels, smpl_mean_params=None, n_iter=3): + super().__init__() + + self.in_channels = in_channels + self.n_iter = n_iter + + npose = 24 * 6 + nbeta = 10 + ncam = 3 + hidden_dim = 1024 + + self.fc1 = nn.Linear(in_channels + npose + nbeta + ncam, hidden_dim) + self.drop1 = nn.Dropout() + self.fc2 = nn.Linear(hidden_dim, hidden_dim) + self.drop2 = nn.Dropout() + self.decpose = nn.Linear(hidden_dim, npose) + self.decshape = nn.Linear(hidden_dim, nbeta) + self.deccam = nn.Linear(hidden_dim, ncam) + + # Load mean SMPL parameters + if smpl_mean_params is None: + init_pose = torch.zeros([1, npose]) + init_shape = torch.zeros([1, nbeta]) + init_cam = torch.FloatTensor([[1, 0, 0]]) + else: + mean_params = np.load(smpl_mean_params) + init_pose = torch.from_numpy( + mean_params['pose'][:]).unsqueeze(0).float() + init_shape = torch.from_numpy( + mean_params['shape'][:]).unsqueeze(0).float() + init_cam = torch.from_numpy( + mean_params['cam']).unsqueeze(0).float() + self.register_buffer('init_pose', init_pose) + self.register_buffer('init_shape', init_shape) + self.register_buffer('init_cam', init_cam) + + def forward(self, x): + """Forward function. + + x is the image feature map and is expected to be in shape (batch size x + channel number x height x width) + """ + batch_size = x.shape[0] + # extract the global feature vector by average along + # spatial dimension. + x = x.mean(dim=-1).mean(dim=-1) + + init_pose = self.init_pose.expand(batch_size, -1) + init_shape = self.init_shape.expand(batch_size, -1) + init_cam = self.init_cam.expand(batch_size, -1) + + pred_pose = init_pose + pred_shape = init_shape + pred_cam = init_cam + for _ in range(self.n_iter): + xc = torch.cat([x, pred_pose, pred_shape, pred_cam], 1) + xc = self.fc1(xc) + xc = self.drop1(xc) + xc = self.fc2(xc) + xc = self.drop2(xc) + pred_pose = self.decpose(xc) + pred_pose + pred_shape = self.decshape(xc) + pred_shape + pred_cam = self.deccam(xc) + pred_cam + + pred_rotmat = rot6d_to_rotmat(pred_pose).view(batch_size, 24, 3, 3) + out = (pred_rotmat, pred_shape, pred_cam) + return out + + def init_weights(self): + """Initialize model weights.""" + xavier_init(self.decpose, gain=0.01) + xavier_init(self.decshape, gain=0.01) + xavier_init(self.deccam, gain=0.01) diff --git a/mmpose/models/heads/interhand_3d_head.py b/mmpose/models/heads/interhand_3d_head.py new file mode 100644 index 0000000..aebe4a5 --- /dev/null +++ b/mmpose/models/heads/interhand_3d_head.py @@ -0,0 +1,521 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer, + constant_init, normal_init) + +from mmpose.core.evaluation.top_down_eval import ( + keypoints_from_heatmaps3d, multilabel_classification_accuracy) +from mmpose.core.post_processing import flip_back +from mmpose.models.builder import build_loss +from mmpose.models.necks import GlobalAveragePooling +from ..builder import HEADS + + +class Heatmap3DHead(nn.Module): + """Heatmap3DHead is a sub-module of Interhand3DHead, and outputs 3D + heatmaps. Heatmap3DHead is composed of (>=0) number of deconv layers and a + simple conv2d layer. 
+ + Args: + in_channels (int): Number of input channels + out_channels (int): Number of output channels + depth_size (int): Number of depth discretization size + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + num_deconv_kernels (list|tuple): Kernel sizes. + extra (dict): Configs for extra conv layers. Default: None + """ + + def __init__(self, + in_channels, + out_channels, + depth_size=64, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + extra=None): + + super().__init__() + + assert out_channels % depth_size == 0 + self.depth_size = depth_size + self.in_channels = in_channels + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + if num_deconv_layers > 0: + self.deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + elif num_deconv_layers == 0: + self.deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should >= 0.') + + identity_final_layer = False + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [0, 1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + elif extra['final_conv_kernel'] == 1: + padding = 0 + else: + # 0 for Identity mapping. + identity_final_layer = True + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + if identity_final_layer: + self.final_layer = nn.Identity() + else: + conv_channels = num_deconv_filters[ + -1] if num_deconv_layers > 0 else self.in_channels + + layers = [] + if extra is not None: + num_conv_layers = extra.get('num_conv_layers', 0) + num_conv_kernels = extra.get('num_conv_kernels', + [1] * num_conv_layers) + + for i in range(num_conv_layers): + layers.append( + build_conv_layer( + dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=conv_channels, + kernel_size=num_conv_kernels[i], + stride=1, + padding=(num_conv_kernels[i] - 1) // 2)) + layers.append( + build_norm_layer(dict(type='BN'), conv_channels)[1]) + layers.append(nn.ReLU(inplace=True)) + + layers.append( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding)) + + if len(layers) > 1: + self.final_layer = nn.Sequential(*layers) + else: + self.final_layer = layers[0] + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + """Make deconv layers.""" + if num_layers != len(num_filters): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_filters({len(num_filters)})' + raise ValueError(error_msg) + if num_layers != len(num_kernels): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_kernels({len(num_kernels)})' + raise ValueError(error_msg) + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=self.in_channels, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False)) + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + self.in_channels = planes + + return nn.Sequential(*layers) + + @staticmethod + def _get_deconv_cfg(deconv_kernel): + """Get 
configurations for deconv layers.""" + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + else: + raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') + + return deconv_kernel, padding, output_padding + + def forward(self, x): + """Forward function.""" + x = self.deconv_layers(x) + x = self.final_layer(x) + N, C, H, W = x.shape + # reshape the 2D heatmap to 3D heatmap + x = x.reshape(N, C // self.depth_size, self.depth_size, H, W) + return x + + def init_weights(self): + """Initialize model weights.""" + for _, m in self.deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for m in self.final_layer.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + + +class Heatmap1DHead(nn.Module): + """Heatmap1DHead is a sub-module of Interhand3DHead, and outputs 1D + heatmaps. + + Args: + in_channels (int): Number of input channels + heatmap_size (int): Heatmap size + hidden_dims (list|tuple): Number of feature dimension of FC layers. + """ + + def __init__(self, in_channels=2048, heatmap_size=64, hidden_dims=(512, )): + super().__init__() + + self.in_channels = in_channels + self.heatmap_size = heatmap_size + + feature_dims = [in_channels, *hidden_dims, heatmap_size] + self.fc = self._make_linear_layers(feature_dims, relu_final=False) + + def soft_argmax_1d(self, heatmap1d): + heatmap1d = F.softmax(heatmap1d, 1) + accu = heatmap1d * torch.arange( + self.heatmap_size, dtype=heatmap1d.dtype, + device=heatmap1d.device)[None, :] + coord = accu.sum(dim=1) + return coord + + def _make_linear_layers(self, feat_dims, relu_final=False): + """Make linear layers.""" + layers = [] + for i in range(len(feat_dims) - 1): + layers.append(nn.Linear(feat_dims[i], feat_dims[i + 1])) + if i < len(feat_dims) - 2 or \ + (i == len(feat_dims) - 2 and relu_final): + layers.append(nn.ReLU(inplace=True)) + return nn.Sequential(*layers) + + def forward(self, x): + """Forward function.""" + heatmap1d = self.fc(x) + value = self.soft_argmax_1d(heatmap1d).view(-1, 1) + return value + + def init_weights(self): + """Initialize model weights.""" + for m in self.fc.modules(): + if isinstance(m, nn.Linear): + normal_init(m, mean=0, std=0.01, bias=0) + + +class MultilabelClassificationHead(nn.Module): + """MultilabelClassificationHead is a sub-module of Interhand3DHead, and + outputs hand type classification. + + Args: + in_channels (int): Number of input channels + num_labels (int): Number of labels + hidden_dims (list|tuple): Number of hidden dimension of FC layers. 
+ """ + + def __init__(self, in_channels=2048, num_labels=2, hidden_dims=(512, )): + super().__init__() + + self.in_channels = in_channels + self.num_labesl = num_labels + + feature_dims = [in_channels, *hidden_dims, num_labels] + self.fc = self._make_linear_layers(feature_dims, relu_final=False) + + def _make_linear_layers(self, feat_dims, relu_final=False): + """Make linear layers.""" + layers = [] + for i in range(len(feat_dims) - 1): + layers.append(nn.Linear(feat_dims[i], feat_dims[i + 1])) + if i < len(feat_dims) - 2 or \ + (i == len(feat_dims) - 2 and relu_final): + layers.append(nn.ReLU(inplace=True)) + return nn.Sequential(*layers) + + def forward(self, x): + """Forward function.""" + labels = torch.sigmoid(self.fc(x)) + return labels + + def init_weights(self): + for m in self.fc.modules(): + if isinstance(m, nn.Linear): + normal_init(m, mean=0, std=0.01, bias=0) + + +@HEADS.register_module() +class Interhand3DHead(nn.Module): + """Interhand 3D head of paper ref: Gyeongsik Moon. "InterHand2.6M: A + Dataset and Baseline for 3D Interacting Hand Pose Estimation from a Single + RGB Image". + + Args: + keypoint_head_cfg (dict): Configs of Heatmap3DHead for hand + keypoint estimation. + root_head_cfg (dict): Configs of Heatmap1DHead for relative + hand root depth estimation. + hand_type_head_cfg (dict): Configs of MultilabelClassificationHead + for hand type classification. + loss_keypoint (dict): Config for keypoint loss. Default: None. + loss_root_depth (dict): Config for relative root depth loss. + Default: None. + loss_hand_type (dict): Config for hand type classification + loss. Default: None. + """ + + def __init__(self, + keypoint_head_cfg, + root_head_cfg, + hand_type_head_cfg, + loss_keypoint=None, + loss_root_depth=None, + loss_hand_type=None, + train_cfg=None, + test_cfg=None): + super().__init__() + + # build sub-module heads + self.right_hand_head = Heatmap3DHead(**keypoint_head_cfg) + self.left_hand_head = Heatmap3DHead(**keypoint_head_cfg) + self.root_head = Heatmap1DHead(**root_head_cfg) + self.hand_type_head = MultilabelClassificationHead( + **hand_type_head_cfg) + self.neck = GlobalAveragePooling() + + # build losses + self.keypoint_loss = build_loss(loss_keypoint) + self.root_depth_loss = build_loss(loss_root_depth) + self.hand_type_loss = build_loss(loss_hand_type) + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap') + + def init_weights(self): + self.left_hand_head.init_weights() + self.right_hand_head.init_weights() + self.root_head.init_weights() + self.hand_type_head.init_weights() + + def get_loss(self, output, target, target_weight): + """Calculate loss for hand keypoint heatmaps, relative root depth and + hand type. + + Args: + output (list[Tensor]): a list of outputs from multiple heads. + target (list[Tensor]): a list of targets for multiple heads. + target_weight (list[Tensor]): a list of targets weight for + multiple heads. 
+ """ + losses = dict() + + # hand keypoint loss + assert not isinstance(self.keypoint_loss, nn.Sequential) + out, tar, tar_weight = output[0], target[0], target_weight[0] + assert tar.dim() == 5 and tar_weight.dim() == 3 + losses['hand_loss'] = self.keypoint_loss(out, tar, tar_weight) + + # relative root depth loss + assert not isinstance(self.root_depth_loss, nn.Sequential) + out, tar, tar_weight = output[1], target[1], target_weight[1] + assert tar.dim() == 2 and tar_weight.dim() == 2 + losses['rel_root_loss'] = self.root_depth_loss(out, tar, tar_weight) + + # hand type loss + assert not isinstance(self.hand_type_loss, nn.Sequential) + out, tar, tar_weight = output[2], target[2], target_weight[2] + assert tar.dim() == 2 and tar_weight.dim() in [1, 2] + losses['hand_type_loss'] = self.hand_type_loss(out, tar, tar_weight) + + return losses + + def get_accuracy(self, output, target, target_weight): + """Calculate accuracy for hand type. + + Args: + output (list[Tensor]): a list of outputs from multiple heads. + target (list[Tensor]): a list of targets for multiple heads. + target_weight (list[Tensor]): a list of targets weight for + multiple heads. + """ + accuracy = dict() + avg_acc = multilabel_classification_accuracy( + output[2].detach().cpu().numpy(), + target[2].detach().cpu().numpy(), + target_weight[2].detach().cpu().numpy(), + ) + accuracy['acc_classification'] = float(avg_acc) + return accuracy + + def forward(self, x): + """Forward function.""" + outputs = [] + outputs.append( + torch.cat([self.right_hand_head(x), + self.left_hand_head(x)], dim=1)) + x = self.neck(x) + outputs.append(self.root_head(x)) + outputs.append(self.hand_type_head(x)) + return outputs + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output (list[np.ndarray]): list of output hand keypoint + heatmaps, relative root depth and hand type. + + Args: + x (torch.Tensor[N,K,H,W]): Input features. + flip_pairs (None | list[tuple()): + Pairs of keypoints which are mirrored. + """ + + output = self.forward(x) + + if flip_pairs is not None: + # flip 3D heatmap + heatmap_3d = output[0] + N, K, D, H, W = heatmap_3d.shape + # reshape 3D heatmap to 2D heatmap + heatmap_3d = heatmap_3d.reshape(N, K * D, H, W) + # 2D heatmap flip + heatmap_3d_flipped_back = flip_back( + heatmap_3d.detach().cpu().numpy(), + flip_pairs, + target_type=self.target_type) + # reshape back to 3D heatmap + heatmap_3d_flipped_back = heatmap_3d_flipped_back.reshape( + N, K, D, H, W) + # feature is not aligned, shift flipped heatmap for higher accuracy + if self.test_cfg.get('shift_heatmap', False): + heatmap_3d_flipped_back[..., + 1:] = heatmap_3d_flipped_back[..., :-1] + output[0] = heatmap_3d_flipped_back + + # flip relative hand root depth + output[1] = -output[1].detach().cpu().numpy() + + # flip hand type + hand_type = output[2].detach().cpu().numpy() + hand_type_flipped_back = hand_type.copy() + hand_type_flipped_back[:, 0] = hand_type[:, 1] + hand_type_flipped_back[:, 1] = hand_type[:, 0] + output[2] = hand_type_flipped_back + else: + output = [out.detach().cpu().numpy() for out in output] + + return output + + def decode(self, img_metas, output, **kwargs): + """Decode hand keypoint, relative root depth and hand type. 
+ + Args: + img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + - "heatmap3d_depth_bound": depth bound of hand keypoint + 3D heatmap + - "root_depth_bound": depth bound of relative root depth + 1D heatmap + output (list[np.ndarray]): model predicted 3D heatmaps, relative + root depth and hand type. + """ + + batch_size = len(img_metas) + result = {} + + heatmap3d_depth_bound = np.ones(batch_size, dtype=np.float32) + root_depth_bound = np.ones(batch_size, dtype=np.float32) + center = np.zeros((batch_size, 2), dtype=np.float32) + scale = np.zeros((batch_size, 2), dtype=np.float32) + image_paths = [] + score = np.ones(batch_size, dtype=np.float32) + if 'bbox_id' in img_metas[0]: + bbox_ids = [] + else: + bbox_ids = None + + for i in range(batch_size): + heatmap3d_depth_bound[i] = img_metas[i]['heatmap3d_depth_bound'] + root_depth_bound[i] = img_metas[i]['root_depth_bound'] + center[i, :] = img_metas[i]['center'] + scale[i, :] = img_metas[i]['scale'] + image_paths.append(img_metas[i]['image_file']) + + if 'bbox_score' in img_metas[i]: + score[i] = np.array(img_metas[i]['bbox_score']).reshape(-1) + if bbox_ids is not None: + bbox_ids.append(img_metas[i]['bbox_id']) + + all_boxes = np.zeros((batch_size, 6), dtype=np.float32) + all_boxes[:, 0:2] = center[:, 0:2] + all_boxes[:, 2:4] = scale[:, 0:2] + # scale is defined as: bbox_size / 200.0, so we + # need multiply 200.0 to get bbox size + all_boxes[:, 4] = np.prod(scale * 200.0, axis=1) + all_boxes[:, 5] = score + result['boxes'] = all_boxes + result['image_paths'] = image_paths + result['bbox_ids'] = bbox_ids + + # decode 3D heatmaps of hand keypoints + heatmap3d = output[0] + preds, maxvals = keypoints_from_heatmaps3d(heatmap3d, center, scale) + keypoints_3d = np.zeros((batch_size, preds.shape[1], 4), + dtype=np.float32) + keypoints_3d[:, :, 0:3] = preds[:, :, 0:3] + keypoints_3d[:, :, 3:4] = maxvals + # transform keypoint depth to camera space + keypoints_3d[:, :, 2] = \ + (keypoints_3d[:, :, 2] / self.right_hand_head.depth_size - 0.5) \ + * heatmap3d_depth_bound[:, np.newaxis] + + result['preds'] = keypoints_3d + + # decode relative hand root depth + # transform relative root depth to camera space + result['rel_root_depth'] = (output[1] / self.root_head.heatmap_size - + 0.5) * root_depth_bound + + # decode hand type + result['hand_type'] = output[2] > 0.5 + return result diff --git a/mmpose/models/heads/temporal_regression_head.py b/mmpose/models/heads/temporal_regression_head.py new file mode 100644 index 0000000..97a07f9 --- /dev/null +++ b/mmpose/models/heads/temporal_regression_head.py @@ -0,0 +1,319 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch.nn as nn +from mmcv.cnn import build_conv_layer, constant_init, kaiming_init +from mmcv.utils.parrots_wrapper import _BatchNorm + +from mmpose.core import (WeightNormClipHook, compute_similarity_transform, + fliplr_regression) +from mmpose.models.builder import HEADS, build_loss + + +@HEADS.register_module() +class TemporalRegressionHead(nn.Module): + """Regression head of VideoPose3D. + + "3D human pose estimation in video with temporal convolutions and + semi-supervised training", CVPR'2019. + + Args: + in_channels (int): Number of input channels + num_joints (int): Number of joints + loss_keypoint (dict): Config for keypoint loss. Default: None. 
+ max_norm (float|None): if not None, the weight of convolution layers + will be clipped to have a maximum norm of max_norm. + is_trajectory (bool): If the model only predicts root joint + position, then this arg should be set to True. In this case, + traj_loss will be calculated. Otherwise, it should be set to + False. Default: False. + """ + + def __init__(self, + in_channels, + num_joints, + max_norm=None, + loss_keypoint=None, + is_trajectory=False, + train_cfg=None, + test_cfg=None): + super().__init__() + + self.in_channels = in_channels + self.num_joints = num_joints + self.max_norm = max_norm + self.loss = build_loss(loss_keypoint) + self.is_trajectory = is_trajectory + if self.is_trajectory: + assert self.num_joints == 1 + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + + self.conv = build_conv_layer( + dict(type='Conv1d'), in_channels, num_joints * 3, 1) + + if self.max_norm is not None: + # Apply weight norm clip to conv layers + weight_clip = WeightNormClipHook(self.max_norm) + for module in self.modules(): + if isinstance(module, nn.modules.conv._ConvNd): + weight_clip.register(module) + + @staticmethod + def _transform_inputs(x): + """Transform inputs for decoder. + + Args: + inputs (tuple or list of Tensor | Tensor): multi-level features. + + Returns: + Tensor: The transformed inputs + """ + if not isinstance(x, (list, tuple)): + return x + + assert len(x) > 0 + + # return the top-level feature of the 1D feature pyramid + return x[-1] + + def forward(self, x): + """Forward function.""" + x = self._transform_inputs(x) + + assert x.ndim == 3 and x.shape[2] == 1, f'Invalid shape {x.shape}' + output = self.conv(x) + N = output.shape[0] + return output.reshape(N, self.num_joints, 3) + + def get_loss(self, output, target, target_weight): + """Calculate keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + output (torch.Tensor[N, K, 3]): Output keypoints. + target (torch.Tensor[N, K, 3]): Target keypoints. + target_weight (torch.Tensor[N, K, 3]): + Weights across different joint types. + If self.is_trajectory is True and target_weight is None, + target_weight will be set inversely proportional to joint + depth. + """ + losses = dict() + assert not isinstance(self.loss, nn.Sequential) + + # trajectory model + if self.is_trajectory: + if target.dim() == 2: + target.unsqueeze_(1) + + if target_weight is None: + target_weight = (1 / target[:, :, 2:]).expand(target.shape) + assert target.dim() == 3 and target_weight.dim() == 3 + + losses['traj_loss'] = self.loss(output, target, target_weight) + + # pose model + else: + if target_weight is None: + target_weight = target.new_ones(target.shape) + assert target.dim() == 3 and target_weight.dim() == 3 + losses['reg_loss'] = self.loss(output, target, target_weight) + + return losses + + def get_accuracy(self, output, target, target_weight, metas): + """Calculate accuracy for keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + output (torch.Tensor[N, K, 3]): Output keypoints. + target (torch.Tensor[N, K, 3]): Target keypoints. + target_weight (torch.Tensor[N, K, 3]): + Weights across different joint types. + metas (list(dict)): Information about data augmentation including: + + - target_image_path (str): Optional, path to the image file + - target_mean (float): Optional, normalization parameter of + the target pose. + - target_std (float): Optional, normalization parameter of the + target pose. 
+ - root_position (np.ndarray[3,1]): Optional, global + position of the root joint. + - root_index (torch.ndarray[1,]): Optional, original index of + the root joint before root-centering. + """ + + accuracy = dict() + + N = output.shape[0] + output_ = output.detach().cpu().numpy() + target_ = target.detach().cpu().numpy() + # Denormalize the predicted pose + if 'target_mean' in metas[0] and 'target_std' in metas[0]: + target_mean = np.stack([m['target_mean'] for m in metas]) + target_std = np.stack([m['target_std'] for m in metas]) + output_ = self._denormalize_joints(output_, target_mean, + target_std) + target_ = self._denormalize_joints(target_, target_mean, + target_std) + + # Restore global position + if self.test_cfg.get('restore_global_position', False): + root_pos = np.stack([m['root_position'] for m in metas]) + root_idx = metas[0].get('root_position_index', None) + output_ = self._restore_global_position(output_, root_pos, + root_idx) + target_ = self._restore_global_position(target_, root_pos, + root_idx) + # Get target weight + if target_weight is None: + target_weight_ = np.ones_like(target_) + else: + target_weight_ = target_weight.detach().cpu().numpy() + if self.test_cfg.get('restore_global_position', False): + root_idx = metas[0].get('root_position_index', None) + root_weight = metas[0].get('root_joint_weight', 1.0) + target_weight_ = self._restore_root_target_weight( + target_weight_, root_weight, root_idx) + + mpjpe = np.mean( + np.linalg.norm((output_ - target_) * target_weight_, axis=-1)) + + transformed_output = np.zeros_like(output_) + for i in range(N): + transformed_output[i, :, :] = compute_similarity_transform( + output_[i, :, :], target_[i, :, :]) + p_mpjpe = np.mean( + np.linalg.norm( + (transformed_output - target_) * target_weight_, axis=-1)) + + accuracy['mpjpe'] = output.new_tensor(mpjpe) + accuracy['p_mpjpe'] = output.new_tensor(p_mpjpe) + + return accuracy + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_regression (np.ndarray): Output regression. + + Args: + x (torch.Tensor[N, K, 2]): Input features. + flip_pairs (None | list[tuple()): + Pairs of keypoints which are mirrored. + """ + output = self.forward(x) + + if flip_pairs is not None: + output_regression = fliplr_regression( + output.detach().cpu().numpy(), + flip_pairs, + center_mode='static', + center_x=0) + else: + output_regression = output.detach().cpu().numpy() + return output_regression + + def decode(self, metas, output): + """Decode the keypoints from output regression. + + Args: + metas (list(dict)): Information about data augmentation. + By default this includes: + + - "target_image_path": path to the image file + output (np.ndarray[N, K, 3]): predicted regression vector. + metas (list(dict)): Information about data augmentation including: + + - target_image_path (str): Optional, path to the image file + - target_mean (float): Optional, normalization parameter of + the target pose. + - target_std (float): Optional, normalization parameter of the + target pose. + - root_position (np.ndarray[3,1]): Optional, global + position of the root joint. + - root_index (torch.ndarray[1,]): Optional, original index of + the root joint before root-centering. 
+ """ + + # Denormalize the predicted pose + if 'target_mean' in metas[0] and 'target_std' in metas[0]: + target_mean = np.stack([m['target_mean'] for m in metas]) + target_std = np.stack([m['target_std'] for m in metas]) + output = self._denormalize_joints(output, target_mean, target_std) + + # Restore global position + if self.test_cfg.get('restore_global_position', False): + root_pos = np.stack([m['root_position'] for m in metas]) + root_idx = metas[0].get('root_position_index', None) + output = self._restore_global_position(output, root_pos, root_idx) + + target_image_paths = [m.get('target_image_path', None) for m in metas] + result = {'preds': output, 'target_image_paths': target_image_paths} + + return result + + @staticmethod + def _denormalize_joints(x, mean, std): + """Denormalize joint coordinates with given statistics mean and std. + + Args: + x (np.ndarray[N, K, 3]): Normalized joint coordinates. + mean (np.ndarray[K, 3]): Mean value. + std (np.ndarray[K, 3]): Std value. + """ + assert x.ndim == 3 + assert x.shape == mean.shape == std.shape + + return x * std + mean + + @staticmethod + def _restore_global_position(x, root_pos, root_idx=None): + """Restore global position of the root-centered joints. + + Args: + x (np.ndarray[N, K, 3]): root-centered joint coordinates + root_pos (np.ndarray[N,1,3]): The global position of the + root joint. + root_idx (int|None): If not none, the root joint will be inserted + back to the pose at the given index. + """ + x = x + root_pos + if root_idx is not None: + x = np.insert(x, root_idx, root_pos.squeeze(1), axis=1) + return x + + @staticmethod + def _restore_root_target_weight(target_weight, root_weight, root_idx=None): + """Restore the target weight of the root joint after the restoration of + the global position. + + Args: + target_weight (np.ndarray[N, K, 1]): Target weight of relativized + joints. + root_weight (float): The target weight value of the root joint. + root_idx (int|None): If not none, the root joint weight will be + inserted back to the target weight at the given index. + """ + if root_idx is not None: + root_weight = np.full( + target_weight.shape[0], root_weight, dtype=target_weight.dtype) + target_weight = np.insert( + target_weight, root_idx, root_weight[:, None], axis=1) + return target_weight + + def init_weights(self): + """Initialize the weights.""" + for m in self.modules(): + if isinstance(m, nn.modules.conv._ConvNd): + kaiming_init(m, mode='fan_in', nonlinearity='relu') + elif isinstance(m, _BatchNorm): + constant_init(m, 1) diff --git a/mmpose/models/heads/topdown_heatmap_base_head.py b/mmpose/models/heads/topdown_heatmap_base_head.py new file mode 100644 index 0000000..09646ea --- /dev/null +++ b/mmpose/models/heads/topdown_heatmap_base_head.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + +import numpy as np +import torch.nn as nn + +from mmpose.core.evaluation.top_down_eval import keypoints_from_heatmaps + + +class TopdownHeatmapBaseHead(nn.Module): + """Base class for top-down heatmap heads. + + All top-down heatmap heads should subclass it. + All subclass should overwrite: + + Methods:`get_loss`, supporting to calculate loss. + Methods:`get_accuracy`, supporting to calculate accuracy. + Methods:`forward`, supporting to forward model. + Methods:`inference_model`, supporting to inference model. 
+ """ + + __metaclass__ = ABCMeta + + @abstractmethod + def get_loss(self, **kwargs): + """Gets the loss.""" + + @abstractmethod + def get_accuracy(self, **kwargs): + """Gets the accuracy.""" + + @abstractmethod + def forward(self, **kwargs): + """Forward function.""" + + @abstractmethod + def inference_model(self, **kwargs): + """Inference function.""" + + def decode(self, img_metas, output, **kwargs): + """Decode keypoints from heatmaps. + + Args: + img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + output (np.ndarray[N, K, H, W]): model predicted heatmaps. + """ + batch_size = len(img_metas) + + if 'bbox_id' in img_metas[0]: + bbox_ids = [] + else: + bbox_ids = None + + c = np.zeros((batch_size, 2), dtype=np.float32) + s = np.zeros((batch_size, 2), dtype=np.float32) + image_paths = [] + score = np.ones(batch_size) + for i in range(batch_size): + c[i, :] = img_metas[i]['center'] + s[i, :] = img_metas[i]['scale'] + image_paths.append(img_metas[i]['image_file']) + + if 'bbox_score' in img_metas[i]: + score[i] = np.array(img_metas[i]['bbox_score']).reshape(-1) + if bbox_ids is not None: + bbox_ids.append(img_metas[i]['bbox_id']) + + preds, maxvals = keypoints_from_heatmaps( + output, + c, + s, + unbiased=self.test_cfg.get('unbiased_decoding', False), + post_process=self.test_cfg.get('post_process', 'default'), + kernel=self.test_cfg.get('modulate_kernel', 11), + valid_radius_factor=self.test_cfg.get('valid_radius_factor', + 0.0546875), + use_udp=self.test_cfg.get('use_udp', False), + target_type=self.test_cfg.get('target_type', 'GaussianHeatmap')) + + all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) + all_boxes = np.zeros((batch_size, 6), dtype=np.float32) + all_preds[:, :, 0:2] = preds[:, :, 0:2] + all_preds[:, :, 2:3] = maxvals + all_boxes[:, 0:2] = c[:, 0:2] + all_boxes[:, 2:4] = s[:, 0:2] + all_boxes[:, 4] = np.prod(s * 200.0, axis=1) + all_boxes[:, 5] = score + + result = {} + + result['preds'] = all_preds + result['boxes'] = all_boxes + result['image_paths'] = image_paths + result['bbox_ids'] = bbox_ids + + return result + + @staticmethod + def _get_deconv_cfg(deconv_kernel): + """Get configurations for deconv layers.""" + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + else: + raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') + + return deconv_kernel, padding, output_padding diff --git a/mmpose/models/heads/topdown_heatmap_multi_stage_head.py b/mmpose/models/heads/topdown_heatmap_multi_stage_head.py new file mode 100644 index 0000000..c439f5b --- /dev/null +++ b/mmpose/models/heads/topdown_heatmap_multi_stage_head.py @@ -0,0 +1,572 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy as cp + +import torch.nn as nn +from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, Linear, + build_activation_layer, build_conv_layer, + build_norm_layer, build_upsample_layer, constant_init, + kaiming_init, normal_init) + +from mmpose.core.evaluation import pose_pck_accuracy +from mmpose.core.post_processing import flip_back +from mmpose.models.builder import build_loss +from ..builder import HEADS +from .topdown_heatmap_base_head import TopdownHeatmapBaseHead + + +@HEADS.register_module() +class TopdownHeatmapMultiStageHead(TopdownHeatmapBaseHead): + """Top-down heatmap multi-stage head. + + TopdownHeatmapMultiStageHead is consisted of multiple branches, + each of which has num_deconv_layers(>=0) number of deconv layers + and a simple conv2d layer. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + num_stages (int): Number of stages. + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + If num_deconv_layers > 0, the length of + num_deconv_kernels (list|tuple): Kernel sizes. + loss_keypoint (dict): Config for keypoint loss. Default: None. + """ + + def __init__(self, + in_channels=512, + out_channels=17, + num_stages=1, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + extra=None, + loss_keypoint=None, + train_cfg=None, + test_cfg=None): + super().__init__() + + self.in_channels = in_channels + self.num_stages = num_stages + self.loss = build_loss(loss_keypoint) + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap') + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + # build multi-stage deconv layers + self.multi_deconv_layers = nn.ModuleList([]) + for _ in range(self.num_stages): + if num_deconv_layers > 0: + deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + elif num_deconv_layers == 0: + deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should >= 0.') + self.multi_deconv_layers.append(deconv_layers) + + identity_final_layer = False + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [0, 1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + elif extra['final_conv_kernel'] == 1: + padding = 0 + else: + # 0 for Identity mapping. + identity_final_layer = True + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + # build multi-stage final layers + self.multi_final_layers = nn.ModuleList([]) + for i in range(self.num_stages): + if identity_final_layer: + final_layer = nn.Identity() + else: + final_layer = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=num_deconv_filters[-1] + if num_deconv_layers > 0 else in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding) + self.multi_final_layers.append(final_layer) + + def get_loss(self, output, target, target_weight): + """Calculate top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - num_outputs: O + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): + Output heatmaps. 
+ target (torch.Tensor[N,K,H,W]): + Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. + """ + + losses = dict() + + assert isinstance(output, list) + assert target.dim() == 4 and target_weight.dim() == 3 + + if isinstance(self.loss, nn.Sequential): + assert len(self.loss) == len(output) + for i in range(len(output)): + target_i = target + target_weight_i = target_weight + if isinstance(self.loss, nn.Sequential): + loss_func = self.loss[i] + else: + loss_func = self.loss + loss_i = loss_func(output[i], target_i, target_weight_i) + if 'heatmap_loss' not in losses: + losses['heatmap_loss'] = loss_i + else: + losses['heatmap_loss'] += loss_i + + return losses + + def get_accuracy(self, output, target, target_weight): + """Calculate accuracy for top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): Output heatmaps. + target (torch.Tensor[N,K,H,W]): Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. + """ + + accuracy = dict() + + if self.target_type == 'GaussianHeatmap': + _, avg_acc, _ = pose_pck_accuracy( + output[-1].detach().cpu().numpy(), + target.detach().cpu().numpy(), + target_weight.detach().cpu().numpy().squeeze(-1) > 0) + accuracy['acc_pose'] = float(avg_acc) + + return accuracy + + def forward(self, x): + """Forward function. + + Returns: + out (list[Tensor]): a list of heatmaps from multiple stages. + """ + out = [] + assert isinstance(x, list) + for i in range(self.num_stages): + y = self.multi_deconv_layers[i](x[i]) + y = self.multi_final_layers[i](y) + out.append(y) + return out + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_heatmap (np.ndarray): Output heatmaps. + + Args: + x (List[torch.Tensor[NxKxHxW]]): Input features. + flip_pairs (None | list[tuple()): + Pairs of keypoints which are mirrored. 
+ """ + output = self.forward(x) + assert isinstance(output, list) + output = output[-1] + + if flip_pairs is not None: + # perform flip + output_heatmap = flip_back( + output.detach().cpu().numpy(), + flip_pairs, + target_type=self.target_type) + # feature is not aligned, shift flipped heatmap for higher accuracy + if self.test_cfg.get('shift_heatmap', False): + output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] + else: + output_heatmap = output.detach().cpu().numpy() + + return output_heatmap + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + """Make deconv layers.""" + if num_layers != len(num_filters): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_filters({len(num_filters)})' + raise ValueError(error_msg) + if num_layers != len(num_kernels): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_kernels({len(num_kernels)})' + raise ValueError(error_msg) + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=self.in_channels, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False)) + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + self.in_channels = planes + + return nn.Sequential(*layers) + + def init_weights(self): + """Initialize model weights.""" + for _, m in self.multi_deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for m in self.multi_final_layers.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) + + +class PredictHeatmap(nn.Module): + """Predict the heat map for an input feature. + + Args: + unit_channels (int): Number of input channels. + out_channels (int): Number of output channels. + out_shape (tuple): Shape of the output heatmap. + use_prm (bool): Whether to use pose refine machine. Default: False. + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + """ + + def __init__(self, + unit_channels, + out_channels, + out_shape, + use_prm=False, + norm_cfg=dict(type='BN')): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.unit_channels = unit_channels + self.out_channels = out_channels + self.out_shape = out_shape + self.use_prm = use_prm + if use_prm: + self.prm = PRM(out_channels, norm_cfg=norm_cfg) + self.conv_layers = nn.Sequential( + ConvModule( + unit_channels, + unit_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=norm_cfg, + inplace=False), + ConvModule( + unit_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=None, + inplace=False)) + + def forward(self, feature): + feature = self.conv_layers(feature) + output = nn.functional.interpolate( + feature, size=self.out_shape, mode='bilinear', align_corners=True) + if self.use_prm: + output = self.prm(output) + return output + + +class PRM(nn.Module): + """Pose Refine Machine. + + Please refer to "Learning Delicate Local Representations + for Multi-Person Pose Estimation" (ECCV 2020). + + Args: + out_channels (int): Channel number of the output. Equals to + the number of key points. + norm_cfg (dict): dictionary to construct and config norm layer. 
+ Default: dict(type='BN') + """ + + def __init__(self, out_channels, norm_cfg=dict(type='BN')): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.out_channels = out_channels + self.global_pooling = nn.AdaptiveAvgPool2d((1, 1)) + self.middle_path = nn.Sequential( + Linear(self.out_channels, self.out_channels), + build_norm_layer(dict(type='BN1d'), out_channels)[1], + build_activation_layer(dict(type='ReLU')), + Linear(self.out_channels, self.out_channels), + build_norm_layer(dict(type='BN1d'), out_channels)[1], + build_activation_layer(dict(type='ReLU')), + build_activation_layer(dict(type='Sigmoid'))) + + self.bottom_path = nn.Sequential( + ConvModule( + self.out_channels, + self.out_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=norm_cfg, + inplace=False), + DepthwiseSeparableConvModule( + self.out_channels, + 1, + kernel_size=9, + stride=1, + padding=4, + norm_cfg=norm_cfg, + inplace=False), build_activation_layer(dict(type='Sigmoid'))) + self.conv_bn_relu_prm_1 = ConvModule( + self.out_channels, + self.out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + inplace=False) + + def forward(self, x): + out = self.conv_bn_relu_prm_1(x) + out_1 = out + + out_2 = self.global_pooling(out_1) + out_2 = out_2.view(out_2.size(0), -1) + out_2 = self.middle_path(out_2) + out_2 = out_2.unsqueeze(2) + out_2 = out_2.unsqueeze(3) + + out_3 = self.bottom_path(out_1) + out = out_1 * (1 + out_2 * out_3) + + return out + + +@HEADS.register_module() +class TopdownHeatmapMSMUHead(TopdownHeatmapBaseHead): + """Heads for multi-stage multi-unit heads used in Multi-Stage Pose + estimation Network (MSPN), and Residual Steps Networks (RSN). + + Args: + unit_channels (int): Number of input channels. + out_channels (int): Number of output channels. + out_shape (tuple): Shape of the output heatmap. + num_stages (int): Number of stages. + num_units (int): Number of units in each stage. + use_prm (bool): Whether to use pose refine machine (PRM). + Default: False. + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + loss_keypoint (dict): Config for keypoint loss. Default: None. + """ + + def __init__(self, + out_shape, + unit_channels=256, + out_channels=17, + num_stages=4, + num_units=4, + use_prm=False, + norm_cfg=dict(type='BN'), + loss_keypoint=None, + train_cfg=None, + test_cfg=None): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap') + + self.out_shape = out_shape + self.unit_channels = unit_channels + self.out_channels = out_channels + self.num_stages = num_stages + self.num_units = num_units + + self.loss = build_loss(loss_keypoint) + + self.predict_layers = nn.ModuleList([]) + for i in range(self.num_stages): + for j in range(self.num_units): + self.predict_layers.append( + PredictHeatmap( + unit_channels, + out_channels, + out_shape, + use_prm, + norm_cfg=norm_cfg)) + + def get_loss(self, output, target, target_weight): + """Calculate top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - num_outputs: O + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,O,K,H,W]): Output heatmaps. + target (torch.Tensor[N,O,K,H,W]): Target heatmaps. 
+ target_weight (torch.Tensor[N,O,K,1]): + Weights across different joint types. + """ + + losses = dict() + + assert isinstance(output, list) + assert target.dim() == 5 and target_weight.dim() == 4 + assert target.size(1) == len(output) + + if isinstance(self.loss, nn.Sequential): + assert len(self.loss) == len(output) + for i in range(len(output)): + target_i = target[:, i, :, :, :] + target_weight_i = target_weight[:, i, :, :] + + if isinstance(self.loss, nn.Sequential): + loss_func = self.loss[i] + else: + loss_func = self.loss + + loss_i = loss_func(output[i], target_i, target_weight_i) + if 'heatmap_loss' not in losses: + losses['heatmap_loss'] = loss_i + else: + losses['heatmap_loss'] += loss_i + + return losses + + def get_accuracy(self, output, target, target_weight): + """Calculate accuracy for top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): Output heatmaps. + target (torch.Tensor[N,K,H,W]): Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. + """ + + accuracy = dict() + + if self.target_type == 'GaussianHeatmap': + assert isinstance(output, list) + assert target.dim() == 5 and target_weight.dim() == 4 + _, avg_acc, _ = pose_pck_accuracy( + output[-1].detach().cpu().numpy(), + target[:, -1, ...].detach().cpu().numpy(), + target_weight[:, -1, + ...].detach().cpu().numpy().squeeze(-1) > 0) + accuracy['acc_pose'] = float(avg_acc) + + return accuracy + + def forward(self, x): + """Forward function. + + Returns: + out (list[Tensor]): a list of heatmaps from multiple stages + and units. + """ + out = [] + assert isinstance(x, list) + assert len(x) == self.num_stages + assert isinstance(x[0], list) + assert len(x[0]) == self.num_units + assert x[0][0].shape[1] == self.unit_channels + for i in range(self.num_stages): + for j in range(self.num_units): + y = self.predict_layers[i * self.num_units + j](x[i][j]) + out.append(y) + + return out + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_heatmap (np.ndarray): Output heatmaps. + + Args: + x (list[torch.Tensor[N,K,H,W]]): Input features. + flip_pairs (None | list[tuple]): + Pairs of keypoints which are mirrored. + """ + output = self.forward(x) + assert isinstance(output, list) + output = output[-1] + if flip_pairs is not None: + output_heatmap = flip_back( + output.detach().cpu().numpy(), + flip_pairs, + target_type=self.target_type) + # feature is not aligned, shift flipped heatmap for higher accuracy + if self.test_cfg.get('shift_heatmap', False): + output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] + else: + output_heatmap = output.detach().cpu().numpy() + return output_heatmap + + def init_weights(self): + """Initialize model weights.""" + for m in self.predict_layers.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + elif isinstance(m, nn.Linear): + normal_init(m, std=0.01) diff --git a/mmpose/models/heads/topdown_heatmap_simple_head.py b/mmpose/models/heads/topdown_heatmap_simple_head.py new file mode 100644 index 0000000..72f3348 --- /dev/null +++ b/mmpose/models/heads/topdown_heatmap_simple_head.py @@ -0,0 +1,350 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
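Several inference_model implementations in this patch share the same flip-test post-processing: mirror the prediction made on the horizontally flipped image back to the original frame, swap left/right keypoint channels, and, when shift_heatmap is enabled, shift the result one pixel to the right to compensate for feature misalignment. The following standalone NumPy sketch, not part of the patch, is a rough equivalent for Gaussian heatmaps; the batch shape and the single flip pair are illustrative assumptions, and the final flip-test averaging is normally done by the calling detector rather than by the head.

    import numpy as np

    heatmaps = np.random.rand(1, 2, 4, 4).astype(np.float32)          # prediction on the original image (N, K, H, W)
    heatmaps_flipped = np.random.rand(1, 2, 4, 4).astype(np.float32)  # prediction on the flipped image
    flip_pairs = [(0, 1)]  # toy left/right keypoint pair

    flipped_back = heatmaps_flipped[..., ::-1].copy()  # mirror back along the width axis
    for left, right in flip_pairs:
        flipped_back[:, [left, right]] = flipped_back[:, [right, left]]  # swap mirrored keypoints

    flipped_back[..., 1:] = flipped_back[..., :-1]  # shift_heatmap: shift one column right

    fused = 0.5 * (heatmaps + flipped_back)  # flip-test averaging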
+import torch +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer, + constant_init, normal_init) + +from mmpose.core.evaluation import pose_pck_accuracy +from mmpose.core.post_processing import flip_back +from mmpose.models.builder import build_loss +from mmpose.models.utils.ops import resize +from ..builder import HEADS +import torch.nn.functional as F +from .topdown_heatmap_base_head import TopdownHeatmapBaseHead + + +@HEADS.register_module() +class TopdownHeatmapSimpleHead(TopdownHeatmapBaseHead): + """Top-down heatmap simple head. paper ref: Bin Xiao et al. ``Simple + Baselines for Human Pose Estimation and Tracking``. + + TopdownHeatmapSimpleHead is consisted of (>=0) number of deconv layers + and a simple conv2d layer. + + Args: + in_channels (int): Number of input channels + out_channels (int): Number of output channels + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + If num_deconv_layers > 0, the length of + num_deconv_kernels (list|tuple): Kernel sizes. + in_index (int|Sequence[int]): Input feature index. Default: 0 + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + Default: None. + + - 'resize_concat': Multiple feature maps will be resized to the + same size as the first one and then concat together. + Usually used in FCN head of HRNet. + - 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + - None: Only one select feature map is allowed. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + loss_keypoint (dict): Config for keypoint loss. Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + extra=None, + in_index=0, + input_transform=None, + align_corners=False, + loss_keypoint=None, + train_cfg=None, + test_cfg=None, + upsample=0,): + super().__init__() + + self.in_channels = in_channels + self.loss = build_loss(loss_keypoint) + self.upsample = upsample + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap') + + self._init_inputs(in_channels, in_index, input_transform) + self.in_index = in_index + self.align_corners = align_corners + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + if num_deconv_layers > 0: + self.deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + elif num_deconv_layers == 0: + self.deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should >= 0.') + + identity_final_layer = False + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [0, 1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + elif extra['final_conv_kernel'] == 1: + padding = 0 + else: + # 0 for Identity mapping. 
+ identity_final_layer = True + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + if identity_final_layer: + self.final_layer = nn.Identity() + else: + conv_channels = num_deconv_filters[ + -1] if num_deconv_layers > 0 else self.in_channels + + layers = [] + if extra is not None: + num_conv_layers = extra.get('num_conv_layers', 0) + num_conv_kernels = extra.get('num_conv_kernels', + [1] * num_conv_layers) + + for i in range(num_conv_layers): + layers.append( + build_conv_layer( + dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=conv_channels, + kernel_size=num_conv_kernels[i], + stride=1, + padding=(num_conv_kernels[i] - 1) // 2)) + layers.append( + build_norm_layer(dict(type='BN'), conv_channels)[1]) + layers.append(nn.ReLU(inplace=True)) + + layers.append( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding)) + + if len(layers) > 1: + self.final_layer = nn.Sequential(*layers) + else: + self.final_layer = layers[0] + + def get_loss(self, output, target, target_weight): + """Calculate top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): Output heatmaps. + target (torch.Tensor[N,K,H,W]): Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. + """ + + losses = dict() + + assert not isinstance(self.loss, nn.Sequential) + assert target.dim() == 4 and target_weight.dim() == 3 + losses['heatmap_loss'] = self.loss(output, target, target_weight) + + return losses + + def get_accuracy(self, output, target, target_weight): + """Calculate accuracy for top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): Output heatmaps. + target (torch.Tensor[N,K,H,W]): Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. + """ + + accuracy = dict() + + if self.target_type == 'GaussianHeatmap': + _, avg_acc, _ = pose_pck_accuracy( + output.detach().cpu().numpy(), + target.detach().cpu().numpy(), + target_weight.detach().cpu().numpy().squeeze(-1) > 0) + accuracy['acc_pose'] = float(avg_acc) + + return accuracy + + def forward(self, x): + """Forward function.""" + x = self._transform_inputs(x) + x = self.deconv_layers(x) + x = self.final_layer(x) + return x + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_heatmap (np.ndarray): Output heatmaps. + + Args: + x (torch.Tensor[N,K,H,W]): Input features. + flip_pairs (None | list[tuple]): + Pairs of keypoints which are mirrored. + """ + output = self.forward(x) + + if flip_pairs is not None: + output_heatmap = flip_back( + output.detach().cpu().numpy(), + flip_pairs, + target_type=self.target_type) + # feature is not aligned, shift flipped heatmap for higher accuracy + if self.test_cfg.get('shift_heatmap', False): + output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] + else: + output_heatmap = output.detach().cpu().numpy() + return output_heatmap + + def _init_inputs(self, in_channels, in_index, input_transform): + """Check and initialize input transforms. + + The in_channels, in_index and input_transform must match. + Specifically, when input_transform is None, only single feature map + will be selected. So in_channels and in_index must be of type int. 
+        When input_transform is not None, in_channels and in_index must be
+        list or tuple, with the same length.
+
+        Args:
+            in_channels (int|Sequence[int]): Input channels.
+            in_index (int|Sequence[int]): Input feature index.
+            input_transform (str|None): Transformation type of input features.
+                Options: 'resize_concat', 'multiple_select', None.
+
+                - 'resize_concat': Multiple feature maps will be resized to
+                    the same size as the first one and then concatenated
+                    together. Usually used in FCN head of HRNet.
+                - 'multiple_select': Multiple feature maps will be bundled
+                    into a list and passed into the decode head.
+                - None: Only one select feature map is allowed.
+        """
+
+        if input_transform is not None:
+            assert input_transform in ['resize_concat', 'multiple_select']
+        self.input_transform = input_transform
+        self.in_index = in_index
+        if input_transform is not None:
+            assert isinstance(in_channels, (list, tuple))
+            assert isinstance(in_index, (list, tuple))
+            assert len(in_channels) == len(in_index)
+            if input_transform == 'resize_concat':
+                self.in_channels = sum(in_channels)
+            else:
+                self.in_channels = in_channels
+        else:
+            assert isinstance(in_channels, int)
+            assert isinstance(in_index, int)
+            self.in_channels = in_channels
+
+    def _transform_inputs(self, inputs):
+        """Transform inputs for decoder.
+
+        Args:
+            inputs (list[Tensor] | Tensor): multi-level img features.
+
+        Returns:
+            Tensor: The transformed inputs
+        """
+        if not isinstance(inputs, list):
+            # A single feature map: optionally upsample it before decoding.
+            if self.upsample > 0:
+                inputs = resize(
+                    input=F.relu(inputs),
+                    scale_factor=self.upsample,
+                    mode='bilinear',
+                    align_corners=self.align_corners)
+            return inputs
+
+        if self.input_transform == 'resize_concat':
+            inputs = [inputs[i] for i in self.in_index]
+            upsampled_inputs = [
+                resize(
+                    input=x,
+                    size=inputs[0].shape[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners) for x in inputs
+            ]
+            inputs = torch.cat(upsampled_inputs, dim=1)
+        elif self.input_transform == 'multiple_select':
+            inputs = [inputs[i] for i in self.in_index]
+        else:
+            inputs = inputs[self.in_index]
+
+        return inputs
+
+    def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
+        """Make deconv layers."""
+        if num_layers != len(num_filters):
+            error_msg = f'num_layers({num_layers}) ' \
+                f'!= length of num_filters({len(num_filters)})'
+            raise ValueError(error_msg)
+        if num_layers != len(num_kernels):
+            error_msg = f'num_layers({num_layers}) ' \
+                f'!= length of num_kernels({len(num_kernels)})'
+            raise ValueError(error_msg)
+
+        layers = []
+        for i in range(num_layers):
+            kernel, padding, output_padding = \
+                self._get_deconv_cfg(num_kernels[i])
+
+            planes = num_filters[i]
+            layers.append(
+                build_upsample_layer(
+                    dict(type='deconv'),
+                    in_channels=self.in_channels,
+                    out_channels=planes,
+                    kernel_size=kernel,
+                    stride=2,
+                    padding=padding,
+                    output_padding=output_padding,
+                    bias=False))
+            layers.append(nn.BatchNorm2d(planes))
+            layers.append(nn.ReLU(inplace=True))
+            self.in_channels = planes
+
+        return nn.Sequential(*layers)
+
+    def init_weights(self):
+        """Initialize model weights."""
+        for _, m in self.deconv_layers.named_modules():
+            if isinstance(m, nn.ConvTranspose2d):
+                normal_init(m, std=0.001)
+            elif isinstance(m, nn.BatchNorm2d):
+                constant_init(m, 1)
+        for m in self.final_layer.modules():
+            if isinstance(m, nn.Conv2d):
+                normal_init(m, std=0.001, bias=0)
+            elif isinstance(m, nn.BatchNorm2d):
+                constant_init(m, 1)
diff --git a/mmpose/models/heads/vipnas_heatmap_simple_head.py
b/mmpose/models/heads/vipnas_heatmap_simple_head.py new file mode 100644 index 0000000..4170312 --- /dev/null +++ b/mmpose/models/heads/vipnas_heatmap_simple_head.py @@ -0,0 +1,349 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer, + constant_init, normal_init) + +from mmpose.core.evaluation import pose_pck_accuracy +from mmpose.core.post_processing import flip_back +from mmpose.models.builder import build_loss +from mmpose.models.utils.ops import resize +from ..builder import HEADS +from .topdown_heatmap_base_head import TopdownHeatmapBaseHead + + +@HEADS.register_module() +class ViPNASHeatmapSimpleHead(TopdownHeatmapBaseHead): + """ViPNAS heatmap simple head. + + ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search. + More details can be found in the `paper + `__ . + + TopdownHeatmapSimpleHead is consisted of (>=0) number of deconv layers + and a simple conv2d layer. + + Args: + in_channels (int): Number of input channels + out_channels (int): Number of output channels + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + If num_deconv_layers > 0, the length of + num_deconv_kernels (list|tuple): Kernel sizes. + num_deconv_groups (list|tuple): Group number. + in_index (int|Sequence[int]): Input feature index. Default: -1 + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + Default: None. + + - 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + - 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + - None: Only one select feature map is allowed. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + loss_keypoint (dict): Config for keypoint loss. Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + num_deconv_layers=3, + num_deconv_filters=(144, 144, 144), + num_deconv_kernels=(4, 4, 4), + num_deconv_groups=(16, 16, 16), + extra=None, + in_index=0, + input_transform=None, + align_corners=False, + loss_keypoint=None, + train_cfg=None, + test_cfg=None): + super().__init__() + + self.in_channels = in_channels + self.loss = build_loss(loss_keypoint) + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap') + + self._init_inputs(in_channels, in_index, input_transform) + self.in_index = in_index + self.align_corners = align_corners + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + if num_deconv_layers > 0: + self.deconv_layers = self._make_deconv_layer( + num_deconv_layers, num_deconv_filters, num_deconv_kernels, + num_deconv_groups) + elif num_deconv_layers == 0: + self.deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should >= 0.') + + identity_final_layer = False + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [0, 1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + elif extra['final_conv_kernel'] == 1: + padding = 0 + else: + # 0 for Identity mapping. 
+ identity_final_layer = True + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + if identity_final_layer: + self.final_layer = nn.Identity() + else: + conv_channels = num_deconv_filters[ + -1] if num_deconv_layers > 0 else self.in_channels + + layers = [] + if extra is not None: + num_conv_layers = extra.get('num_conv_layers', 0) + num_conv_kernels = extra.get('num_conv_kernels', + [1] * num_conv_layers) + + for i in range(num_conv_layers): + layers.append( + build_conv_layer( + dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=conv_channels, + kernel_size=num_conv_kernels[i], + stride=1, + padding=(num_conv_kernels[i] - 1) // 2)) + layers.append( + build_norm_layer(dict(type='BN'), conv_channels)[1]) + layers.append(nn.ReLU(inplace=True)) + + layers.append( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding)) + + if len(layers) > 1: + self.final_layer = nn.Sequential(*layers) + else: + self.final_layer = layers[0] + + def get_loss(self, output, target, target_weight): + """Calculate top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): Output heatmaps. + target (torch.Tensor[N,K,H,W]): Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. + """ + + losses = dict() + + assert not isinstance(self.loss, nn.Sequential) + assert target.dim() == 4 and target_weight.dim() == 3 + losses['heatmap_loss'] = self.loss(output, target, target_weight) + + return losses + + def get_accuracy(self, output, target, target_weight): + """Calculate accuracy for top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): Output heatmaps. + target (torch.Tensor[N,K,H,W]): Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. + """ + + accuracy = dict() + + if self.target_type.lower() == 'GaussianHeatmap'.lower(): + _, avg_acc, _ = pose_pck_accuracy( + output.detach().cpu().numpy(), + target.detach().cpu().numpy(), + target_weight.detach().cpu().numpy().squeeze(-1) > 0) + accuracy['acc_pose'] = float(avg_acc) + + return accuracy + + def forward(self, x): + """Forward function.""" + x = self._transform_inputs(x) + x = self.deconv_layers(x) + x = self.final_layer(x) + return x + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_heatmap (np.ndarray): Output heatmaps. + + Args: + x (torch.Tensor[N,K,H,W]): Input features. + flip_pairs (None | list[tuple]): + Pairs of keypoints which are mirrored. + """ + output = self.forward(x) + + if flip_pairs is not None: + output_heatmap = flip_back( + output.detach().cpu().numpy(), + flip_pairs, + target_type=self.target_type) + # feature is not aligned, shift flipped heatmap for higher accuracy + if self.test_cfg.get('shift_heatmap', False): + output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] + else: + output_heatmap = output.detach().cpu().numpy() + return output_heatmap + + def _init_inputs(self, in_channels, in_index, input_transform): + """Check and initialize input transforms. + + The in_channels, in_index and input_transform must match. + Specifically, when input_transform is None, only single feature map + will be selected. 
So in_channels and in_index must be of type int. + When input_transform is not None, in_channels and in_index must be + list or tuple, with the same length. + + Args: + in_channels (int|Sequence[int]): Input channels. + in_index (int|Sequence[int]): Input feature index. + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + + - 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + - 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + - None: Only one select feature map is allowed. + """ + + if input_transform is not None: + assert input_transform in ['resize_concat', 'multiple_select'] + self.input_transform = input_transform + self.in_index = in_index + if input_transform is not None: + assert isinstance(in_channels, (list, tuple)) + assert isinstance(in_index, (list, tuple)) + assert len(in_channels) == len(in_index) + if input_transform == 'resize_concat': + self.in_channels = sum(in_channels) + else: + self.in_channels = in_channels + else: + assert isinstance(in_channels, int) + assert isinstance(in_index, int) + self.in_channels = in_channels + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + + Args: + inputs (list[Tensor] | Tensor): multi-level img features. + + Returns: + Tensor: The transformed inputs + """ + if not isinstance(inputs, list): + return inputs + + if self.input_transform == 'resize_concat': + inputs = [inputs[i] for i in self.in_index] + upsampled_inputs = [ + resize( + input=x, + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) for x in inputs + ] + inputs = torch.cat(upsampled_inputs, dim=1) + elif self.input_transform == 'multiple_select': + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index] + + return inputs + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels, + num_groups): + """Make deconv layers.""" + if num_layers != len(num_filters): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_filters({len(num_filters)})' + raise ValueError(error_msg) + if num_layers != len(num_kernels): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_kernels({len(num_kernels)})' + raise ValueError(error_msg) + if num_layers != len(num_groups): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_groups({len(num_groups)})' + raise ValueError(error_msg) + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + groups = num_groups[i] + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=self.in_channels, + out_channels=planes, + kernel_size=kernel, + groups=groups, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False)) + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + self.in_channels = planes + + return nn.Sequential(*layers) + + def init_weights(self): + """Initialize model weights.""" + for _, m in self.deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for m in self.final_layer.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) diff --git 
a/mmpose/models/heads/voxelpose_head.py b/mmpose/models/heads/voxelpose_head.py new file mode 100644 index 0000000..8799bdc --- /dev/null +++ b/mmpose/models/heads/voxelpose_head.py @@ -0,0 +1,167 @@ +# ------------------------------------------------------------------------------ +# Copyright and License Information +# https://github.com/microsoft/voxelpose-pytorch/blob/main/lib/models +# Original Licence: MIT License +# ------------------------------------------------------------------------------ + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import HEADS + + +@HEADS.register_module() +class CuboidCenterHead(nn.Module): + """Get results from the 3D human center heatmap. In this module, human 3D + centers are local maximums obtained from the 3D heatmap via NMS (max- + pooling). + + Args: + space_size (list[3]): The size of the 3D space. + cube_size (list[3]): The size of the heatmap volume. + space_center (list[3]): The coordinate of space center. + max_num (int): Maximum of human center detections. + max_pool_kernel (int): Kernel size of the max-pool kernel in nms. + """ + + def __init__(self, + space_size, + space_center, + cube_size, + max_num=10, + max_pool_kernel=3): + super(CuboidCenterHead, self).__init__() + # use register_buffer + self.register_buffer('grid_size', torch.tensor(space_size)) + self.register_buffer('cube_size', torch.tensor(cube_size)) + self.register_buffer('grid_center', torch.tensor(space_center)) + + self.num_candidates = max_num + self.max_pool_kernel = max_pool_kernel + self.loss = nn.MSELoss() + + def _get_real_locations(self, indices): + """ + Args: + indices (torch.Tensor(NXP)): Indices of points in the 3D tensor + + Returns: + real_locations (torch.Tensor(NXPx3)): Locations of points + in the world coordinate system + """ + real_locations = indices.float() / ( + self.cube_size - 1) * self.grid_size + \ + self.grid_center - self.grid_size / 2.0 + return real_locations + + def _nms_by_max_pool(self, heatmap_volumes): + max_num = self.num_candidates + batch_size = heatmap_volumes.shape[0] + root_cubes_nms = self._max_pool(heatmap_volumes) + root_cubes_nms_reshape = root_cubes_nms.reshape(batch_size, -1) + topk_values, topk_index = root_cubes_nms_reshape.topk(max_num) + topk_unravel_index = self._get_3d_indices(topk_index, + heatmap_volumes[0].shape) + + return topk_values, topk_unravel_index + + def _max_pool(self, inputs): + kernel = self.max_pool_kernel + padding = (kernel - 1) // 2 + max = F.max_pool3d( + inputs, kernel_size=kernel, stride=1, padding=padding) + keep = (inputs == max).float() + return keep * inputs + + @staticmethod + def _get_3d_indices(indices, shape): + """Get indices in the 3-D tensor. + + Args: + indices (torch.Tensor(NXp)): Indices of points in the 1D tensor + shape (torch.Size(3)): The shape of the original 3D tensor + + Returns: + indices: Indices of points in the original 3D tensor + """ + batch_size = indices.shape[0] + num_people = indices.shape[1] + indices_x = (indices // + (shape[1] * shape[2])).reshape(batch_size, num_people, -1) + indices_y = ((indices % (shape[1] * shape[2])) // + shape[2]).reshape(batch_size, num_people, -1) + indices_z = (indices % shape[2]).reshape(batch_size, num_people, -1) + indices = torch.cat([indices_x, indices_y, indices_z], dim=2) + return indices + + def forward(self, heatmap_volumes): + """ + + Args: + heatmap_volumes (torch.Tensor(NXLXWXH)): + 3D human center heatmaps predicted by the network. 
+ Returns: + human_centers (torch.Tensor(NXPX5)): + Coordinates of human centers. + """ + batch_size = heatmap_volumes.shape[0] + + topk_values, topk_unravel_index = self._nms_by_max_pool( + heatmap_volumes.detach()) + + topk_unravel_index = self._get_real_locations(topk_unravel_index) + + human_centers = torch.zeros( + batch_size, self.num_candidates, 5, device=heatmap_volumes.device) + human_centers[:, :, 0:3] = topk_unravel_index + human_centers[:, :, 4] = topk_values + + return human_centers + + def get_loss(self, pred_cubes, gt): + + return dict(loss_center=self.loss(pred_cubes, gt)) + + +@HEADS.register_module() +class CuboidPoseHead(nn.Module): + + def __init__(self, beta): + """Get results from the 3D human pose heatmap. Instead of obtaining + maximums on the heatmap, this module regresses the coordinates of + keypoints via integral pose regression. Refer to `paper. + + ` for more details. + + Args: + beta: Constant to adjust the magnification of soft-maxed heatmap. + """ + super(CuboidPoseHead, self).__init__() + self.beta = beta + self.loss = nn.L1Loss() + + def forward(self, heatmap_volumes, grid_coordinates): + """ + + Args: + heatmap_volumes (torch.Tensor(NxKxLxWxH)): + 3D human pose heatmaps predicted by the network. + grid_coordinates (torch.Tensor(Nx(LxWxH)x3)): + Coordinates of the grids in the heatmap volumes. + Returns: + human_poses (torch.Tensor(NxKx3)): Coordinates of human poses. + """ + batch_size = heatmap_volumes.size(0) + channel = heatmap_volumes.size(1) + x = heatmap_volumes.reshape(batch_size, channel, -1, 1) + x = F.softmax(self.beta * x, dim=2) + grid_coordinates = grid_coordinates.unsqueeze(1) + x = torch.mul(x, grid_coordinates) + human_poses = torch.sum(x, dim=2) + + return human_poses + + def get_loss(self, preds, targets, weights): + + return dict(loss_pose=self.loss(preds * weights, targets * weights)) diff --git a/mmpose/models/losses/__init__.py b/mmpose/models/losses/__init__.py new file mode 100644 index 0000000..d67973f --- /dev/null +++ b/mmpose/models/losses/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .classfication_loss import BCELoss +from .heatmap_loss import AdaptiveWingLoss +from .mesh_loss import GANLoss, MeshLoss +from .mse_loss import JointsMSELoss, JointsOHKMMSELoss +from .multi_loss_factory import AELoss, HeatmapLoss, MultiLossFactory +from .regression_loss import (BoneLoss, L1Loss, MPJPELoss, MSELoss, + SemiSupervisionLoss, SmoothL1Loss, SoftWingLoss, + WingLoss) + +__all__ = [ + 'JointsMSELoss', 'JointsOHKMMSELoss', 'HeatmapLoss', 'AELoss', + 'MultiLossFactory', 'MeshLoss', 'GANLoss', 'SmoothL1Loss', 'WingLoss', + 'MPJPELoss', 'MSELoss', 'L1Loss', 'BCELoss', 'BoneLoss', + 'SemiSupervisionLoss', 'SoftWingLoss', 'AdaptiveWingLoss' +] diff --git a/mmpose/models/losses/classfication_loss.py b/mmpose/models/losses/classfication_loss.py new file mode 100644 index 0000000..b79b69d --- /dev/null +++ b/mmpose/models/losses/classfication_loss.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import LOSSES + + +@LOSSES.register_module() +class BCELoss(nn.Module): + """Binary Cross Entropy loss.""" + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.criterion = F.binary_cross_entropy + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight=None): + """Forward function. 
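+        The underlying criterion is ``F.binary_cross_entropy``, so ``output``
+        is expected to contain probabilities in ``[0, 1]`` rather than raw
+        logits.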
+ + Note: + - batch_size: N + - num_labels: K + + Args: + output (torch.Tensor[N, K]): Output classification. + target (torch.Tensor[N, K]): Target classification. + target_weight (torch.Tensor[N, K] or torch.Tensor[N]): + Weights across different labels. + """ + + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output, target, reduction='none') + if target_weight.dim() == 1: + target_weight = target_weight[:, None] + loss = (loss * target_weight).mean() + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight diff --git a/mmpose/models/losses/heatmap_loss.py b/mmpose/models/losses/heatmap_loss.py new file mode 100644 index 0000000..9471457 --- /dev/null +++ b/mmpose/models/losses/heatmap_loss.py @@ -0,0 +1,86 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from ..builder import LOSSES + + +@LOSSES.register_module() +class AdaptiveWingLoss(nn.Module): + """Adaptive wing loss. paper ref: 'Adaptive Wing Loss for Robust Face + Alignment via Heatmap Regression' Wang et al. ICCV'2019. + + Args: + alpha (float), omega (float), epsilon (float), theta (float) + are hyper-parameters. + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, + alpha=2.1, + omega=14, + epsilon=1, + theta=0.5, + use_target_weight=False, + loss_weight=1.): + super().__init__() + self.alpha = float(alpha) + self.omega = float(omega) + self.epsilon = float(epsilon) + self.theta = float(theta) + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def criterion(self, pred, target): + """Criterion of wingloss. + + Note: + batch_size: N + num_keypoints: K + + Args: + pred (torch.Tensor[NxKxHxW]): Predicted heatmaps. + target (torch.Tensor[NxKxHxW]): Target heatmaps. + """ + H, W = pred.shape[2:4] + delta = (target - pred).abs() + + A = self.omega * ( + 1 / (1 + torch.pow(self.theta / self.epsilon, self.alpha - target)) + ) * (self.alpha - target) * (torch.pow( + self.theta / self.epsilon, + self.alpha - target - 1)) * (1 / self.epsilon) + C = self.theta * A - self.omega * torch.log( + 1 + torch.pow(self.theta / self.epsilon, self.alpha - target)) + + losses = torch.where( + delta < self.theta, + self.omega * + torch.log(1 + + torch.pow(delta / self.epsilon, self.alpha - target)), + A * delta - C) + + return torch.mean(losses) + + def forward(self, output, target, target_weight): + """Forward function. + + Note: + batch_size: N + num_keypoints: K + + Args: + output (torch.Tensor[NxKxHxW]): Output heatmaps. + target (torch.Tensor[NxKxHxW]): Target heatmaps. + target_weight (torch.Tensor[NxKx1]): + Weights across different joint types. + """ + if self.use_target_weight: + loss = self.criterion(output * target_weight.unsqueeze(-1), + target * target_weight.unsqueeze(-1)) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight diff --git a/mmpose/models/losses/mesh_loss.py b/mmpose/models/losses/mesh_loss.py new file mode 100644 index 0000000..f9d18bd --- /dev/null +++ b/mmpose/models/losses/mesh_loss.py @@ -0,0 +1,340 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +import torch.nn as nn + +from ..builder import LOSSES +from ..utils.geometry import batch_rodrigues + + +def perspective_projection(points, rotation, translation, focal_length, + camera_center): + """This function computes the perspective projection of a set of 3D points. + + Note: + - batch size: B + - point number: N + + Args: + points (Tensor([B, N, 3])): A set of 3D points + rotation (Tensor([B, 3, 3])): Camera rotation matrix + translation (Tensor([B, 3])): Camera translation + focal_length (Tensor([B,])): Focal length + camera_center (Tensor([B, 2])): Camera center + + Returns: + projected_points (Tensor([B, N, 2])): Projected 2D + points in image space. + """ + + batch_size = points.shape[0] + K = torch.zeros([batch_size, 3, 3], device=points.device) + K[:, 0, 0] = focal_length + K[:, 1, 1] = focal_length + K[:, 2, 2] = 1. + K[:, :-1, -1] = camera_center + + # Transform points + points = torch.einsum('bij,bkj->bki', rotation, points) + points = points + translation.unsqueeze(1) + + # Apply perspective distortion + projected_points = points / points[:, :, -1].unsqueeze(-1) + + # Apply camera intrinsics + projected_points = torch.einsum('bij,bkj->bki', K, projected_points) + projected_points = projected_points[:, :, :-1] + return projected_points + + +@LOSSES.register_module() +class MeshLoss(nn.Module): + """Mix loss for 3D human mesh. It is composed of loss on 2D joints, 3D + joints, mesh vertices and smpl parameters (if any). + + Args: + joints_2d_loss_weight (float): Weight for loss on 2D joints. + joints_3d_loss_weight (float): Weight for loss on 3D joints. + vertex_loss_weight (float): Weight for loss on 3D verteices. + smpl_pose_loss_weight (float): Weight for loss on SMPL + pose parameters. + smpl_beta_loss_weight (float): Weight for loss on SMPL + shape parameters. + img_res (int): Input image resolution. + focal_length (float): Focal length of camera model. Default=5000. + """ + + def __init__(self, + joints_2d_loss_weight, + joints_3d_loss_weight, + vertex_loss_weight, + smpl_pose_loss_weight, + smpl_beta_loss_weight, + img_res, + focal_length=5000): + + super().__init__() + # Per-vertex loss on the mesh + self.criterion_vertex = nn.L1Loss(reduction='none') + + # Joints (2D and 3D) loss + self.criterion_joints_2d = nn.SmoothL1Loss(reduction='none') + self.criterion_joints_3d = nn.SmoothL1Loss(reduction='none') + + # Loss for SMPL parameter regression + self.criterion_regr = nn.MSELoss(reduction='none') + + self.joints_2d_loss_weight = joints_2d_loss_weight + self.joints_3d_loss_weight = joints_3d_loss_weight + self.vertex_loss_weight = vertex_loss_weight + self.smpl_pose_loss_weight = smpl_pose_loss_weight + self.smpl_beta_loss_weight = smpl_beta_loss_weight + self.focal_length = focal_length + self.img_res = img_res + + def joints_2d_loss(self, pred_joints_2d, gt_joints_2d, joints_2d_visible): + """Compute 2D reprojection loss on the joints. + + The loss is weighted by joints_2d_visible. + """ + conf = joints_2d_visible.float() + loss = (conf * + self.criterion_joints_2d(pred_joints_2d, gt_joints_2d)).mean() + return loss + + def joints_3d_loss(self, pred_joints_3d, gt_joints_3d, joints_3d_visible): + """Compute 3D joints loss for the examples that 3D joint annotations + are available. + + The loss is weighted by joints_3d_visible. 
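+        Both predictions and targets are root-centered by subtracting the
+        pelvis position (the midpoint of joints 2 and 3) before the loss is
+        computed.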
+ """ + conf = joints_3d_visible.float() + if len(gt_joints_3d) > 0: + gt_pelvis = (gt_joints_3d[:, 2, :] + gt_joints_3d[:, 3, :]) / 2 + gt_joints_3d = gt_joints_3d - gt_pelvis[:, None, :] + pred_pelvis = (pred_joints_3d[:, 2, :] + + pred_joints_3d[:, 3, :]) / 2 + pred_joints_3d = pred_joints_3d - pred_pelvis[:, None, :] + return ( + conf * + self.criterion_joints_3d(pred_joints_3d, gt_joints_3d)).mean() + return pred_joints_3d.sum() * 0 + + def vertex_loss(self, pred_vertices, gt_vertices, has_smpl): + """Compute 3D vertex loss for the examples that 3D human mesh + annotations are available. + + The loss is weighted by the has_smpl. + """ + conf = has_smpl.float() + loss_vertex = self.criterion_vertex(pred_vertices, gt_vertices) + loss_vertex = (conf[:, None, None] * loss_vertex).mean() + return loss_vertex + + def smpl_losses(self, pred_rotmat, pred_betas, gt_pose, gt_betas, + has_smpl): + """Compute SMPL parameters loss for the examples that SMPL parameter + annotations are available. + + The loss is weighted by has_smpl. + """ + conf = has_smpl.float() + gt_rotmat = batch_rodrigues(gt_pose.view(-1, 3)).view(-1, 24, 3, 3) + loss_regr_pose = self.criterion_regr(pred_rotmat, gt_rotmat) + loss_regr_betas = self.criterion_regr(pred_betas, gt_betas) + loss_regr_pose = (conf[:, None, None, None] * loss_regr_pose).mean() + loss_regr_betas = (conf[:, None] * loss_regr_betas).mean() + return loss_regr_pose, loss_regr_betas + + def project_points(self, points_3d, camera): + """Perform orthographic projection of 3D points using the camera + parameters, return projected 2D points in image plane. + + Note: + - batch size: B + - point number: N + + Args: + points_3d (Tensor([B, N, 3])): 3D points. + camera (Tensor([B, 3])): camera parameters with the + 3 channel as (scale, translation_x, translation_y) + + Returns: + Tensor([B, N, 2]): projected 2D points \ + in image space. + """ + batch_size = points_3d.shape[0] + device = points_3d.device + cam_t = torch.stack([ + camera[:, 1], camera[:, 2], 2 * self.focal_length / + (self.img_res * camera[:, 0] + 1e-9) + ], + dim=-1) + camera_center = camera.new_zeros([batch_size, 2]) + rot_t = torch.eye( + 3, device=device, + dtype=points_3d.dtype).unsqueeze(0).expand(batch_size, -1, -1) + joints_2d = perspective_projection( + points_3d, + rotation=rot_t, + translation=cam_t, + focal_length=self.focal_length, + camera_center=camera_center) + return joints_2d + + def forward(self, output, target): + """Forward function. + + Args: + output (dict): dict of network predicted results. + Keys: 'vertices', 'joints_3d', 'camera', + 'pose'(optional), 'beta'(optional) + target (dict): dict of ground-truth labels. + Keys: 'vertices', 'joints_3d', 'joints_3d_visible', + 'joints_2d', 'joints_2d_visible', 'pose', 'beta', + 'has_smpl' + + Returns: + dict: dict of losses. 
+ """ + losses = {} + + # Per-vertex loss for the shape + pred_vertices = output['vertices'] + + gt_vertices = target['vertices'] + has_smpl = target['has_smpl'] + loss_vertex = self.vertex_loss(pred_vertices, gt_vertices, has_smpl) + losses['vertex_loss'] = loss_vertex * self.vertex_loss_weight + + # Compute loss on SMPL parameters, if available + if 'pose' in output.keys() and 'beta' in output.keys(): + pred_rotmat = output['pose'] + pred_betas = output['beta'] + gt_pose = target['pose'] + gt_betas = target['beta'] + loss_regr_pose, loss_regr_betas = self.smpl_losses( + pred_rotmat, pred_betas, gt_pose, gt_betas, has_smpl) + losses['smpl_pose_loss'] = \ + loss_regr_pose * self.smpl_pose_loss_weight + losses['smpl_beta_loss'] = \ + loss_regr_betas * self.smpl_beta_loss_weight + + # Compute 3D joints loss + pred_joints_3d = output['joints_3d'] + gt_joints_3d = target['joints_3d'] + joints_3d_visible = target['joints_3d_visible'] + loss_joints_3d = self.joints_3d_loss(pred_joints_3d, gt_joints_3d, + joints_3d_visible) + losses['joints_3d_loss'] = loss_joints_3d * self.joints_3d_loss_weight + + # Compute 2D reprojection loss for the 2D joints + pred_camera = output['camera'] + gt_joints_2d = target['joints_2d'] + joints_2d_visible = target['joints_2d_visible'] + pred_joints_2d = self.project_points(pred_joints_3d, pred_camera) + + # Normalize keypoints to [-1,1] + # The coordinate origin of pred_joints_2d is + # the center of the input image. + pred_joints_2d = 2 * pred_joints_2d / (self.img_res - 1) + # The coordinate origin of gt_joints_2d is + # the top left corner of the input image. + gt_joints_2d = 2 * gt_joints_2d / (self.img_res - 1) - 1 + loss_joints_2d = self.joints_2d_loss(pred_joints_2d, gt_joints_2d, + joints_2d_visible) + losses['joints_2d_loss'] = loss_joints_2d * self.joints_2d_loss_weight + + return losses + + +@LOSSES.register_module() +class GANLoss(nn.Module): + """Define GAN loss. + + Args: + gan_type (str): Support 'vanilla', 'lsgan', 'wgan', 'hinge'. + real_label_val (float): The value for real label. Default: 1.0. + fake_label_val (float): The value for fake label. Default: 0.0. + loss_weight (float): Loss weight. Default: 1.0. + Note that loss_weight is only for generators; and it is always 1.0 + for discriminators. + """ + + def __init__(self, + gan_type, + real_label_val=1.0, + fake_label_val=0.0, + loss_weight=1.0): + super().__init__() + self.gan_type = gan_type + self.loss_weight = loss_weight + self.real_label_val = real_label_val + self.fake_label_val = fake_label_val + + if self.gan_type == 'vanilla': + self.loss = nn.BCEWithLogitsLoss() + elif self.gan_type == 'lsgan': + self.loss = nn.MSELoss() + elif self.gan_type == 'wgan': + self.loss = self._wgan_loss + elif self.gan_type == 'hinge': + self.loss = nn.ReLU() + else: + raise NotImplementedError( + f'GAN type {self.gan_type} is not implemented.') + + @staticmethod + def _wgan_loss(input, target): + """wgan loss. + + Args: + input (Tensor): Input tensor. + target (bool): Target label. + + Returns: + Tensor: wgan loss. + """ + return -input.mean() if target else input.mean() + + def get_target_label(self, input, target_is_real): + """Get target label. + + Args: + input (Tensor): Input tensor. + target_is_real (bool): Whether the target is real or fake. + + Returns: + (bool | Tensor): Target tensor. Return bool for wgan, \ + otherwise, return Tensor. 
+ """ + + if self.gan_type == 'wgan': + return target_is_real + target_val = ( + self.real_label_val if target_is_real else self.fake_label_val) + return input.new_ones(input.size()) * target_val + + def forward(self, input, target_is_real, is_disc=False): + """ + Args: + input (Tensor): The input for the loss module, i.e., the network + prediction. + target_is_real (bool): Whether the targe is real or fake. + is_disc (bool): Whether the loss for discriminators or not. + Default: False. + + Returns: + Tensor: GAN loss value. + """ + target_label = self.get_target_label(input, target_is_real) + if self.gan_type == 'hinge': + if is_disc: # for discriminators in hinge-gan + input = -input if target_is_real else input + loss = self.loss(1 + input).mean() + else: # for generators in hinge-gan + loss = -input.mean() + else: # other gan types + loss = self.loss(input, target_label) + + # loss_weight is always 1.0 for discriminators + return loss if is_disc else loss * self.loss_weight diff --git a/mmpose/models/losses/mse_loss.py b/mmpose/models/losses/mse_loss.py new file mode 100644 index 0000000..f972efa --- /dev/null +++ b/mmpose/models/losses/mse_loss.py @@ -0,0 +1,153 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from ..builder import LOSSES + + +@LOSSES.register_module() +class JointsMSELoss(nn.Module): + """MSE loss for heatmaps. + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.criterion = nn.MSELoss() + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight): + """Forward function.""" + batch_size = output.size(0) + num_joints = output.size(1) + + heatmaps_pred = output.reshape( + (batch_size, num_joints, -1)).split(1, 1) + heatmaps_gt = target.reshape((batch_size, num_joints, -1)).split(1, 1) + + loss = 0. + + for idx in range(num_joints): + heatmap_pred = heatmaps_pred[idx].squeeze(1) + heatmap_gt = heatmaps_gt[idx].squeeze(1) + if self.use_target_weight: + loss += self.criterion(heatmap_pred * target_weight[:, idx], + heatmap_gt * target_weight[:, idx]) + else: + loss += self.criterion(heatmap_pred, heatmap_gt) + + return loss / num_joints * self.loss_weight + + +@LOSSES.register_module() +class CombinedTargetMSELoss(nn.Module): + """MSE loss for combined target. + CombinedTarget: The combination of classification target + (response map) and regression target (offset map). + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, use_target_weight, loss_weight=1.): + super().__init__() + self.criterion = nn.MSELoss(reduction='mean') + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight): + batch_size = output.size(0) + num_channels = output.size(1) + heatmaps_pred = output.reshape( + (batch_size, num_channels, -1)).split(1, 1) + heatmaps_gt = target.reshape( + (batch_size, num_channels, -1)).split(1, 1) + loss = 0. 
+ num_joints = num_channels // 3 + for idx in range(num_joints): + heatmap_pred = heatmaps_pred[idx * 3].squeeze() + heatmap_gt = heatmaps_gt[idx * 3].squeeze() + offset_x_pred = heatmaps_pred[idx * 3 + 1].squeeze() + offset_x_gt = heatmaps_gt[idx * 3 + 1].squeeze() + offset_y_pred = heatmaps_pred[idx * 3 + 2].squeeze() + offset_y_gt = heatmaps_gt[idx * 3 + 2].squeeze() + if self.use_target_weight: + heatmap_pred = heatmap_pred * target_weight[:, idx] + heatmap_gt = heatmap_gt * target_weight[:, idx] + # classification loss + loss += 0.5 * self.criterion(heatmap_pred, heatmap_gt) + # regression loss + loss += 0.5 * self.criterion(heatmap_gt * offset_x_pred, + heatmap_gt * offset_x_gt) + loss += 0.5 * self.criterion(heatmap_gt * offset_y_pred, + heatmap_gt * offset_y_gt) + return loss / num_joints * self.loss_weight + + +@LOSSES.register_module() +class JointsOHKMMSELoss(nn.Module): + """MSE loss with online hard keypoint mining. + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + topk (int): Only top k joint losses are kept. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, use_target_weight=False, topk=8, loss_weight=1.): + super().__init__() + assert topk > 0 + self.criterion = nn.MSELoss(reduction='none') + self.use_target_weight = use_target_weight + self.topk = topk + self.loss_weight = loss_weight + + def _ohkm(self, loss): + """Online hard keypoint mining.""" + ohkm_loss = 0. + N = len(loss) + for i in range(N): + sub_loss = loss[i] + _, topk_idx = torch.topk( + sub_loss, k=self.topk, dim=0, sorted=False) + tmp_loss = torch.gather(sub_loss, 0, topk_idx) + ohkm_loss += torch.sum(tmp_loss) / self.topk + ohkm_loss /= N + return ohkm_loss + + def forward(self, output, target, target_weight): + """Forward function.""" + batch_size = output.size(0) + num_joints = output.size(1) + if num_joints < self.topk: + raise ValueError(f'topk ({self.topk}) should not ' + f'larger than num_joints ({num_joints}).') + heatmaps_pred = output.reshape( + (batch_size, num_joints, -1)).split(1, 1) + heatmaps_gt = target.reshape((batch_size, num_joints, -1)).split(1, 1) + + losses = [] + for idx in range(num_joints): + heatmap_pred = heatmaps_pred[idx].squeeze(1) + heatmap_gt = heatmaps_gt[idx].squeeze(1) + if self.use_target_weight: + losses.append( + self.criterion(heatmap_pred * target_weight[:, idx], + heatmap_gt * target_weight[:, idx])) + else: + losses.append(self.criterion(heatmap_pred, heatmap_gt)) + + losses = [loss.mean(dim=1).unsqueeze(dim=1) for loss in losses] + losses = torch.cat(losses, dim=1) + + return self._ohkm(losses) * self.loss_weight diff --git a/mmpose/models/losses/multi_loss_factory.py b/mmpose/models/losses/multi_loss_factory.py new file mode 100644 index 0000000..65f90a7 --- /dev/null +++ b/mmpose/models/losses/multi_loss_factory.py @@ -0,0 +1,281 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation +# Original licence: Copyright (c) Microsoft, under the MIT License. +# ------------------------------------------------------------------------------ + +import torch +import torch.nn as nn + +from ..builder import LOSSES + + +def _make_input(t, requires_grad=False, device=torch.device('cpu')): + """Make zero inputs for AE loss. + + Args: + t (torch.Tensor): input + requires_grad (bool): Option to use requires_grad. 
+ device: torch device + + Returns: + torch.Tensor: zero input. + """ + inp = torch.autograd.Variable(t, requires_grad=requires_grad) + inp = inp.sum() + inp = inp.to(device) + return inp + + +@LOSSES.register_module() +class HeatmapLoss(nn.Module): + """Accumulate the heatmap loss for each image in the batch. + + Args: + supervise_empty (bool): Whether to supervise empty channels. + """ + + def __init__(self, supervise_empty=True): + super().__init__() + self.supervise_empty = supervise_empty + + def forward(self, pred, gt, mask): + """Forward function. + + Note: + - batch_size: N + - heatmaps weight: W + - heatmaps height: H + - max_num_people: M + - num_keypoints: K + + Args: + pred (torch.Tensor[N,K,H,W]):heatmap of output. + gt (torch.Tensor[N,K,H,W]): target heatmap. + mask (torch.Tensor[N,H,W]): mask of target. + """ + assert pred.size() == gt.size( + ), f'pred.size() is {pred.size()}, gt.size() is {gt.size()}' + + if not self.supervise_empty: + empty_mask = (gt.sum(dim=[2, 3], keepdim=True) > 0).float() + loss = ((pred - gt)**2) * empty_mask.expand_as( + pred) * mask[:, None, :, :].expand_as(pred) + else: + loss = ((pred - gt)**2) * mask[:, None, :, :].expand_as(pred) + loss = loss.mean(dim=3).mean(dim=2).mean(dim=1) + return loss + + +@LOSSES.register_module() +class AELoss(nn.Module): + """Associative Embedding loss. + + `Associative Embedding: End-to-End Learning for Joint Detection and + Grouping `_. + """ + + def __init__(self, loss_type): + super().__init__() + self.loss_type = loss_type + + def singleTagLoss(self, pred_tag, joints): + """Associative embedding loss for one image. + + Note: + - heatmaps weight: W + - heatmaps height: H + - max_num_people: M + - num_keypoints: K + + Args: + pred_tag (torch.Tensor[KxHxW,1]): tag of output for one image. + joints (torch.Tensor[M,K,2]): joints information for one image. + """ + tags = [] + pull = 0 + for joints_per_person in joints: + tmp = [] + for joint in joints_per_person: + if joint[1] > 0: + tmp.append(pred_tag[joint[0]]) + if len(tmp) == 0: + continue + tmp = torch.stack(tmp) + tags.append(torch.mean(tmp, dim=0)) + pull = pull + torch.mean((tmp - tags[-1].expand_as(tmp))**2) + + num_tags = len(tags) + if num_tags == 0: + return ( + _make_input(torch.zeros(1).float(), device=pred_tag.device), + _make_input(torch.zeros(1).float(), device=pred_tag.device)) + elif num_tags == 1: + return (_make_input( + torch.zeros(1).float(), device=pred_tag.device), pull) + + tags = torch.stack(tags) + + size = (num_tags, num_tags) + A = tags.expand(*size) + B = A.permute(1, 0) + + diff = A - B + + if self.loss_type == 'exp': + diff = torch.pow(diff, 2) + push = torch.exp(-diff) + push = torch.sum(push) - num_tags + elif self.loss_type == 'max': + diff = 1 - torch.abs(diff) + push = torch.clamp(diff, min=0).sum() - num_tags + else: + raise ValueError('Unknown ae loss type') + + push_loss = push / ((num_tags - 1) * num_tags) * 0.5 + pull_loss = pull / (num_tags) + + return push_loss, pull_loss + + def forward(self, tags, joints): + """Accumulate the tag loss for each image in the batch. + + Note: + - batch_size: N + - heatmaps weight: W + - heatmaps height: H + - max_num_people: M + - num_keypoints: K + + Args: + tags (torch.Tensor[N,KxHxW,1]): tag channels of output. + joints (torch.Tensor[N,M,K,2]): joints information. 
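+
+        Returns:
+            tuple[torch.Tensor, torch.Tensor]: the stacked per-image push
+            losses and pull losses.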
+ """ + pushes, pulls = [], [] + joints = joints.cpu().data.numpy() + batch_size = tags.size(0) + for i in range(batch_size): + push, pull = self.singleTagLoss(tags[i], joints[i]) + pushes.append(push) + pulls.append(pull) + return torch.stack(pushes), torch.stack(pulls) + + +@LOSSES.register_module() +class MultiLossFactory(nn.Module): + """Loss for bottom-up models. + + Args: + num_joints (int): Number of keypoints. + num_stages (int): Number of stages. + ae_loss_type (str): Type of ae loss. + with_ae_loss (list[bool]): Use ae loss or not in multi-heatmap. + push_loss_factor (list[float]): + Parameter of push loss in multi-heatmap. + pull_loss_factor (list[float]): + Parameter of pull loss in multi-heatmap. + with_heatmap_loss (list[bool]): + Use heatmap loss or not in multi-heatmap. + heatmaps_loss_factor (list[float]): + Parameter of heatmap loss in multi-heatmap. + supervise_empty (bool): Whether to supervise empty channels. + """ + + def __init__(self, + num_joints, + num_stages, + ae_loss_type, + with_ae_loss, + push_loss_factor, + pull_loss_factor, + with_heatmaps_loss, + heatmaps_loss_factor, + supervise_empty=True): + super().__init__() + + assert isinstance(with_heatmaps_loss, (list, tuple)), \ + 'with_heatmaps_loss should be a list or tuple' + assert isinstance(heatmaps_loss_factor, (list, tuple)), \ + 'heatmaps_loss_factor should be a list or tuple' + assert isinstance(with_ae_loss, (list, tuple)), \ + 'with_ae_loss should be a list or tuple' + assert isinstance(push_loss_factor, (list, tuple)), \ + 'push_loss_factor should be a list or tuple' + assert isinstance(pull_loss_factor, (list, tuple)), \ + 'pull_loss_factor should be a list or tuple' + + self.num_joints = num_joints + self.num_stages = num_stages + self.ae_loss_type = ae_loss_type + self.with_ae_loss = with_ae_loss + self.push_loss_factor = push_loss_factor + self.pull_loss_factor = pull_loss_factor + self.with_heatmaps_loss = with_heatmaps_loss + self.heatmaps_loss_factor = heatmaps_loss_factor + + self.heatmaps_loss = \ + nn.ModuleList( + [ + HeatmapLoss(supervise_empty) + if with_heatmaps_loss else None + for with_heatmaps_loss in self.with_heatmaps_loss + ] + ) + + self.ae_loss = \ + nn.ModuleList( + [ + AELoss(self.ae_loss_type) if with_ae_loss else None + for with_ae_loss in self.with_ae_loss + ] + ) + + def forward(self, outputs, heatmaps, masks, joints): + """Forward function to calculate losses. + + Note: + - batch_size: N + - heatmaps weight: W + - heatmaps height: H + - max_num_people: M + - num_keypoints: K + - output_channel: C C=2K if use ae loss else K + + Args: + outputs (list(torch.Tensor[N,C,H,W])): outputs of stages. + heatmaps (list(torch.Tensor[N,K,H,W])): target of heatmaps. + masks (list(torch.Tensor[N,H,W])): masks of heatmaps. + joints (list(torch.Tensor[N,M,K,2])): joints of ae loss. 
+ """ + heatmaps_losses = [] + push_losses = [] + pull_losses = [] + for idx in range(len(outputs)): + offset_feat = 0 + if self.heatmaps_loss[idx]: + heatmaps_pred = outputs[idx][:, :self.num_joints] + offset_feat = self.num_joints + heatmaps_loss = self.heatmaps_loss[idx](heatmaps_pred, + heatmaps[idx], + masks[idx]) + heatmaps_loss = heatmaps_loss * self.heatmaps_loss_factor[idx] + heatmaps_losses.append(heatmaps_loss) + else: + heatmaps_losses.append(None) + + if self.ae_loss[idx]: + tags_pred = outputs[idx][:, offset_feat:] + batch_size = tags_pred.size()[0] + tags_pred = tags_pred.contiguous().view(batch_size, -1, 1) + + push_loss, pull_loss = self.ae_loss[idx](tags_pred, + joints[idx]) + push_loss = push_loss * self.push_loss_factor[idx] + pull_loss = pull_loss * self.pull_loss_factor[idx] + + push_losses.append(push_loss) + pull_losses.append(pull_loss) + else: + push_losses.append(None) + pull_losses.append(None) + + return heatmaps_losses, push_losses, pull_losses diff --git a/mmpose/models/losses/regression_loss.py b/mmpose/models/losses/regression_loss.py new file mode 100644 index 0000000..db41783 --- /dev/null +++ b/mmpose/models/losses/regression_loss.py @@ -0,0 +1,448 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import LOSSES + + +@LOSSES.register_module() +class SmoothL1Loss(nn.Module): + """SmoothL1Loss loss. + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.criterion = F.smooth_l1_loss + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N, K, D]): + Weights across different joint types. + """ + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output * target_weight, + target * target_weight) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class WingLoss(nn.Module): + """Wing Loss. paper ref: 'Wing Loss for Robust Facial Landmark Localisation + with Convolutional Neural Networks' Feng et al. CVPR'2018. + + Args: + omega (float): Also referred to as width. + epsilon (float): Also referred to as curvature. + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, + omega=10.0, + epsilon=2.0, + use_target_weight=False, + loss_weight=1.): + super().__init__() + self.omega = omega + self.epsilon = epsilon + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + # constant that smoothly links the piecewise-defined linear + # and nonlinear parts + self.C = self.omega * (1.0 - math.log(1.0 + self.omega / self.epsilon)) + + def criterion(self, pred, target): + """Criterion of wingloss. 
+ + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + pred (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + """ + delta = (target - pred).abs() + losses = torch.where( + delta < self.omega, + self.omega * torch.log(1.0 + delta / self.epsilon), delta - self.C) + return torch.mean(torch.sum(losses, dim=[1, 2]), dim=0) + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N,K,D]): + Weights across different joint types. + """ + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output * target_weight, + target * target_weight) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class SoftWingLoss(nn.Module): + """Soft Wing Loss 'Structure-Coherent Deep Feature Learning for Robust Face + Alignment' Lin et al. TIP'2021. + + loss = + 1. |x| , if |x| < omega1 + 2. omega2*ln(1+|x|/epsilon) + B, if |x| >= omega1 + + Args: + omega1 (float): The first threshold. + omega2 (float): The second threshold. + epsilon (float): Also referred to as curvature. + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, + omega1=2.0, + omega2=20.0, + epsilon=0.5, + use_target_weight=False, + loss_weight=1.): + super().__init__() + self.omega1 = omega1 + self.omega2 = omega2 + self.epsilon = epsilon + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + # constant that smoothly links the piecewise-defined linear + # and nonlinear parts + self.B = self.omega1 - self.omega2 * math.log(1.0 + self.omega1 / + self.epsilon) + + def criterion(self, pred, target): + """Criterion of wingloss. + + Note: + batch_size: N + num_keypoints: K + dimension of keypoints: D (D=2 or D=3) + + Args: + pred (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + """ + delta = (target - pred).abs() + losses = torch.where( + delta < self.omega1, delta, + self.omega2 * torch.log(1.0 + delta / self.epsilon) + self.B) + return torch.mean(torch.sum(losses, dim=[1, 2]), dim=0) + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + batch_size: N + num_keypoints: K + dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N, K, D]): + Weights across different joint types. + """ + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output * target_weight, + target * target_weight) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class MPJPELoss(nn.Module): + """MPJPE (Mean Per Joint Position Error) loss. + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. 
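+
+    Example (a minimal usage sketch; shapes are chosen for illustration
+    only)::
+
+        >>> import torch
+        >>> loss = MPJPELoss()
+        >>> output = torch.rand(1, 17, 3)
+        >>> target = torch.rand(1, 17, 3)
+        >>> value = loss(output, target)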
+ """ + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N,K,D]): + Weights across different joint types. + """ + + if self.use_target_weight: + assert target_weight is not None + loss = torch.mean( + torch.norm((output - target) * target_weight, dim=-1)) + else: + loss = torch.mean(torch.norm(output - target, dim=-1)) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class L1Loss(nn.Module): + """L1Loss loss .""" + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.criterion = F.l1_loss + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + output (torch.Tensor[N, K, 2]): Output regression. + target (torch.Tensor[N, K, 2]): Target regression. + target_weight (torch.Tensor[N, K, 2]): + Weights across different joint types. + """ + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output * target_weight, + target * target_weight) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class MSELoss(nn.Module): + """MSE loss for coordinate regression.""" + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.criterion = F.mse_loss + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + output (torch.Tensor[N, K, 2]): Output regression. + target (torch.Tensor[N, K, 2]): Target regression. + target_weight (torch.Tensor[N, K, 2]): + Weights across different joint types. + """ + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output * target_weight, + target * target_weight) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class BoneLoss(nn.Module): + """Bone length loss. + + Args: + joint_parents (list): Indices of each joint's parent joint. + use_target_weight (bool): Option to use weighted bone loss. + Different bone types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, joint_parents, use_target_weight=False, loss_weight=1.): + super().__init__() + self.joint_parents = joint_parents + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + self.non_root_indices = [] + for i in range(len(self.joint_parents)): + if i != self.joint_parents[i]: + self.non_root_indices.append(i) + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N, K-1]): + Weights across different bone types. 
+ """ + output_bone = torch.norm( + output - output[:, self.joint_parents, :], + dim=-1)[:, self.non_root_indices] + target_bone = torch.norm( + target - target[:, self.joint_parents, :], + dim=-1)[:, self.non_root_indices] + if self.use_target_weight: + assert target_weight is not None + loss = torch.mean( + torch.abs((output_bone * target_weight).mean(dim=0) - + (target_bone * target_weight).mean(dim=0))) + else: + loss = torch.mean( + torch.abs(output_bone.mean(dim=0) - target_bone.mean(dim=0))) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class SemiSupervisionLoss(nn.Module): + """Semi-supervision loss for unlabeled data. It is composed of projection + loss and bone loss. + + Paper ref: `3D human pose estimation in video with temporal convolutions + and semi-supervised training` Dario Pavllo et al. CVPR'2019. + + Args: + joint_parents (list): Indices of each joint's parent joint. + projection_loss_weight (float): Weight for projection loss. + bone_loss_weight (float): Weight for bone loss. + warmup_iterations (int): Number of warmup iterations. In the first + `warmup_iterations` iterations, the model is trained only on + labeled data, and semi-supervision loss will be 0. + This is a workaround since currently we cannot access + epoch number in loss functions. Note that the iteration number in + an epoch can be changed due to different GPU numbers in multi-GPU + settings. So please set this parameter carefully. + warmup_iterations = dataset_size // samples_per_gpu // gpu_num + * warmup_epochs + """ + + def __init__(self, + joint_parents, + projection_loss_weight=1., + bone_loss_weight=1., + warmup_iterations=0): + super().__init__() + self.criterion_projection = MPJPELoss( + loss_weight=projection_loss_weight) + self.criterion_bone = BoneLoss( + joint_parents, loss_weight=bone_loss_weight) + self.warmup_iterations = warmup_iterations + self.num_iterations = 0 + + @staticmethod + def project_joints(x, intrinsics): + """Project 3D joint coordinates to 2D image plane using camera + intrinsic parameters. + + Args: + x (torch.Tensor[N, K, 3]): 3D joint coordinates. + intrinsics (torch.Tensor[N, 4] | torch.Tensor[N, 9]): Camera + intrinsics: f (2), c (2), k (3), p (2). 
+ """ + while intrinsics.dim() < x.dim(): + intrinsics.unsqueeze_(1) + f = intrinsics[..., :2] + c = intrinsics[..., 2:4] + _x = torch.clamp(x[:, :, :2] / x[:, :, 2:], -1, 1) + if intrinsics.shape[-1] == 9: + k = intrinsics[..., 4:7] + p = intrinsics[..., 7:9] + + r2 = torch.sum(_x[:, :, :2]**2, dim=-1, keepdim=True) + radial = 1 + torch.sum( + k * torch.cat((r2, r2**2, r2**3), dim=-1), + dim=-1, + keepdim=True) + tan = torch.sum(p * _x, dim=-1, keepdim=True) + _x = _x * (radial + tan) + p * r2 + _x = f * _x + c + return _x + + def forward(self, output, target): + losses = dict() + + self.num_iterations += 1 + if self.num_iterations <= self.warmup_iterations: + return losses + + labeled_pose = output['labeled_pose'] + unlabeled_pose = output['unlabeled_pose'] + unlabeled_traj = output['unlabeled_traj'] + unlabeled_target_2d = target['unlabeled_target_2d'] + intrinsics = target['intrinsics'] + + # projection loss + unlabeled_output = unlabeled_pose + unlabeled_traj + unlabeled_output_2d = self.project_joints(unlabeled_output, intrinsics) + loss_proj = self.criterion_projection(unlabeled_output_2d, + unlabeled_target_2d, None) + losses['proj_loss'] = loss_proj + + # bone loss + loss_bone = self.criterion_bone(unlabeled_pose, labeled_pose, None) + losses['bone_loss'] = loss_bone + + return losses diff --git a/mmpose/models/misc/__init__.py b/mmpose/models/misc/__init__.py new file mode 100644 index 0000000..ef101fe --- /dev/null +++ b/mmpose/models/misc/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/mmpose/models/misc/discriminator.py b/mmpose/models/misc/discriminator.py new file mode 100644 index 0000000..712f0a8 --- /dev/null +++ b/mmpose/models/misc/discriminator.py @@ -0,0 +1,307 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/akanazawa/hmr +# Original licence: Copyright (c) 2018 akanazawa, under the MIT License. +# ------------------------------------------------------------------------------ + +from abc import abstractmethod + +import torch +import torch.nn as nn +from mmcv.cnn import normal_init, xavier_init + +from mmpose.models.utils.geometry import batch_rodrigues + + +class BaseDiscriminator(nn.Module): + """Base linear module for SMPL parameter discriminator. 
+ + Args: + fc_layers (Tuple): Tuple of neuron count, + such as (9, 32, 32, 1) + use_dropout (Tuple): Tuple of bool define use dropout or not + for each layer, such as (True, True, False) + drop_prob (Tuple): Tuple of float defined the drop prob, + such as (0.5, 0.5, 0) + use_activation(Tuple): Tuple of bool define use active function + or not, such as (True, True, False) + """ + + def __init__(self, fc_layers, use_dropout, drop_prob, use_activation): + super().__init__() + self.fc_layers = fc_layers + self.use_dropout = use_dropout + self.drop_prob = drop_prob + self.use_activation = use_activation + self._check() + self.create_layers() + + def _check(self): + """Check input to avoid ValueError.""" + if not isinstance(self.fc_layers, tuple): + raise TypeError(f'fc_layers require tuple, ' + f'get {type(self.fc_layers)}') + + if not isinstance(self.use_dropout, tuple): + raise TypeError(f'use_dropout require tuple, ' + f'get {type(self.use_dropout)}') + + if not isinstance(self.drop_prob, tuple): + raise TypeError(f'drop_prob require tuple, ' + f'get {type(self.drop_prob)}') + + if not isinstance(self.use_activation, tuple): + raise TypeError(f'use_activation require tuple, ' + f'get {type(self.use_activation)}') + + l_fc_layer = len(self.fc_layers) + l_use_drop = len(self.use_dropout) + l_drop_prob = len(self.drop_prob) + l_use_activation = len(self.use_activation) + + pass_check = ( + l_fc_layer >= 2 and l_use_drop < l_fc_layer + and l_drop_prob < l_fc_layer and l_use_activation < l_fc_layer + and l_drop_prob == l_use_drop) + + if not pass_check: + msg = 'Wrong BaseDiscriminator parameters!' + raise ValueError(msg) + + def create_layers(self): + """Create layers.""" + l_fc_layer = len(self.fc_layers) + l_use_drop = len(self.use_dropout) + l_use_activation = len(self.use_activation) + + self.fc_blocks = nn.Sequential() + + for i in range(l_fc_layer - 1): + self.fc_blocks.add_module( + name=f'regressor_fc_{i}', + module=nn.Linear( + in_features=self.fc_layers[i], + out_features=self.fc_layers[i + 1])) + + if i < l_use_activation and self.use_activation[i]: + self.fc_blocks.add_module( + name=f'regressor_af_{i}', module=nn.ReLU()) + + if i < l_use_drop and self.use_dropout[i]: + self.fc_blocks.add_module( + name=f'regressor_fc_dropout_{i}', + module=nn.Dropout(p=self.drop_prob[i])) + + @abstractmethod + def forward(self, inputs): + """Forward function.""" + msg = 'the base class [BaseDiscriminator] is not callable!' 
+ raise NotImplementedError(msg) + + def init_weights(self): + """Initialize model weights.""" + for m in self.fc_blocks.named_modules(): + if isinstance(m, nn.Linear): + xavier_init(m, gain=0.01) + + +class ShapeDiscriminator(BaseDiscriminator): + """Discriminator for SMPL shape parameters, the inputs is (batch_size x 10) + + Args: + fc_layers (Tuple): Tuple of neuron count, such as (10, 5, 1) + use_dropout (Tuple): Tuple of bool define use dropout or + not for each layer, such as (True, True, False) + drop_prob (Tuple): Tuple of float defined the drop prob, + such as (0.5, 0) + use_activation(Tuple): Tuple of bool define use active + function or not, such as (True, False) + """ + + def __init__(self, fc_layers, use_dropout, drop_prob, use_activation): + if fc_layers[-1] != 1: + msg = f'the neuron count of the last layer ' \ + f'must be 1, but got {fc_layers[-1]}' + raise ValueError(msg) + + super().__init__(fc_layers, use_dropout, drop_prob, use_activation) + + def forward(self, inputs): + """Forward function.""" + return self.fc_blocks(inputs) + + +class PoseDiscriminator(nn.Module): + """Discriminator for SMPL pose parameters of each joint. It is composed of + discriminators for each joints. The inputs is (batch_size x joint_count x + 9) + + Args: + channels (Tuple): Tuple of channel number, + such as (9, 32, 32, 1) + joint_count (int): Joint number, such as 23 + """ + + def __init__(self, channels, joint_count): + super().__init__() + if channels[-1] != 1: + msg = f'the neuron count of the last layer ' \ + f'must be 1, but got {channels[-1]}' + raise ValueError(msg) + self.joint_count = joint_count + + self.conv_blocks = nn.Sequential() + len_channels = len(channels) + for idx in range(len_channels - 2): + self.conv_blocks.add_module( + name=f'conv_{idx}', + module=nn.Conv2d( + in_channels=channels[idx], + out_channels=channels[idx + 1], + kernel_size=1, + stride=1)) + + self.fc_layer = nn.ModuleList() + for idx in range(joint_count): + self.fc_layer.append( + nn.Linear( + in_features=channels[len_channels - 2], out_features=1)) + + def forward(self, inputs): + """Forward function. + + The input is (batch_size x joint_count x 9). + """ + # shape: batch_size x 9 x 1 x joint_count + inputs = inputs.transpose(1, 2).unsqueeze(2).contiguous() + # shape: batch_size x c x 1 x joint_count + internal_outputs = self.conv_blocks(inputs) + outputs = [] + for idx in range(self.joint_count): + outputs.append(self.fc_layer[idx](internal_outputs[:, :, 0, idx])) + + return torch.cat(outputs, 1), internal_outputs + + def init_weights(self): + """Initialize model weights.""" + for m in self.conv_blocks: + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) + for m in self.fc_layer.named_modules(): + if isinstance(m, nn.Linear): + xavier_init(m, gain=0.01) + + +class FullPoseDiscriminator(BaseDiscriminator): + """Discriminator for SMPL pose parameters of all joints. 
+ + Args: + fc_layers (Tuple): Tuple of neuron count, + such as (736, 1024, 1024, 1) + use_dropout (Tuple): Tuple of bool define use dropout or not + for each layer, such as (True, True, False) + drop_prob (Tuple): Tuple of float defined the drop prob, + such as (0.5, 0.5, 0) + use_activation(Tuple): Tuple of bool define use active + function or not, such as (True, True, False) + """ + + def __init__(self, fc_layers, use_dropout, drop_prob, use_activation): + if fc_layers[-1] != 1: + msg = f'the neuron count of the last layer must be 1,' \ + f' but got {fc_layers[-1]}' + raise ValueError(msg) + + super().__init__(fc_layers, use_dropout, drop_prob, use_activation) + + def forward(self, inputs): + """Forward function.""" + return self.fc_blocks(inputs) + + +class SMPLDiscriminator(nn.Module): + """Discriminator for SMPL pose and shape parameters. It is composed of a + discriminator for SMPL shape parameters, a discriminator for SMPL pose + parameters of all joints and a discriminator for SMPL pose parameters of + each joint. + + Args: + beta_channel (tuple of int): Tuple of neuron count of the + discriminator of shape parameters. Defaults to (10, 5, 1) + per_joint_channel (tuple of int): Tuple of neuron count of the + discriminator of each joint. Defaults to (9, 32, 32, 1) + full_pose_channel (tuple of int): Tuple of neuron count of the + discriminator of full pose. Defaults to (23*32, 1024, 1024, 1) + """ + + def __init__(self, + beta_channel=(10, 5, 1), + per_joint_channel=(9, 32, 32, 1), + full_pose_channel=(23 * 32, 1024, 1024, 1)): + super().__init__() + self.joint_count = 23 + # The count of SMPL shape parameter is 10. + assert beta_channel[0] == 10 + # Use 3 x 3 rotation matrix as the pose parameters + # of each joint, so the input channel is 9. + assert per_joint_channel[0] == 9 + assert self.joint_count * per_joint_channel[-2] \ + == full_pose_channel[0] + + self.beta_channel = beta_channel + self.per_joint_channel = per_joint_channel + self.full_pose_channel = full_pose_channel + self._create_sub_modules() + + def _create_sub_modules(self): + """Create sub discriminators.""" + + # create theta discriminator for each joint + self.pose_discriminator = PoseDiscriminator(self.per_joint_channel, + self.joint_count) + + # create full pose discriminator for total joints + fc_layers = self.full_pose_channel + use_dropout = tuple([False] * (len(fc_layers) - 1)) + drop_prob = tuple([0.5] * (len(fc_layers) - 1)) + use_activation = tuple([True] * (len(fc_layers) - 2) + [False]) + + self.full_pose_discriminator = FullPoseDiscriminator( + fc_layers, use_dropout, drop_prob, use_activation) + + # create shape discriminator for betas + fc_layers = self.beta_channel + use_dropout = tuple([False] * (len(fc_layers) - 1)) + drop_prob = tuple([0.5] * (len(fc_layers) - 1)) + use_activation = tuple([True] * (len(fc_layers) - 2) + [False]) + self.shape_discriminator = ShapeDiscriminator(fc_layers, use_dropout, + drop_prob, + use_activation) + + def forward(self, thetas): + """Forward function.""" + _, poses, shapes = thetas + + batch_size = poses.shape[0] + shape_disc_value = self.shape_discriminator(shapes) + + # The first rotation matrix is global rotation + # and is NOT used in discriminator. 
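+        # Shape sketch: the 24 per-joint rotations (axis-angle [B, 24 * 3]
+        # or rotation matrices) become [B, 24, 9]; dropping the global
+        # rotation leaves [B, 23, 9], matching per_joint_channel[0] == 9 of
+        # the per-joint pose discriminator.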
+ if poses.dim() == 2: + rotate_matrixs = \ + batch_rodrigues(poses.contiguous().view(-1, 3) + ).view(batch_size, 24, 9)[:, 1:, :] + else: + rotate_matrixs = poses.contiguous().view(batch_size, 24, + 9)[:, 1:, :].contiguous() + pose_disc_value, pose_inter_disc_value \ + = self.pose_discriminator(rotate_matrixs) + full_pose_disc_value = self.full_pose_discriminator( + pose_inter_disc_value.contiguous().view(batch_size, -1)) + return torch.cat( + (pose_disc_value, full_pose_disc_value, shape_disc_value), 1) + + def init_weights(self): + """Initialize model weights.""" + self.full_pose_discriminator.init_weights() + self.pose_discriminator.init_weights() + self.shape_discriminator.init_weights() diff --git a/mmpose/models/necks/__init__.py b/mmpose/models/necks/__init__.py new file mode 100644 index 0000000..0d3a5cc --- /dev/null +++ b/mmpose/models/necks/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .gap_neck import GlobalAveragePooling +from .posewarper_neck import PoseWarperNeck + +__all__ = ['GlobalAveragePooling', 'PoseWarperNeck'] diff --git a/mmpose/models/necks/gap_neck.py b/mmpose/models/necks/gap_neck.py new file mode 100644 index 0000000..5e6ad68 --- /dev/null +++ b/mmpose/models/necks/gap_neck.py @@ -0,0 +1,37 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from ..builder import NECKS + + +@NECKS.register_module() +class GlobalAveragePooling(nn.Module): + """Global Average Pooling neck. + + Note that we use `view` to remove extra channel after pooling. We do not + use `squeeze` as it will also remove the batch dimension when the tensor + has a batch dimension of size 1, which can lead to unexpected errors. + """ + + def __init__(self): + super().__init__() + self.gap = nn.AdaptiveAvgPool2d((1, 1)) + + def init_weights(self): + pass + + def forward(self, inputs): + if isinstance(inputs, tuple): + outs = tuple([self.gap(x) for x in inputs]) + outs = tuple( + [out.view(x.size(0), -1) for out, x in zip(outs, inputs)]) + elif isinstance(inputs, list): + outs = [self.gap(x) for x in inputs] + outs = [out.view(x.size(0), -1) for out, x in zip(outs, inputs)] + elif isinstance(inputs, torch.Tensor): + outs = self.gap(inputs) + outs = outs.view(inputs.size(0), -1) + else: + raise TypeError('neck inputs should be tuple or torch.tensor') + return outs diff --git a/mmpose/models/necks/posewarper_neck.py b/mmpose/models/necks/posewarper_neck.py new file mode 100644 index 0000000..dd4ddfb --- /dev/null +++ b/mmpose/models/necks/posewarper_neck.py @@ -0,0 +1,329 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import torch +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init, + normal_init) +from mmcv.utils import digit_version +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.models.utils.ops import resize +from ..backbones.resnet import BasicBlock, Bottleneck +from ..builder import NECKS + +try: + from mmcv.ops import DeformConv2d + has_mmcv_full = True +except (ImportError, ModuleNotFoundError): + has_mmcv_full = False + + +@NECKS.register_module() +class PoseWarperNeck(nn.Module): + """PoseWarper neck. + + `"Learning temporal pose estimation from sparsely-labeled videos" + `_. 
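+
+    In brief, the neck computes the feature difference between the current
+    frame and each supporting frame, predicts deformable-convolution offsets
+    from that difference at several dilation rates, warps the supporting
+    features accordingly, and fuses the warped heatmaps using the given
+    frame weights (see `forward`).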
+ + Args: + in_channels (int): Number of input channels from backbone + out_channels (int): Number of output channels + inner_channels (int): Number of intermediate channels of the res block + deform_groups (int): Number of groups in the deformable conv + dilations (list|tuple): different dilations of the offset conv layers + trans_conv_kernel (int): the kernel of the trans conv layer, which is + used to get heatmap from the output of backbone. Default: 1 + res_blocks_cfg (dict|None): config of residual blocks. If None, + use the default values. If not None, it should contain the + following keys: + + - block (str): the type of residual block, Default: 'BASIC'. + - num_blocks (int): the number of blocks, Default: 20. + + offsets_kernel (int): the kernel of offset conv layer. + deform_conv_kernel (int): the kernel of defomrable conv layer. + in_index (int|Sequence[int]): Input feature index. Default: 0 + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + Default: None. + + - 'resize_concat': Multiple feature maps will be resize to \ + the same size as first one and than concat together. \ + Usually used in FCN head of HRNet. + - 'multiple_select': Multiple feature maps will be bundle into \ + a list and passed into decode head. + - None: Only one select feature map is allowed. + + freeze_trans_layer (bool): Whether to freeze the transition layer + (stop grad and set eval mode). Default: True. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + im2col_step (int): the argument `im2col_step` in deformable conv, + Default: 80. + """ + blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} + minimum_mmcv_version = '1.3.17' + + def __init__(self, + in_channels, + out_channels, + inner_channels, + deform_groups=17, + dilations=(3, 6, 12, 18, 24), + trans_conv_kernel=1, + res_blocks_cfg=None, + offsets_kernel=3, + deform_conv_kernel=3, + in_index=0, + input_transform=None, + freeze_trans_layer=True, + norm_eval=False, + im2col_step=80): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.inner_channels = inner_channels + self.deform_groups = deform_groups + self.dilations = dilations + self.trans_conv_kernel = trans_conv_kernel + self.res_blocks_cfg = res_blocks_cfg + self.offsets_kernel = offsets_kernel + self.deform_conv_kernel = deform_conv_kernel + self.in_index = in_index + self.input_transform = input_transform + self.freeze_trans_layer = freeze_trans_layer + self.norm_eval = norm_eval + self.im2col_step = im2col_step + + identity_trans_layer = False + + assert trans_conv_kernel in [0, 1, 3] + kernel_size = trans_conv_kernel + if kernel_size == 3: + padding = 1 + elif kernel_size == 1: + padding = 0 + else: + # 0 for Identity mapping. 
+ identity_trans_layer = True + + if identity_trans_layer: + self.trans_layer = nn.Identity() + else: + self.trans_layer = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding) + + # build chain of residual blocks + if res_blocks_cfg is not None and not isinstance(res_blocks_cfg, dict): + raise TypeError('res_blocks_cfg should be dict or None.') + + if res_blocks_cfg is None: + block_type = 'BASIC' + num_blocks = 20 + else: + block_type = res_blocks_cfg.get('block', 'BASIC') + num_blocks = res_blocks_cfg.get('num_blocks', 20) + + block = self.blocks_dict[block_type] + + res_layers = [] + downsample = nn.Sequential( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=out_channels, + out_channels=inner_channels, + kernel_size=1, + stride=1, + bias=False), + build_norm_layer(dict(type='BN'), inner_channels)[1]) + res_layers.append( + block( + in_channels=out_channels, + out_channels=inner_channels, + downsample=downsample)) + + for _ in range(1, num_blocks): + res_layers.append(block(inner_channels, inner_channels)) + self.offset_feats = nn.Sequential(*res_layers) + + # build offset layers + self.num_offset_layers = len(dilations) + assert self.num_offset_layers > 0, 'Number of offset layers ' \ + 'should be larger than 0.' + + target_offset_channels = 2 * offsets_kernel**2 * deform_groups + + offset_layers = [ + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=inner_channels, + out_channels=target_offset_channels, + kernel_size=offsets_kernel, + stride=1, + dilation=dilations[i], + padding=dilations[i], + bias=False, + ) for i in range(self.num_offset_layers) + ] + self.offset_layers = nn.ModuleList(offset_layers) + + # build deformable conv layers + assert digit_version(mmcv.__version__) >= \ + digit_version(self.minimum_mmcv_version), \ + f'Current MMCV version: {mmcv.__version__}, ' \ + f'but MMCV >= {self.minimum_mmcv_version} is required, see ' \ + f'https://github.com/open-mmlab/mmcv/issues/1440, ' \ + f'Please install the latest MMCV.' 
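+
+        # Each offset layer above outputs 2 * offsets_kernel**2 * deform_groups
+        # channels (2 * 3 * 3 * 17 = 306 with the defaults), i.e. one (dy, dx)
+        # pair per kernel location and per deformable group, which is exactly
+        # the offset field consumed by the matching DeformConv2d below.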
+ + if has_mmcv_full: + deform_conv_layers = [ + DeformConv2d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=deform_conv_kernel, + stride=1, + padding=int(deform_conv_kernel / 2) * dilations[i], + dilation=dilations[i], + deform_groups=deform_groups, + im2col_step=self.im2col_step, + ) for i in range(self.num_offset_layers) + ] + else: + raise ImportError('Please install the full version of mmcv ' + 'to use `DeformConv2d`.') + + self.deform_conv_layers = nn.ModuleList(deform_conv_layers) + + self.freeze_layers() + + def freeze_layers(self): + if self.freeze_trans_layer: + self.trans_layer.eval() + + for param in self.trans_layer.parameters(): + param.requires_grad = False + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + elif isinstance(m, DeformConv2d): + filler = torch.zeros([ + m.weight.size(0), + m.weight.size(1), + m.weight.size(2), + m.weight.size(3) + ], + dtype=torch.float32, + device=m.weight.device) + for k in range(m.weight.size(0)): + filler[k, k, + int(m.weight.size(2) / 2), + int(m.weight.size(3) / 2)] = 1.0 + m.weight = torch.nn.Parameter(filler) + m.weight.requires_grad = True + + # posewarper offset layer weight initialization + for m in self.offset_layers.modules(): + constant_init(m, 0) + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + + Args: + inputs (list[Tensor] | Tensor): multi-level img features. + + Returns: + Tensor: The transformed inputs + """ + if not isinstance(inputs, list): + return inputs + + if self.input_transform == 'resize_concat': + inputs = [inputs[i] for i in self.in_index] + upsampled_inputs = [ + resize( + input=x, + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) for x in inputs + ] + inputs = torch.cat(upsampled_inputs, dim=1) + elif self.input_transform == 'multiple_select': + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index] + + return inputs + + def forward(self, inputs, frame_weight): + assert isinstance(inputs, (list, tuple)), 'PoseWarperNeck inputs ' \ + 'should be list or tuple, even though the length is 1, ' \ + 'for unified processing.' 
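+
+        # Two input layouts are handled below:
+        # 1) len(inputs) > 1: one feature tensor per frame; every frame is
+        #    warped towards the first frame in the list (the reference) and
+        #    the warped heatmaps are accumulated with `frame_weight`.
+        # 2) len(inputs) == 1: all frames are concatenated along the batch
+        #    dimension; the first `batch_size` samples form the reference
+        #    frame used for warping.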
+ + output_heatmap = 0 + if len(inputs) > 1: + inputs = [self._transform_inputs(input) for input in inputs] + inputs = [self.trans_layer(input) for input in inputs] + + # calculate difference features + diff_features = [ + self.offset_feats(inputs[0] - input) for input in inputs + ] + + for i in range(len(inputs)): + if frame_weight[i] == 0: + continue + warped_heatmap = 0 + for j in range(self.num_offset_layers): + offset = (self.offset_layers[j](diff_features[i])) + warped_heatmap_tmp = self.deform_conv_layers[j](inputs[i], + offset) + warped_heatmap += warped_heatmap_tmp / \ + self.num_offset_layers + + output_heatmap += warped_heatmap * frame_weight[i] + + else: + inputs = inputs[0] + inputs = self._transform_inputs(inputs) + inputs = self.trans_layer(inputs) + + num_frames = len(frame_weight) + batch_size = inputs.size(0) // num_frames + ref_x = inputs[:batch_size] + ref_x_tiled = ref_x.repeat(num_frames, 1, 1, 1) + + offset_features = self.offset_feats(ref_x_tiled - inputs) + + warped_heatmap = 0 + for j in range(self.num_offset_layers): + offset = self.offset_layers[j](offset_features) + + warped_heatmap_tmp = self.deform_conv_layers[j](inputs, offset) + warped_heatmap += warped_heatmap_tmp / self.num_offset_layers + + for i in range(num_frames): + if frame_weight[i] == 0: + continue + output_heatmap += warped_heatmap[i * batch_size:(i + 1) * + batch_size] * frame_weight[i] + + return output_heatmap + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self.freeze_layers() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/registry.py b/mmpose/models/registry.py new file mode 100644 index 0000000..f354ae9 --- /dev/null +++ b/mmpose/models/registry.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from .builder import BACKBONES, HEADS, LOSSES, NECKS, POSENETS + +__all__ = ['BACKBONES', 'HEADS', 'LOSSES', 'NECKS', 'POSENETS'] + +warnings.simplefilter('once', DeprecationWarning) +warnings.warn( + 'Registries (BACKBONES, NECKS, HEADS, LOSSES, POSENETS) have ' + 'been moved to mmpose.models.builder. Importing from ' + 'mmpose.models.registry will be deprecated in the future.', + DeprecationWarning) diff --git a/mmpose/models/utils/__init__.py b/mmpose/models/utils/__init__.py new file mode 100644 index 0000000..6871c66 --- /dev/null +++ b/mmpose/models/utils/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .smpl import SMPL + +__all__ = ['SMPL'] diff --git a/mmpose/models/utils/geometry.py b/mmpose/models/utils/geometry.py new file mode 100644 index 0000000..0ceadae --- /dev/null +++ b/mmpose/models/utils/geometry.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch.nn import functional as F + + +def rot6d_to_rotmat(x): + """Convert 6D rotation representation to 3x3 rotation matrix. + + Based on Zhou et al., "On the Continuity of Rotation + Representations in Neural Networks", CVPR 2019 + Input: + (B,6) Batch of 6-D rotation representations + Output: + (B,3,3) Batch of corresponding rotation matrices + """ + x = x.view(-1, 3, 2) + a1 = x[:, :, 0] + a2 = x[:, :, 1] + b1 = F.normalize(a1) + b2 = F.normalize(a2 - torch.einsum('bi,bi->b', b1, a2).unsqueeze(-1) * b1) + b3 = torch.cross(b1, b2) + return torch.stack((b1, b2, b3), dim=-1) + + +def batch_rodrigues(theta): + """Convert axis-angle representation to rotation matrix. 
+ Args: + theta: size = [B, 3] + Returns: + Rotation matrix corresponding to the quaternion + -- size = [B, 3, 3] + """ + l2norm = torch.norm(theta + 1e-8, p=2, dim=1) + angle = torch.unsqueeze(l2norm, -1) + normalized = torch.div(theta, angle) + angle = angle * 0.5 + v_cos = torch.cos(angle) + v_sin = torch.sin(angle) + quat = torch.cat([v_cos, v_sin * normalized], dim=1) + return quat_to_rotmat(quat) + + +def quat_to_rotmat(quat): + """Convert quaternion coefficients to rotation matrix. + Args: + quat: size = [B, 4] 4 <===>(w, x, y, z) + Returns: + Rotation matrix corresponding to the quaternion + -- size = [B, 3, 3] + """ + norm_quat = quat + norm_quat = norm_quat / norm_quat.norm(p=2, dim=1, keepdim=True) + w, x, y, z = norm_quat[:, 0], norm_quat[:, 1],\ + norm_quat[:, 2], norm_quat[:, 3] + + B = quat.size(0) + + w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2) + wx, wy, wz = w * x, w * y, w * z + xy, xz, yz = x * y, x * z, y * z + + rotMat = torch.stack([ + w2 + x2 - y2 - z2, 2 * xy - 2 * wz, 2 * wy + 2 * xz, 2 * wz + 2 * xy, + w2 - x2 + y2 - z2, 2 * yz - 2 * wx, 2 * xz - 2 * wy, 2 * wx + 2 * yz, + w2 - x2 - y2 + z2 + ], + dim=1).view(B, 3, 3) + return rotMat diff --git a/mmpose/models/utils/ops.py b/mmpose/models/utils/ops.py new file mode 100644 index 0000000..858d0a9 --- /dev/null +++ b/mmpose/models/utils/ops.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch +import torch.nn.functional as F + + +def resize(input, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None, + warning=True): + if warning: + if size is not None and align_corners: + input_h, input_w = tuple(int(x) for x in input.shape[2:]) + output_h, output_w = tuple(int(x) for x in size) + if output_h > input_h or output_w > output_h: + if ((output_h > 1 and output_w > 1 and input_h > 1 + and input_w > 1) and (output_h - 1) % (input_h - 1) + and (output_w - 1) % (input_w - 1)): + warnings.warn( + f'When align_corners={align_corners}, ' + 'the output would more aligned if ' + f'input size {(input_h, input_w)} is `x+1` and ' + f'out size {(output_h, output_w)} is `nx+1`') + if isinstance(size, torch.Size): + size = tuple(int(x) for x in size) + return F.interpolate(input, size, scale_factor, mode, align_corners) diff --git a/mmpose/models/utils/smpl.py b/mmpose/models/utils/smpl.py new file mode 100644 index 0000000..fe723d4 --- /dev/null +++ b/mmpose/models/utils/smpl.py @@ -0,0 +1,184 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +import torch.nn as nn + +from ..builder import MESH_MODELS + +try: + from smplx import SMPL as SMPL_ + has_smpl = True +except (ImportError, ModuleNotFoundError): + has_smpl = False + + +@MESH_MODELS.register_module() +class SMPL(nn.Module): + """SMPL 3d human mesh model of paper ref: Matthew Loper. ``SMPL: A skinned + multi-person linear model''. This module is based on the smplx project + (https://github.com/vchoutas/smplx). + + Args: + smpl_path (str): The path to the folder where the model weights are + stored. + joints_regressor (str): The path to the file where the joints + regressor weight are stored. + """ + + def __init__(self, smpl_path, joints_regressor): + super().__init__() + + assert has_smpl, 'Please install smplx to use SMPL.' 
+ + self.smpl_neutral = SMPL_( + model_path=smpl_path, + create_global_orient=False, + create_body_pose=False, + create_transl=False, + gender='neutral') + + self.smpl_male = SMPL_( + model_path=smpl_path, + create_betas=False, + create_global_orient=False, + create_body_pose=False, + create_transl=False, + gender='male') + + self.smpl_female = SMPL_( + model_path=smpl_path, + create_betas=False, + create_global_orient=False, + create_body_pose=False, + create_transl=False, + gender='female') + + joints_regressor = torch.tensor( + np.load(joints_regressor), dtype=torch.float)[None, ...] + self.register_buffer('joints_regressor', joints_regressor) + + self.num_verts = self.smpl_neutral.get_num_verts() + self.num_joints = self.joints_regressor.shape[1] + + def smpl_forward(self, model, **kwargs): + """Apply a specific SMPL model with given model parameters. + + Note: + B: batch size + V: number of vertices + K: number of joints + + Returns: + outputs (dict): Dict with mesh vertices and joints. + - vertices: Tensor([B, V, 3]), mesh vertices + - joints: Tensor([B, K, 3]), 3d joints regressed + from mesh vertices. + """ + + betas = kwargs['betas'] + batch_size = betas.shape[0] + device = betas.device + output = {} + if batch_size == 0: + output['vertices'] = betas.new_zeros([0, self.num_verts, 3]) + output['joints'] = betas.new_zeros([0, self.num_joints, 3]) + else: + smpl_out = model(**kwargs) + output['vertices'] = smpl_out.vertices + output['joints'] = torch.matmul( + self.joints_regressor.to(device), output['vertices']) + return output + + def get_faces(self): + """Return mesh faces. + + Note: + F: number of faces + + Returns: + faces: np.ndarray([F, 3]), mesh faces + """ + return self.smpl_neutral.faces + + def forward(self, + betas, + body_pose, + global_orient, + transl=None, + gender=None): + """Forward function. + + Note: + B: batch size + J: number of controllable joints of model, for smpl model J=23 + K: number of joints + + Args: + betas: Tensor([B, 10]), human body shape parameters of SMPL model. + body_pose: Tensor([B, J*3] or [B, J, 3, 3]), human body pose + parameters of SMPL model. It should be axis-angle vector + ([B, J*3]) or rotation matrix ([B, J, 3, 3)]. + global_orient: Tensor([B, 3] or [B, 1, 3, 3]), global orientation + of human body. It should be axis-angle vector ([B, 3]) or + rotation matrix ([B, 1, 3, 3)]. + transl: Tensor([B, 3]), global translation of human body. + gender: Tensor([B]), gender parameters of human body. -1 for + neutral, 0 for male , 1 for female. + + Returns: + outputs (dict): Dict with mesh vertices and joints. + - vertices: Tensor([B, V, 3]), mesh vertices + - joints: Tensor([B, K, 3]), 3d joints regressed from + mesh vertices. 
+ """ + + batch_size = betas.shape[0] + pose2rot = True if body_pose.dim() == 2 else False + if batch_size > 0 and gender is not None: + output = { + 'vertices': betas.new_zeros([batch_size, self.num_verts, 3]), + 'joints': betas.new_zeros([batch_size, self.num_joints, 3]) + } + + mask = gender < 0 + _out = self.smpl_forward( + self.smpl_neutral, + betas=betas[mask], + body_pose=body_pose[mask], + global_orient=global_orient[mask], + transl=transl[mask] if transl is not None else None, + pose2rot=pose2rot) + output['vertices'][mask] = _out['vertices'] + output['joints'][mask] = _out['joints'] + + mask = gender == 0 + _out = self.smpl_forward( + self.smpl_male, + betas=betas[mask], + body_pose=body_pose[mask], + global_orient=global_orient[mask], + transl=transl[mask] if transl is not None else None, + pose2rot=pose2rot) + output['vertices'][mask] = _out['vertices'] + output['joints'][mask] = _out['joints'] + + mask = gender == 1 + _out = self.smpl_forward( + self.smpl_male, + betas=betas[mask], + body_pose=body_pose[mask], + global_orient=global_orient[mask], + transl=transl[mask] if transl is not None else None, + pose2rot=pose2rot) + output['vertices'][mask] = _out['vertices'] + output['joints'][mask] = _out['joints'] + else: + return self.smpl_forward( + self.smpl_neutral, + betas=betas, + body_pose=body_pose, + global_orient=global_orient, + transl=transl, + pose2rot=pose2rot) + + return output diff --git a/mmpose/utils/__init__.py b/mmpose/utils/__init__.py new file mode 100644 index 0000000..1293ca0 --- /dev/null +++ b/mmpose/utils/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .collect_env import collect_env +from .logger import get_root_logger +from .setup_env import setup_multi_processes +from .timer import StopWatch + +__all__ = [ + 'get_root_logger', 'collect_env', 'StopWatch', 'setup_multi_processes' +] diff --git a/mmpose/utils/collect_env.py b/mmpose/utils/collect_env.py new file mode 100644 index 0000000..f75c5ea --- /dev/null +++ b/mmpose/utils/collect_env.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.utils import collect_env as collect_basic_env +from mmcv.utils import get_git_hash + +import mmpose + + +def collect_env(): + env_info = collect_basic_env() + env_info['MMPose'] = (mmpose.__version__ + '+' + get_git_hash(digits=7)) + return env_info + + +if __name__ == '__main__': + for name, val in collect_env().items(): + print(f'{name}: {val}') diff --git a/mmpose/utils/hooks.py b/mmpose/utils/hooks.py new file mode 100644 index 0000000..b68940f --- /dev/null +++ b/mmpose/utils/hooks.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import functools + + +class OutputHook: + + def __init__(self, module, outputs=None, as_tensor=False): + self.outputs = outputs + self.as_tensor = as_tensor + self.layer_outputs = {} + self.register(module) + + def register(self, module): + + def hook_wrapper(name): + + def hook(model, input, output): + if self.as_tensor: + self.layer_outputs[name] = output + else: + if isinstance(output, list): + self.layer_outputs[name] = [ + out.detach().cpu().numpy() for out in output + ] + else: + self.layer_outputs[name] = output.detach().cpu().numpy( + ) + + return hook + + self.handles = [] + if isinstance(self.outputs, (list, tuple)): + for name in self.outputs: + try: + layer = rgetattr(module, name) + h = layer.register_forward_hook(hook_wrapper(name)) + except ModuleNotFoundError as module_not_found: + raise ModuleNotFoundError( + f'Module {name} not found') from module_not_found + self.handles.append(h) + + def remove(self): + for h in self.handles: + h.remove() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.remove() + + +# using wonder's beautiful simplification: +# https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects +def rgetattr(obj, attr, *args): + + def _getattr(obj, attr): + return getattr(obj, attr, *args) + + return functools.reduce(_getattr, [obj] + attr.split('.')) diff --git a/mmpose/utils/logger.py b/mmpose/utils/logger.py new file mode 100644 index 0000000..294837f --- /dev/null +++ b/mmpose/utils/logger.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging + +from mmcv.utils import get_logger + + +def get_root_logger(log_file=None, log_level=logging.INFO): + """Use `get_logger` method in mmcv to get the root logger. + + The logger will be initialized if it has not been initialized. By default a + StreamHandler will be added. If `log_file` is specified, a FileHandler will + also be added. The name of the root logger is the top-level package name, + e.g., "mmpose". + + Args: + log_file (str | None): The log filename. If specified, a FileHandler + will be added to the root logger. + log_level (int): The root logger level. Note that only the process of + rank 0 is affected, while other processes will set the level to + "Error" and be silent most of the time. + + Returns: + logging.Logger: The root logger. + """ + return get_logger(__name__.split('.')[0], log_file, log_level) diff --git a/mmpose/utils/setup_env.py b/mmpose/utils/setup_env.py new file mode 100644 index 0000000..21def2f --- /dev/null +++ b/mmpose/utils/setup_env.py @@ -0,0 +1,47 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import platform +import warnings + +import cv2 +import torch.multiprocessing as mp + + +def setup_multi_processes(cfg): + """Setup multi-processing environment variables.""" + # set multi-process start method as `fork` to speed up the training + if platform.system() != 'Windows': + mp_start_method = cfg.get('mp_start_method', 'fork') + current_method = mp.get_start_method(allow_none=True) + if current_method is not None and current_method != mp_start_method: + warnings.warn( + f'Multi-processing start method `{mp_start_method}` is ' + f'different from the previous setting `{current_method}`.' + f'It will be force set to `{mp_start_method}`. 
You can change ' + f'this behavior by changing `mp_start_method` in your config.') + mp.set_start_method(mp_start_method, force=True) + + # disable opencv multithreading to avoid system being overloaded + opencv_num_threads = cfg.get('opencv_num_threads', 0) + cv2.setNumThreads(opencv_num_threads) + + # setup OMP threads + # This code is referred from https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py # noqa + if 'OMP_NUM_THREADS' not in os.environ and cfg.data.workers_per_gpu > 1: + omp_num_threads = 1 + warnings.warn( + f'Setting OMP_NUM_THREADS environment variable for each process ' + f'to be {omp_num_threads} in default, to avoid your system being ' + f'overloaded, please further tune the variable for optimal ' + f'performance in your application as needed.') + os.environ['OMP_NUM_THREADS'] = str(omp_num_threads) + + # setup MKL threads + if 'MKL_NUM_THREADS' not in os.environ and cfg.data.workers_per_gpu > 1: + mkl_num_threads = 1 + warnings.warn( + f'Setting MKL_NUM_THREADS environment variable for each process ' + f'to be {mkl_num_threads} in default, to avoid your system being ' + f'overloaded, please further tune the variable for optimal ' + f'performance in your application as needed.') + os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads) diff --git a/mmpose/utils/timer.py b/mmpose/utils/timer.py new file mode 100644 index 0000000..5a3185c --- /dev/null +++ b/mmpose/utils/timer.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import defaultdict +from contextlib import contextmanager +from functools import partial + +import numpy as np +from mmcv import Timer + + +class RunningAverage(): + r"""A helper class to calculate running average in a sliding window. + + Args: + window (int): The size of the sliding window. + """ + + def __init__(self, window: int = 1): + self.window = window + self._data = [] + + def update(self, value): + """Update a new data sample.""" + self._data.append(value) + self._data = self._data[-self.window:] + + def average(self): + """Get the average value of current window.""" + return np.mean(self._data) + + +class StopWatch: + r"""A helper class to measure FPS and detailed time consuming of each phase + in a video processing loop or similar scenarios. + + Args: + window (int): The sliding window size to calculate the running average + of the time consuming. + + Example: + >>> from mmpose.utils import StopWatch + >>> import time + >>> stop_watch = StopWatch(window=10) + >>> with stop_watch.timeit('total'): + >>> time.sleep(0.1) + >>> # 'timeit' support nested use + >>> with stop_watch.timeit('phase1'): + >>> time.sleep(0.1) + >>> with stop_watch.timeit('phase2'): + >>> time.sleep(0.2) + >>> time.sleep(0.2) + >>> report = stop_watch.report() + """ + + def __init__(self, window=1): + self.window = window + self._record = defaultdict(partial(RunningAverage, window=self.window)) + self._timer_stack = [] + + @contextmanager + def timeit(self, timer_name='_FPS_'): + """Timing a code snippet with an assigned name. + + Args: + timer_name (str): The unique name of the interested code snippet to + handle multiple timers and generate reports. Note that '_FPS_' + is a special key that the measurement will be in `fps` instead + of `millisecond`. Also see `report` and `report_strings`. + Default: '_FPS_'. + Note: + This function should always be used in a `with` statement, as shown + in the example. 
+ """ + self._timer_stack.append((timer_name, Timer())) + try: + yield + finally: + timer_name, timer = self._timer_stack.pop() + self._record[timer_name].update(timer.since_start()) + + def report(self, key=None): + """Report timing information. + + Returns: + dict: The key is the timer name and the value is the \ + corresponding average time consuming. + """ + result = { + name: r.average() * 1000. + for name, r in self._record.items() + } + + if '_FPS_' in result: + result['_FPS_'] = 1000. / result.pop('_FPS_') + + if key is None: + return result + return result[key] + + def report_strings(self): + """Report timing information in texture strings. + + Returns: + list(str): Each element is the information string of a timed \ + event, in format of '{timer_name}: {time_in_ms}'. \ + Specially, if timer_name is '_FPS_', the result will \ + be converted to fps. + """ + result = self.report() + strings = [] + if '_FPS_' in result: + strings.append(f'FPS: {result["_FPS_"]:>5.1f}') + strings += [f'{name}: {val:>3.0f}' for name, val in result.items()] + return strings + + def reset(self): + self._record = defaultdict(list) + self._active_timer_stack = [] diff --git a/mmpose/version.py b/mmpose/version.py new file mode 100644 index 0000000..1a10826 --- /dev/null +++ b/mmpose/version.py @@ -0,0 +1,19 @@ +# Copyright (c) Open-MMLab. All rights reserved. + +__version__ = '0.24.0' +short_version = __version__ + + +def parse_version_info(version_str): + version_info = [] + for x in version_str.split('.'): + if x.isdigit(): + version_info.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + version_info.append(int(patch_version[0])) + version_info.append(f'rc{patch_version[1]}') + return tuple(version_info) + + +version_info = parse_version_info(__version__) diff --git a/nets/nn.py b/nets/nn.py new file mode 100644 index 0000000..69c9c70 --- /dev/null +++ b/nets/nn.py @@ -0,0 +1,278 @@ +import copy +import math + +import numpy +import torch + + +def normalize(v): + mag = torch.sqrt(torch.sum(v.pow(2), dim=1, keepdim=True)) + eps = torch.FloatTensor([1E-8]).to(mag.device) + mag = torch.max(mag, eps) + return v / mag + + +def cross_product(u, v): + shape = u.shape + + i = u[:, 1] * v[:, 2] - u[:, 2] * v[:, 1] + j = u[:, 2] * v[:, 0] - u[:, 0] * v[:, 2] + k = u[:, 0] * v[:, 1] - u[:, 1] * v[:, 0] + + i = i.view(shape[0], 1) + j = j.view(shape[0], 1) + k = k.view(shape[0], 1) + + return torch.cat(tensors=(i, j, k), dim=1) + + +class Conv(torch.nn.Module): + def __init__(self, in_ch, out_ch, k=1, s=1, p=0): + super().__init__() + self.conv = torch.nn.Conv2d(in_ch, out_ch, k, s, p, bias=False) + self.norm = torch.nn.BatchNorm2d(out_ch) + + def forward(self, x): + return self.norm(self.conv(x)) + + +class Residual(torch.nn.Module): + def __init__(self, in_ch, out_ch, k, s, p): + super().__init__() + + assert k == 3 + assert p == 1 + self.in_channels = in_ch + + self.relu = torch.nn.ReLU() + self.conv = torch.nn.Identity() + + self.conv1 = Conv(in_ch, out_ch, k=k, s=s, p=p) + self.conv2 = Conv(in_ch, out_ch, k=1, s=s, p=p - k // 2) + self.identity = torch.nn.BatchNorm2d(in_ch) if in_ch == out_ch and s == 1 else None + + @staticmethod + def __pad(k): + if k is None: + return 0 + else: + return torch.nn.functional.pad(k, pad=[1, 1, 1, 1]) + + def __fuse_norm(self, m): + if m is None: + return 0, 0 + if isinstance(m, Conv): + kernel = m.conv.weight + running_mean = m.norm.running_mean + running_var = m.norm.running_var + gamma = m.norm.weight + beta = m.norm.bias + eps = m.norm.eps + else: + 
assert isinstance(m, torch.nn.BatchNorm2d) + if not hasattr(self, 'norm'): + in_channels = self.conv1.conv.in_channels + kernel_value = numpy.zeros((in_channels, in_channels, 3, 3), dtype=numpy.float32) + for i in range(in_channels): + kernel_value[i, i % in_channels, 1, 1] = 1 + self.norm = torch.from_numpy(kernel_value).to(m.weight.device) + kernel = self.norm + running_mean = m.running_mean + running_var = m.running_var + gamma = m.weight + beta = m.bias + eps = m.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def forward(self, x): + if self.identity is None: + return self.relu(self.conv1(x) + self.conv2(x)) + else: + return self.relu(self.conv1(x) + self.conv2(x) + self.identity(x)) + + def fuse_forward(self, x): + return self.relu(self.conv(x)) + + def fuse(self): + k1, b1 = self.__fuse_norm(self.conv1) + k2, b2 = self.__fuse_norm(self.conv2) + k3, b3 = self.__fuse_norm(self.identity) + + self.conv = torch.nn.Conv2d(in_channels=self.conv1.conv.in_channels, + out_channels=self.conv1.conv.out_channels, + kernel_size=self.conv1.conv.kernel_size, + stride=self.conv1.conv.stride, + padding=self.conv1.conv.padding, + dilation=self.conv1.conv.dilation, + groups=self.conv1.conv.groups, bias=True) + + self.conv.weight.data = k1 + self.__pad(k2) + k3 + self.conv.bias.data = b1 + b2 + b3 + + if hasattr(self, 'conv1'): + self.__delattr__('conv1') + if hasattr(self, 'conv2'): + self.__delattr__('conv2') + if hasattr(self, 'identity'): + self.__delattr__('identity') + if hasattr(self, 'norm'): + self.__delattr__('norm') + self.forward = self.fuse_forward + + +class SixDRepVGG(torch.nn.Module): + def __init__(self, width, depth, num_classes=6): + super().__init__() + + self.p1 = [] + self.p2 = [] + self.p3 = [] + self.p4 = [] + self.p5 = [] + + # p1 + self.p1.append(Residual(width[0], width[1], k=3, s=2, p=1)) + # p2 + for i in range(depth[0]): + if i == 0: + self.p2.append(Residual(width[1], width[2], k=3, s=2, p=1)) + else: + self.p2.append(Residual(width[2], width[2], k=3, s=1, p=1)) + # p3 + for i in range(depth[1]): + if i == 0: + self.p3.append(Residual(width[2], width[3], k=3, s=2, p=1)) + else: + self.p3.append(Residual(width[3], width[3], k=3, s=1, p=1)) + # p4 + for i in range(depth[2]): + if i == 0: + self.p4.append(Residual(width[3], width[4], k=3, s=2, p=1)) + else: + self.p4.append(Residual(width[4], width[4], k=3, s=1, p=1)) + # p5 + for i in range(depth[3]): + if i == 0: + self.p5.append(Residual(width[4], width[5], k=3, s=2, p=1)) + else: + self.p5.append(Residual(width[5], width[5], k=3, s=1, p=1)) + + self.p1 = torch.nn.Sequential(*self.p1) + self.p2 = torch.nn.Sequential(*self.p2) + self.p3 = torch.nn.Sequential(*self.p3) + self.p4 = torch.nn.Sequential(*self.p4) + self.p5 = torch.nn.Sequential(*self.p5) + self.fc = torch.nn.Sequential(torch.nn.AdaptiveAvgPool2d(1), + torch.nn.Flatten(), + torch.nn.Linear(width[5], num_classes)) + + def forward(self, x): + p1 = self.p1(x) + p2 = self.p2(p1) + p3 = self.p3(p2) + p4 = self.p4(p3) + p5 = self.p5(p4) + fc = self.fc(p5) + + x_raw = fc[:, 0:3] + y_raw = fc[:, 3:6] + + x = normalize(x_raw) + z = cross_product(x, y_raw) + z = normalize(z) + y = cross_product(z, x) + + x = x.view(-1, 3, 1) + y = y.view(-1, 3, 1) + z = z.view(-1, 3, 1) + return torch.cat(tensors=(x, y, z), dim=2) + + def fuse(self): + for m in self.modules(): + if type(m) is Residual: + m.fuse() + return self + + +def rep_net_a0(): + return SixDRepVGG(width=(3, 48, 48, 96, 192, 
1280), depth=(2, 4, 14, 1)) + + +def rep_net_a1(): + return SixDRepVGG(width=(3, 64, 64, 128, 256, 1280), depth=[2, 4, 14, 1]) + + +def rep_net_a2(): + return SixDRepVGG(width=[3, 64, 96, 192, 384, 1408], depth=[2, 4, 14, 1]) + + +def rep_net_b0(): + return SixDRepVGG(width=[3, 64, 64, 128, 256, 1280], depth=[4, 6, 16, 1]) + + +def rep_net_b1(): + return SixDRepVGG(width=[3, 64, 128, 256, 512, 2048], depth=[4, 6, 16, 1]) + + +def rep_net_b2(): + return SixDRepVGG(width=[3, 64, 160, 320, 640, 2560], depth=[4, 6, 16, 1]) + + +class EMA: + """ + Updated Exponential Moving Average (EMA) from https://github.com/rwightman/pytorch-image-models + Keeps a moving average of everything in the model state_dict (parameters and buffers) + For EMA details see https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage + """ + + def __init__(self, model, decay=0.9999, tau=2000, updates=0): + # Create EMA + self.ema = copy.deepcopy(model).eval() # FP32 EMA + self.updates = updates # number of EMA updates + # decay exponential ramp (to help early epochs) + self.decay = lambda x: decay * (1 - math.exp(-x / tau)) + for p in self.ema.parameters(): + p.requires_grad_(False) + + def update(self, model): + if hasattr(model, 'module'): + model = model.module + # Update EMA parameters + with torch.no_grad(): + self.updates += 1 + d = self.decay(self.updates) + + msd = model.state_dict() # model state_dict + for k, v in self.ema.state_dict().items(): + if v.dtype.is_floating_point: + v *= d + v += (1 - d) * msd[k].detach() + + +class CosineLR: + def __init__(self, args, optimizer): + self.min_lr = 1E-6 + self.epochs = args.epochs + self.learning_rates = [x['lr'] for x in optimizer.param_groups] + + def step(self, epoch, optimizer): + param_groups = optimizer.param_groups + for param_group, lr in zip(param_groups, self.learning_rates): + alpha = math.cos(math.pi * epoch / self.epochs) + lr = 0.5 * (lr - self.min_lr) * (1 + alpha) + param_group['lr'] = self.min_lr + lr + + +class ComputeLoss(torch.nn.Module): + def __init__(self): + super().__init__() + self.eps = 1E-7 + + def forward(self, outputs, targets): + m = torch.bmm(targets, outputs.transpose(1, 2)) + cos = (m[:, 0, 0] + m[:, 1, 1] + m[:, 2, 2] - 1) / 2 + theta = torch.acos(torch.clamp(cos, -1 + self.eps, 1 - self.eps)) + + return torch.mean(theta) diff --git a/rgbd_3d.py b/rgbd_3d.py new file mode 100755 index 0000000..0448846 --- /dev/null +++ b/rgbd_3d.py @@ -0,0 +1,765 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# mmdet and mmpose import +from mmpose.apis import (get_track_id, inference_top_down_pose_model, + init_pose_model, process_mmdet_results, + vis_pose_tracking_result) +from mmpose.datasets import DatasetInfo + +try: + from mmdet.apis import inference_detector, init_detector + has_mmdet = True +except (ImportError, ModuleNotFoundError): + has_mmdet = False + +# ros related import +import rospy +from sensor_msgs.msg import Image, PointCloud2 +from geometry_msgs.msg import Pose, Point +from cv_bridge import CvBridge + +# other import +import cv2 +import os +import matplotlib.pyplot as plt +from argparse import ArgumentParser +from datetime import datetime +import time +import json +import warnings +import numpy as np + +# utils import +from utils import * + +# motion bert import +import imageio +import torch +import torch.nn as nn +from torch.utils.data import DataLoader +from lib.utils.tools import * +from lib.utils.learning import * +from lib.utils.utils_data import flip_data +from lib.data.dataset_wild import 
WildDetDataset +from lib.utils.vismo import render_and_save + +import copy + +# remove numpy scientific notation +np.set_printoptions(suppress=True) + + +prSuccess("Everything imported !") + + +def crop_scale(motion, scale_range=[1, 1]): + ''' + For input of MotionBERT + Motion: [(M), T, 17, 3]. + Normalize to [-1, 1] + ''' + result = copy.deepcopy(motion) + valid_coords = motion[motion[..., 2]!=0][:,:2] + if len(valid_coords) < 4: + return np.zeros(motion.shape) + xmin = min(valid_coords[:,0]) + xmax = max(valid_coords[:,0]) + ymin = min(valid_coords[:,1]) + ymax = max(valid_coords[:,1]) + ratio = np.random.uniform(low=scale_range[0], high=scale_range[1], size=1)[0] + scale = max(xmax-xmin, ymax-ymin) * ratio + if scale==0: + return np.zeros(motion.shape) + xs = (xmin+xmax-scale) / 2 + ys = (ymin+ymax-scale) / 2 + result[...,:2] = (motion[..., :2]- [xs,ys]) / scale + result[...,:2] = (result[..., :2] - 0.5) * 2 + result = np.clip(result, -1, 1) + return result + + +def coco2h36m(x): + ''' + Input: x ((M )x T x V x C) + + COCO: {0-nose 1-Leye 2-Reye 3-Lear 4Rear 5-Lsho 6-Rsho 7-Lelb 8-Relb 9-Lwri 10-Rwri 11-Lhip 12-Rhip 13-Lkne 14-Rkne 15-Lank 16-Rank} + + H36M: + 0: 'root', + 1: 'rhip', + 2: 'rkne', + 3: 'rank', + 4: 'lhip', + 5: 'lkne', + 6: 'lank', + 7: 'belly', + 8: 'neck', + 9: 'nose', + 10: 'head', + 11: 'lsho', + 12: 'lelb', + 13: 'lwri', + 14: 'rsho', + 15: 'relb', + 16: 'rwri' + ''' + y = np.zeros(x.shape) + y[:,0,:] = (x[:,11,:] + x[:,12,:]) * 0.5 + y[:,1,:] = x[:,12,:] + y[:,2,:] = x[:,14,:] + y[:,3,:] = x[:,16,:] + y[:,4,:] = x[:,11,:] + y[:,5,:] = x[:,13,:] + y[:,6,:] = x[:,15,:] + y[:,8,:] = (x[:,5,:] + x[:,6,:]) * 0.5 + y[:,7,:] = (y[:,0,:] + y[:,8,:]) * 0.5 + y[:,9,:] = x[:,0,:] + y[:,10,:] = (x[:,1,:] + x[:,2,:]) * 0.5 + y[:,11,:] = x[:,5,:] + y[:,12,:] = x[:,7,:] + y[:,13,:] = x[:,9,:] + y[:,14,:] = x[:,6,:] + y[:,15,:] = x[:,8,:] + y[:,16,:] = x[:,10,:] + return y + + +class InferenceNodeRGBD(object): + def __init__(self, args): + + # init args + self.args = args + + # init detector and pose + prInfo('Initialiazing detector {}'.format(args.mb_checkpoint)) + self.det_model = init_detector( + args.det_config, args.det_checkpoint, device=args.device.lower()) + + prInfo('Initialiazing 2D Pose model {}'.format(args.mb_checkpoint)) + self.pose_model = init_pose_model( + args.pose_config, args.pose_checkpoint, device=args.device.lower()) + + # init 3d MotionBERT model + prInfo('Initialiazing 3D Pose Lifter {}'.format(args.mb_checkpoint)) + mb_3d_args = get_config(args.mb_3d_config) + self.motionbert_3d_model = load_backbone(mb_3d_args) + if torch.cuda.is_available(): + self.motionbert_3d_model = nn.DataParallel(self.motionbert_3d_model) + self.motionbert_3d_model = self.motionbert_3d_model.cuda() + else: + prWarning("Expect cuda to be available but is_available returned false") + exit(0) + + prInfo('Loading checkpoint {}'.format(args.mb_checkpoint)) + mb_checkpoint = torch.load(args.mb_checkpoint, map_location=lambda storage, loc: storage) + self.motionbert_3d_model.load_state_dict(mb_checkpoint['model_pos'], strict=True) + self.motionbert_3d_model.eval() + prInfo('Loaded motionbert_3d_model') + # no need for the whole WildDetDataset stuff, just manually make the input trajectories for the tracks + + # dataset params for detector and pose + self.dataset = self.pose_model.cfg.data['test']['type'] + self.dataset_info = self.pose_model.cfg.data['test'].get('self.dataset_info', None) + if self.dataset_info is None: + warnings.warn( + 'Please set `self.dataset_info` in the config.' 
+ 'Check https://github.com/open-mmlab/mmpose/pull/663 for details.', + DeprecationWarning) + else: + self.dataset_info = DatasetInfo(self.dataset_info) + + self.return_heatmap = False + + self.next_id = 0 + self.pose_results = [] + self.count_frames = 0 + self.tracks_in_current_image = {} + + ## Init for node and save path + + self.rgb = None # Image frame + self.depth = None # Image frame + + self.pcl_array_rgb = None + self.pcl_array_xyz = None + + self.depth_array_max_threshold = 20000 #3000 # does not apply when saving depth mono16 image + + # viewing options + self.depth_cmap = get_mpl_colormap(args.depth_cmap) + self.confidence_cmap = get_mpl_colormap("viridis") + self.vis_img = None # output image RGB + detections + self.view_all_classes_dets = True + self.display_all_detection = args.display_all_detection + self.light_display = args.light_display + + self.pcl_current_seq = -1 + self.rgb_current_seq = -1 + self.last_inferred_seq = -1 + self.depth_current_seq = -1 + self.current_image_count = 0 + + self.br = CvBridge() + + prInfo("Setting node rate to {} fps".format(args.fps)) + self.loop_rate = rospy.Rate(args.fps) + + # make the output path + now = datetime.now() + timestamp = now.strftime("%Y_%m_%d_%H_%M_%S") + self.save_dir = os.path.join("output", "record_{:s}".format(timestamp)) + self.metadata = os.path.join(self.save_dir, "metadata.json") + self.save_dir_rgb = os.path.join(self.save_dir, "rgb") + self.save_dir_depth = os.path.join(self.save_dir, "depth") + self.save_dir_depth_color = os.path.join(self.save_dir, "depth_color") + self.save_dir_result = os.path.join(self.save_dir, "output") + self.save_dir_pcl_bin = os.path.join(self.save_dir, "pcl") + + if args.save or args.light_save: + prInfo("Saving to {}/[rgb][depth][depth_color][output][pcl]".format(self.save_dir)) + if not os.path.exists(self.save_dir): + prInfo("Creating directories to {}/[rgb][depth][depth_color][output][pcl]".format(self.save_dir)) + os.makedirs(self.save_dir) + os.makedirs(self.save_dir_rgb) + os.makedirs(self.save_dir_pcl_bin) + + if args.save: + os.makedirs(self.save_dir_depth) + os.makedirs(self.save_dir_depth_color) + os.makedirs(self.save_dir_result) + + args_dic = vars(args) + with open(self.metadata, 'w') as fp: + json.dump(args_dic, fp) + + prSuccess("Created directories to {}/[rgb][depth][depth_color][output][pcl]".format(self.save_dir)) + time.sleep(1) + + # Publishers + self.goal_pub = rospy.Publisher('points/handover_goal', Point, queue_size=10) + + # Subscribers + prInfo("Subscribing to {} for RGB".format(args.rgb_topic)) + rospy.Subscriber(args.rgb_topic, Image,self.callback_rgb) + prInfo("Subscribing to {} for depth".format(args.depth_topic)) + rospy.Subscriber(args.depth_topic,Image,self.callback_depth) + prInfo("Subscribing to {} for PCL".format(args.pcl_topic)) + rospy.Subscriber(args.pcl_topic, PointCloud2, self.callback_pcl) + + + def callback_pcl(self, msg): + pcl_array = np.frombuffer(msg.data, dtype=np.float32).reshape((msg.height, msg.width, -1)) + self.pcl_array_xyz = pcl_array[:,:,:3] + self.pcl_array_rgb = pcl_array[:,:,3:] + self.pcl_current_seq = msg.header.seq + # rospy.loginfo('pcl received ({})...'.format(msg.header.seq)) + + def callback_rgb(self, msg): + self.rgb = self.br.imgmsg_to_cv2(msg, "bgr8") + self.rgb_current_seq = msg.header.seq + # rospy.loginfo('RGB received ({})...'.format(msg.header.seq)) + + def callback_depth(self, msg): + self.depth = self.br.imgmsg_to_cv2(msg, "mono16") + self.depth_current_seq = msg.header.seq + # rospy.loginfo('Depth received 
({})...'.format(msg.header.seq)) + + def is_ready(self): + ready = (self.rgb is not None) and (self.depth is not None) and (self.pcl_array_xyz is not None) + return ready + + def start(self): + + self.tracks = {} # all the tracks along time, we need to keep and history + + while not rospy.is_shutdown(): + + if self.is_ready(): + + image_count = self.current_image_count + self.current_image_count += 1 + + start_t = time.time() + + image_seq_unique = self.rgb_current_seq + now = datetime.now() + timestamp = now.strftime("%Y_%m_%d_%H_%M_%S_%f") + + if self.args.save or self.args.light_save: + rgb_path = os.path.join(self.save_dir_rgb, "{:08d}_seq_{:010d}_ts_{}.png".format(image_count, image_seq_unique, timestamp)) + cv2.imwrite(rgb_path, self.rgb) + prSuccess("Saved RGB to {}".format(rgb_path)) + + rgb_array = np.asarray(self.rgb) + + if self.args.save: + depth_path = os.path.join(self.save_dir_depth, "{:08d}_seq_{:010d}_ts_{}.png".format(image_count, image_seq_unique, timestamp)) + cv2.imwrite(depth_path, self.depth) + prSuccess("Saved depth to {}".format(depth_path)) + + depth_array = np.asarray(self.depth) + depth_array[depth_array > self.depth_array_max_threshold] = self.depth_array_max_threshold + + assert(depth_array.shape[0] == rgb_array.shape[0]) + assert(depth_array.shape[1] == rgb_array.shape[1]) + + # Process RGB array + if self.last_inferred_seq < self.rgb_current_seq: + + prInfo("Do inference on frame {}".format(self.rgb_current_seq)) + + # keep old poses for tracking + pose_results_last = self.pose_results + + tic = time.time() + mmdet_results = inference_detector(self.det_model, rgb_array) # list of detection rectangle i.e [(x1,y1,x2,y2), ...] + tac = time.time() + prInfo("Detection in {:.4f} sec (frame {}, number of human detection {})".format(tac-tic, self.rgb_current_seq, len(mmdet_results[0]))) + + # keep the person class bounding boxes. + person_results = process_mmdet_results(mmdet_results, self.args.det_cat_id) + + tic = time.time() + # test a single image, with a list of bboxes. 
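# Illustrative sketch (not part of the node): the same detect-then-pose flow used
# just below, reduced to a single image. The API calls mirror the ones in this
# file; the config/checkpoint paths and "frame.png" are placeholders.
from mmdet.apis import inference_detector, init_detector
from mmpose.apis import (inference_top_down_pose_model, init_pose_model,
                         process_mmdet_results)
import cv2

det_model = init_detector("det_config.py", "det_checkpoint.pth", device="cuda:0")
pose_model = init_pose_model("pose_config.py", "pose_checkpoint.pth", device="cuda:0")

img = cv2.imread("frame.png")
mmdet_results = inference_detector(det_model, img)        # per-class list of [x1, y1, x2, y2, score] arrays
person_results = process_mmdet_results(mmdet_results, 1)  # keep category 1 (person)
pose_results, _ = inference_top_down_pose_model(
    pose_model,
    img,
    person_results,
    bbox_thr=0.3,
    format="xyxy",
    dataset=pose_model.cfg.data["test"]["type"],
    return_heatmap=False,
    outputs=None)
# each entry of pose_results carries a "bbox" and a (17, 3) "keypoints" array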
+ self.pose_results, returned_outputs = inference_top_down_pose_model( + self.pose_model, + rgb_array, + person_results, + bbox_thr=self.args.bbox_thr, + format='xyxy', + dataset=self.dataset, + dataset_info=self.dataset_info, + return_heatmap=self.return_heatmap, + outputs=None) + tac = time.time() + prInfo("Poses in {:.4f} sec".format(tac-tic)) + + # get track id for each person instance + self.pose_results, self.next_id = get_track_id( + self.pose_results, + pose_results_last, + self.next_id, + use_oks=False, + tracking_thr=self.args.tracking_thr, + use_one_euro=self.args.euro, + fps=10) + + # produce an output image + if not self.args.no_show: + self.vis_img = rgb_array.copy() + + if self.display_all_detection and not self.args.no_show: + for c in range(len(mmdet_results)): + if len(mmdet_results[c]) > 0: + for bi in range(mmdet_results[c].shape[0]): + if mmdet_results[c][bi,4] > self.args.bbox_thr: + bbox = mmdet_results[c][bi,:4].copy().astype(np.int32) + bbox_ints = [int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])] + pt1 = ( min( max(0,bbox_ints[0]), depth_array.shape[1]), + min( max(0,bbox_ints[1]), depth_array.shape[0]) ) + pt2 = ( min( max(0,bbox_ints[2]), depth_array.shape[1]), + min( max(0,bbox_ints[3]), depth_array.shape[0]) ) + cv2.rectangle(self.vis_img, pt1, pt2, (255,255,255), 1) + cv2.putText(self.vis_img, "{:s} ({:.0f}%)".format(YOLO_COCO_80_CLASSES[c], mmdet_results[c][bi,4]*100), pt1, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) + + + #### post processing and 3D lifting #### + + # remove too old tracks + for idx, track in list(self.tracks.items()): + if abs(self.current_image_count - track["last_seen"]) > self.args.max_frames_remove_tracks: + prInfo("Removing track {}, not seen since frame {}, current is {}".format(idx, track["last_seen"], self.current_image_count)) + self.tracks.pop(idx) + + self.tracks_in_current_image = {} + + for res in self.pose_results: + + # for each instance + + bbox = res["bbox"] + keypoints = res["keypoints"] + idx = res["track_id"] + + if idx not in self.tracks.keys(): + prInfo("Adding a new track with idx {}".format(idx)) + self.tracks[idx] = {} + self.tracks[idx]["last_seen"] = self.current_image_count + self.tracks[idx]["keypoints_2d"] = [] + + # add keypoint to the current track + self.tracks[idx]["last_seen"] = self.current_image_count + self.tracks[idx]["keypoints_2d"].append(keypoints) + + self.tracks_in_current_image[idx] = { + "right_wrist_depth" : None, + "right_wrist_pose" : None, + "left_wrist_depth" : None, + "left_wrist_pose" : None, + "depth_center" : None, + "pose_center" : None, + "pose_from" : None + } + + # if history is long enough, process the trajectory for MotionBERT + if len(self.tracks[idx]["keypoints_2d"]) >= self.args.mb_clip_len: + prInfo("Running MotionBERT for track {}".format(idx)) + + # prepare motion + motion = np.asarray(self.tracks[idx]["keypoints_2d"]) # T, 17, 3 + motion = motion[-self.args.mb_clip_len:, :, :] # keep only the required len + assert(motion.shape[1] == 17) + assert(motion.shape[2] == 3) + motion_h36 = coco2h36m(motion) # input is h36 format + motion_h36_scaled = crop_scale(motion_h36) # scale [1,1], normalize, crop + + with torch.no_grad(): + current_input = torch.Tensor(motion_h36_scaled).unsqueeze(0).cuda() + tic = time.time() + predicted_3d_pos = self.motionbert_3d_model(current_input) + tac = time.time() + prInfo("MotionBERT in {:.4f} sec".format(tac-tic)) + + # root relative + predicted_3d_pos[:,:,0,:] = 0 # [1,T,17,3] + + # TODO : change it because a bit weird it is not 
aligned with 2D poses because of the history ! + predicted_3d_pos_np = predicted_3d_pos[0,-1,:,:].cpu().numpy() # keep only the last prediction + if "keypoints_3d" in self.tracks[idx].keys(): + self.tracks[idx]["keypoints_3d"].append(predicted_3d_pos_np) + else: + self.tracks[idx]["keypoints_3d"] = [predicted_3d_pos_np] + + + # Draw bounding bbox + bbox = bbox.astype(np.int32) + bbox_ints = [int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])] + pt1 = ( min( max(0,bbox_ints[0]), depth_array.shape[1]), + min( max(0,bbox_ints[1]), depth_array.shape[0]) ) + pt2 = ( min( max(0,bbox_ints[2]), depth_array.shape[1]), + min( max(0,bbox_ints[3]), depth_array.shape[0]) ) + color = RANDOM_COLORS[idx % 255] + color_tuple = (int(color[0]), int(color[1]), int(color[2])) + + if not self.args.no_show: + cv2.rectangle( self.vis_img, pt1, pt2, color_tuple, 2) + + body_center_joints = [] # to store center of lsho, rsho, lhip, rhip in pixels + + for j in range(keypoints.shape[0]): + + kp = keypoints[j,:] + confidence = int(kp[2] * 255) + confidence_color = (self.confidence_cmap[min(255,confidence)]*255).astype(np.uint8) + + if confidence > self.args.kpt_thr and kp[0] > 0 and kp[1] > 0 and kp[0] < depth_array.shape[1] and kp[1] < depth_array.shape[0]: + + if (j == 5) or (j == 6) or (j == 11) or (j == 12): + # one keypoint of the torso + body_center_joints.append(kp) + + if not self.args.no_show: + # kp_color_tuple = (int(confidence_color[0]), int(confidence_color[1]), int(confidence_color[2])) + cv2.circle(self.vis_img, (int(kp[0]), int(kp[1])), 2, color_tuple, thickness = 3) + + # if wrists, find depth and pose + + if (j == 10): + # right wrist + depth_wrist = depth_array[int(kp[1]), int(kp[0])] + pose_wrist = self.pcl_array_xyz[int(kp[1]), int(kp[0]),:] + self.tracks_in_current_image[idx]["right_wrist_depth"] = depth_wrist + self.tracks_in_current_image[idx]["right_wrist_pose"] = pose_wrist + if not self.light_display and not self.args.no_show: + cv2.drawMarker(self.vis_img, (int(kp[0]), int(kp[1])), color = color_tuple, thickness = 3, + markerType = cv2.MARKER_CROSS, line_type = cv2.LINE_AA, + markerSize = 16) + cv2.putText(self.vis_img, "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format(depth_wrist/10, pose_wrist[0], pose_wrist[1], pose_wrist[2]), (int(kp[0]), int(kp[1])), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 255), 2) + + elif (j == 9): + # left wrist + depth_wrist = depth_array[int(kp[1]), int(kp[0])] + pose_wrist = self.pcl_array_xyz[int(kp[1]), int(kp[0]),:] + self.tracks_in_current_image[idx]["left_wrist_depth"] = depth_wrist + self.tracks_in_current_image[idx]["left_wrist_pose"] = pose_wrist + if not self.light_display and not self.args.no_show: + cv2.drawMarker(self.vis_img, (int(kp[0]), int(kp[1])), color = color_tuple, thickness = 3, + markerType = cv2.MARKER_CROSS, line_type = cv2.LINE_AA, + markerSize = 16) + cv2.putText(self.vis_img, "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format(depth_wrist/10, pose_wrist[0], pose_wrist[1], pose_wrist[2]), (int(kp[0]), int(kp[1])), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 255), 2) + + # find the body center + if len(body_center_joints) == 4: + # if we managed to find the 4 points of the torso, search on the torso + body_center_joints = np.asarray(body_center_joints) # lsho, rsho, lhip, rhip + lsho = body_center_joints[0,:] + rsho = body_center_joints[1,:] + lhip = body_center_joints[2,:] + rhip = body_center_joints[3,:] + + # find 4 points between lsho and rhip and 4 points between rsho and lhip to find something more precise + seg_steps = [0.0, 0.25, 0.50, 0.75, 1.0] + 
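# Illustrative helper (the name and signature are ours, not the patch's): the
# torso-centre search below samples five points along each of the two torso
# diagonals (lsho->rhip and rsho->lhip) and averages the valid depth readings.
# The same idea in isolation, for the depth image alone:
import numpy as np

def torso_depth_mm(depth_array, lsho, rsho, lhip, rhip,
                   steps=(0.0, 0.25, 0.5, 0.75, 1.0)):
    depths = []
    for s in steps:
        for p in (s * lsho + (1 - s) * rhip, s * rsho + (1 - s) * lhip):
            x, y = int(p[0]), int(p[1])
            if 0 <= x < depth_array.shape[1] and 0 <= y < depth_array.shape[0]:
                d = depth_array[y, x]
                if d > 0:                 # 0 means "no depth reading"
                    depths.append(d)
    return float(np.mean(depths)) if len(depths) > 3 else None  # >= 4 samples, as below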
depths_torso = [] + poses_torso = [] + for step in seg_steps: + + p1 = step * lsho + (1 - step) * rhip + if p1[0] < depth_array.shape[1] and p1[1] < depth_array.shape[0]: + depth_p1 = depth_array[int(p1[1]), int(p1[0])] + pose_p1 = self.pcl_array_xyz[int(p1[1]), int(p1[0]), :] + if depth_p1 > 0: + depths_torso.append(depth_p1) + poses_torso.append(pose_p1) + + p2 = step * rsho + (1 - step) * lhip + if p2[0] < depth_array.shape[1] and p2[1] < depth_array.shape[0]: + depth_p2 = depth_array[int(p2[1]), int(p2[0])] + pose_p2 = self.pcl_array_xyz[int(p2[1]), int(p2[0]), :] + if depth_p2 > 0: + depths_torso.append(depth_p2) + poses_torso.append(pose_p2) + + if not self.args.no_show: + # draw to check + cv2.drawMarker(self.vis_img, (int(p1[0]), int(p1[1])), color = color_tuple, thickness = 1, + markerType = cv2.MARKER_DIAMOND, line_type = cv2.LINE_AA, + markerSize = 8) + cv2.drawMarker(self.vis_img, (int(p2[0]), int(p2[1])), color = color_tuple, thickness = 1, + markerType = cv2.MARKER_DIAMOND, line_type = cv2.LINE_AA, + markerSize = 8) + + if len(depths_torso) > 3: + # at least 4 points to average decently + depth_body = np.asarray(depths_torso).mean() + pose_body = np.asarray(poses_torso).mean(axis = 0) + self.tracks_in_current_image[idx]["depth_center"] = depth_body # mm + self.tracks_in_current_image[idx]["pose_center"] = pose_body # m + self.tracks_in_current_image[idx]["pose_from"] = "torso" + + # just for drawing + body_center = np.mean(body_center_joints, axis = 0) + # Draw center of body + body_center = (int(body_center[0]), int(body_center[1])) + if not self.light_display and not self.args.no_show: + cv2.drawMarker(self.vis_img, body_center, color = color_tuple, thickness = 3, + markerType = cv2.MARKER_TILTED_CROSS, line_type = cv2.LINE_AA, + markerSize = 16) + cv2.putText(self.vis_img, "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format(depth_body/10, pose_body[0], pose_body[1], pose_body[2]), (int(body_center[0]), int(body_center[1])), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 3) + + # # fetch depth and pose at torso center + # if body_center[0] < depth_array.shape[1] and body_center[1] < depth_array.shape[0]: + # depth_center = depth_array[body_center[1], body_center[0]] + # pose_center = self.pcl_array_xyz[body_center[1], body_center[0],:] + # if not self.light_display: + # cv2.putText(self.vis_img, "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format(depth_center/10, pose_center[0], pose_center[1], pose_center[2]), (int(body_center[0]), int(body_center[1])), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 3) + # if (depth_center != 0): + # self.tracks_in_current_image[idx]["depth_center"] = depth_center # mm + # self.tracks_in_current_image[idx]["pose_center"] = pose_center # m + # self.tracks_in_current_image[idx]["pose_from"] = "torso" + # # prSuccess("Publishing coordinates {:.2f} {:.2f} {:.2f}".format(pose_center[0], pose_center[1], pose_center[2])) + # # self.goal_pub.publish(Point(x = pose_center[0], y = pose_center[1], z = pose_center[2])) + + else: + # if we did not managed to find the 4 points of the torso, search in the bbox + prWarning("Can't use body center from shoulders and hips, use center of box for track {} || UPDATE : do nothing".format(idx)) + + if False: + # Draw center of bbox + bbox_center = (int(pt1[0]/2 + pt2[0]/2), int(pt1[1]/2 + pt2[1]/2)) + if not self.light_display: + cv2.drawMarker(self.vis_img, bbox_center, color = color_tuple, thickness = 3, + markerType = cv2.MARKER_CROSS, line_type = cv2.LINE_AA, + markerSize = 16) + + # fetch depth and pose at bbox center + if 
bbox_center[0] < depth_array.shape[1] and bbox_center[1] < depth_array.shape[0]: + depth_center = depth_array[bbox_center[1], bbox_center[0]] + pose_center = self.pcl_array_xyz[bbox_center[1], bbox_center[0],:] + if not self.light_display: + cv2.putText(self.vis_img, "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format(depth_center/10, pose_center[0], pose_center[1], pose_center[2]), (int(bbox_center[0]), int(bbox_center[1])), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 3) + if (depth_center != 0): + self.tracks_in_current_image[idx]["depth_center"] = depth_center # mm + self.tracks_in_current_image[idx]["pose_center"] = pose_center # m + self.tracks_in_current_image[idx]["pose_from"] = "bbox" + # prSuccess("Publishing coordinates {:.2f} {:.2f} {:.2255f}".format(pose_center[0], pose_center[1], pose_center[2])) + # self.goal_pub.publish(Point(x = pose_center[0], y = pose_center[1], z = pose_center[2])) + + # draw skeleton + if not self.args.no_show: + for limb in COCO17_JOINTS_LIMBS: + start = keypoints[limb[0],:] + end = keypoints[limb[1],:] + start_point = (int(start[0]), int(start[1])) + end_point = (int(end[0]), int(end[1])) + if (start[2] > self.args.kpt_thr) and (end[2] > self.args.kpt_thr): + cv2.line(self.vis_img, start_point, end_point, color = color_tuple, thickness = 3) + + min_depth = 1e6 # mm + min_depth_idx = -1 + for idx, track_info in self.tracks_in_current_image.items(): + depth = track_info["depth_center"] + if depth is not None: + if depth < min_depth: + min_depth = depth + min_depth_idx = idx + + if (min_depth_idx != -1): + pose_closest = self.tracks_in_current_image[min_depth_idx]["pose_center"] + prInfo("Using track {} as it is the closest".format(min_depth_idx)) + self.goal_pub.publish(Point(x = pose_closest[0], y = pose_closest[1], z = pose_closest[2])) + prSuccess("Publishing coordinates {:.2f} {:.2f} {:.2f}".format(pose_closest[0], pose_closest[1], pose_closest[2])) + if not self.args.no_show: + cv2.putText(self.vis_img, "{:.2f} {:.2f} {:.2f}".format(pose_closest[0], pose_closest[1], pose_closest[2]), (30,30), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (255, 255, 255), 5) + cv2.putText(self.vis_img, "{:.2f} {:.2f} {:.2f}".format(pose_closest[0], pose_closest[1], pose_closest[2]), (30,30), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 0), 3) + else: + if not self.args.no_show: + cv2.putText(self.vis_img, "No tracks with pose found", (30,30), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (255, 255, 255), 5) + cv2.putText(self.vis_img, "No tracks with pose found", (30,30), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 0), 3) + + self.last_inferred_seq = self.rgb_current_seq + + if self.args.save and not self.args.no_show: + results_path = os.path.join(self.save_dir_result, "{:08d}_seq_{:010d}_ts_{}.png".format(image_count, image_seq_unique, timestamp)) + cv2.imwrite(results_path, self.vis_img) + prSuccess("Saved result to {}".format(results_path)) + + else: + prWarning("No inference because the current RGB frame has already been processed") + + if not self.args.no_show: + depth_array_norm = ((depth_array - depth_array.min())) / (depth_array.max() - depth_array.min()) + depth_array_norm = depth_array_norm * 255 + depth_array_norm = depth_array_norm.astype(np.uint8) + depth_array_norm_colored = (self.depth_cmap[depth_array_norm] * 255).astype(np.uint8) + + if self.args.save: + depth_color_path = os.path.join(self.save_dir_depth_color, "{:08d}_seq_{:010d}_ts_{}.png".format(image_count, image_seq_unique, timestamp)) + cv2.imwrite(depth_color_path, depth_array_norm_colored) + prSuccess("Saved depth color (scaled) to 
{}".format(depth_color_path)) + + if self.args.save or self.args.light_save: + pcl_path = os.path.join(self.save_dir_pcl_bin, "{:08d}_seq_{:010d}_ts_{}.bin".format(image_count, image_seq_unique, timestamp)) + self.pcl_array_xyz.tofile(pcl_path) + prSuccess("Saved pcl to {}".format(pcl_path)) + + if self.vis_img is not None: + full_display_array = np.zeros((rgb_array.shape[0] * 2, rgb_array.shape[1], 3), dtype = np.uint8) + full_display_array[:rgb_array.shape[0], : ,:] = self.vis_img + full_display_array[rgb_array.shape[0]:, : ,:] = depth_array_norm_colored + + cv2.imshow("RGBD window", full_display_array) + cv2.waitKey(3) + + end_t = time.time() + prInfoBold("Processed frame {} in {:.4f} sec".format(self.current_image_count, end_t-start_t)) + + + + + else: + print("Images are None !") + + self.loop_rate.sleep() + +if __name__ == '__main__': + + ## Parser with params + parser = ArgumentParser() + parser.add_argument('--det_config', type=str, default = "./configs/detection/yolov3_d53_320_273e_coco.py", help='Config file for detection') + parser.add_argument('--det_checkpoint', type=str, default = "./models/yolov3_d53_320_273e_coco-421362b6.pth", help='Checkpoint file for detection') + parser.add_argument('--pose_config', type=str, default = "./configs/pose/ViTPose_small_coco_256x192.py", help='Config file for pose') + parser.add_argument('--pose_checkpoint', type=str, default = "./models/vitpose_small.pth", help='Checkpoint file for pose') + parser.add_argument( + '--device', + default='cuda:0', + help='Device used for inference') + parser.add_argument( + '--det_cat_id', + type=int, + default=1, + help='Category id for bounding box detection model (person)') + parser.add_argument( + '--bbox_thr', + type=float, + default=0.3, + help='Bounding box score threshold') + parser.add_argument( + '--kpt_thr', + type=float, + default=0.3, + help='Keypoint score threshold') + parser.add_argument( + '--tracking_thr', + type=float, + default=0.3, + help='Tracking threshold') + parser.add_argument( + '--euro', + action='store_true', + help='Using One_Euro_Filter for smoothing') + + parser.add_argument('--rgb_topic', default = "orbbec/rgb", type=str, help='ROS topic for RGB image') + parser.add_argument('--depth_topic', default = "orbbec/depth", type=str, help='ROS topic for depth image') + parser.add_argument('--pcl_topic', default = "orbbec/pcl", type=str, help='ROS topic for pcl') + parser.add_argument( + '--no_show', + action='store_true', + default=False, + help='whether to show visualizations.') + parser.add_argument( + '--save', + action='store_true', + default=False, + help='whether to save images (rgb and d and predictions and pcl)') + parser.add_argument( + '--light_save', + action='store_true', + default=False, + help='whether to save only rgb and pcl (not optimized use the light_save of visualizer for optimized saving)') + parser.add_argument( + '--display_all_detection', "-dad", + action='store_true', + default=False, + help='whether to display all detections or only human') + parser.add_argument( + '--light_display', "-ld", + action='store_true', + default=False, + help='whether to display only skeletons') + parser.add_argument( + '--fps', + type=int, + default=30, + help='Node and recording fps') + parser.add_argument('--depth_cmap', default = "jet", type=str, help='mpl colormap for depth image') + + parser.add_argument('--mb_3d_config', type=str, default = "./configs/pose3d/MB_ft_h36m.yaml", help='Config file for 3D poses') + parser.add_argument('--mb_checkpoint', type=str, default = 
"./checkpoint/pose3d/MB_train_h36m/best_epoch.bin", help='Checkpoint file for 3D poses') + parser.add_argument( + '--mb_clip_len', + type=int, + default=10, + help='Number of past frames to use for MotionBERT (default in model is 243)') + parser.add_argument( + '--max_frames_remove_tracks', + type=int, + default=2, + help='Number frames without the track present to keep going before removing a track') + + + args = parser.parse_args() + + assert has_mmdet, 'Please install mmdet to run the demo.' + assert args.det_config is not None + assert args.det_checkpoint is not None + + if (args.save or args.light_save) and args.no_show: + print("Do not use the no_show mode if save is enabled, no rendering is done if --no_show") + + prInfo("Loaded with args : {}".format(args)) + + rospy.init_node("python_orbbec_inference", anonymous=True) + my_node = InferenceNodeRGBD(args) + my_node.start() + cv2.destroyAllWindows() \ No newline at end of file diff --git a/rgbd_detect.py b/rgbd_detect.py new file mode 100644 index 0000000..662d30f --- /dev/null +++ b/rgbd_detect.py @@ -0,0 +1,1011 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# mmdet and mmpose import +from mmpose.apis import ( + get_track_id, + inference_top_down_pose_model, + init_pose_model, + process_mmdet_results, + vis_pose_tracking_result, +) +from mmpose.datasets import DatasetInfo + +try: + from mmdet.apis import inference_detector, init_detector + + has_mmdet = True +except (ImportError, ModuleNotFoundError): + has_mmdet = False + +# ros related import +import rospy +from sensor_msgs.msg import Image, PointCloud2 +from geometry_msgs.msg import TransformStamped +from cv_bridge import CvBridge +import tf2_ros + +# other import +import cv2 +import os +import matplotlib.pyplot as plt +from argparse import ArgumentParser +from datetime import datetime +import time +import json +import warnings +import numpy as np +from PyKDL import Rotation + +# utils import +from utils import * + +# remove numpy scientific notation +np.set_printoptions(suppress=True) + + +class InferenceNodeRGBD(object): + def __init__(self, args): + + # init args + self.args = args + + # init detector and pose + self.det_model = init_detector( + args.det_config, args.det_checkpoint, device=args.device.lower() + ) + + self.pose_model = init_pose_model( + args.pose_config, args.pose_checkpoint, device=args.device.lower() + ) + + self.dataset = self.pose_model.cfg.data["test"]["type"] + self.dataset_info = self.pose_model.cfg.data["test"].get( + "self.dataset_info", None + ) + if self.dataset_info is None: + warnings.warn( + "Please set `self.dataset_info` in the config." 
+ "Check https://github.com/open-mmlab/mmpose/pull/663 for details.", + DeprecationWarning, + ) + else: + self.dataset_info = DatasetInfo(self.dataset_info) + + self.return_heatmap = False + + self.next_id = 0 + self.pose_results = [] + self.count_frames = 0 + self.tracks_in_current_image = {} + + ## Init for node and save path + + self.rgb = None # Image frame + self.depth = None # Image frame + + self.pcl_array_rgb = None + self.pcl_array_xyz = None + + self.depth_array_max_threshold = ( + 20000 # 3000 # does not apply when saving depth mono16 image + ) + + # viewing options + self.depth_cmap = get_mpl_colormap(args.depth_cmap) + self.confidence_cmap = get_mpl_colormap("viridis") + self.vis_img = None # output image RGB + detections + self.view_all_classes_dets = True + self.display_all_detection = args.display_all_detection + self.light_display = args.light_display + + self.pcl_current_seq = -1 + self.rgb_current_seq = -1 + self.last_inferred_seq = -1 + self.depth_current_seq = -1 + self.current_image_count = 0 + + self.br = CvBridge() + + prInfo("Setting node rate to {} fps".format(args.fps)) + self.loop_rate = rospy.Rate(args.fps) + + # make the output path + now = datetime.now() + timestamp = now.strftime("%Y_%m_%d_%H_%M_%S") + self.save_dir = os.path.join("output", "record_{:s}".format(timestamp)) + self.metadata = os.path.join(self.save_dir, "metadata.json") + self.save_dir_rgb = os.path.join(self.save_dir, "rgb") + self.save_dir_depth = os.path.join(self.save_dir, "depth") + self.save_dir_result = os.path.join(self.save_dir, "output") + self.save_dir_pcl_bin = os.path.join(self.save_dir, "pcl") + + if args.save or args.light_save: + prInfo( + "Saving to {}/[rgb][depth][depth_color][output][pcl]".format( + self.save_dir + ) + ) + if not os.path.exists(self.save_dir): + prInfo( + "Creating directories to {}/[rgb][depth][depth_color][output][pcl]".format( + self.save_dir + ) + ) + os.makedirs(self.save_dir) + os.makedirs(self.save_dir_rgb) + os.makedirs(self.save_dir_pcl_bin) + + if args.save: + os.makedirs(self.save_dir_depth) + os.makedirs(self.save_dir_result) + + args_dic = vars(args) + with open(self.metadata, "w") as fp: + json.dump(args_dic, fp) + + prSuccess( + "Created directories to {}/[rgb][depth][depth_color][output][pcl]".format( + self.save_dir + ) + ) + time.sleep(1) + + # Publishers + self.goal_pub = rospy.Publisher( + args.namespace + "/human", TransformStamped, queue_size=1 + ) + + self.tf_br = tf2_ros.TransformBroadcaster() + + # Subscribers + rgb_topic = args.namespace + "/rgb" + depth_topic = args.namespace + "/depth" + pcl_topic = args.namespace + "/pcl" + prInfo("Subscribing to {} for RGB".format(rgb_topic)) + rospy.Subscriber(rgb_topic, Image, self.callback_rgb) + prInfo("Subscribing to {} for depth".format(depth_topic)) + rospy.Subscriber(depth_topic, Image, self.callback_depth) + prInfo("Subscribing to {} for PCL".format(pcl_topic)) + rospy.Subscriber(pcl_topic, PointCloud2, self.callback_pcl) + + self.rgb_frame_id = None + + def callback_pcl(self, msg): + if self.args.flip: + pcl_array = np.frombuffer(msg.data, dtype=np.float32).reshape( + (msg.height, msg.width, -1) + )[::-1, ::-1, :] + else: + pcl_array = np.frombuffer(msg.data, dtype=np.float32).reshape( + (msg.height, msg.width, -1) + ) + + # pcl_array = pcl_array[::-1, :, :] + self.pcl_array_xyz = pcl_array[:, :, :3] + # self.pcl_array_rgb = pcl_array[:,:,3:] + self.pcl_current_seq = msg.header.seq + # rospy.loginfo('pcl received ({})...'.format(msg.header.seq)) + + def callback_rgb(self, msg): + if 
self.rgb_frame_id != msg.header.frame_id: + self.rgb_frame_id = msg.header.frame_id + if self.args.flip: + self.rgb = cv2.flip(self.br.imgmsg_to_cv2(msg, "bgr8"), -1) + else: + self.rgb = self.br.imgmsg_to_cv2(msg, "bgr8") + + # self.rgb = cv2.rotate(self.rgb, cv2.ROTATE_180) + self.rgb_current_seq = msg.header.seq + # rospy.loginfo('RGB received ({})...'.format(msg.header.seq)) + self.rgb_timestamp = msg.header.stamp + + def callback_depth(self, msg): + if self.args.flip: + self.depth = cv2.flip(self.br.imgmsg_to_cv2(msg, "mono16"), -1) + else: + self.depth = self.br.imgmsg_to_cv2(msg, "mono16") + + self.depth_current_seq = msg.header.seq + # rospy.loginfo('Depth received ({})...'.format(msg.header.seq)) + + def is_ready(self): + ready = ( + (self.rgb is not None) + and (self.depth is not None) + and (self.pcl_array_xyz is not None) + ) + return ready + + def save_rgb(self, image_count, image_seq_unique, timestamp): + prWarning("Saving images here may suffer synchronization issues, use visualizer.py for lighter save") + rgb_path = os.path.join( + self.save_dir_rgb, + "{:08d}_seq_{:010d}_ts_{}.png".format( + image_count, image_seq_unique, timestamp + ), + ) + cv2.imwrite(rgb_path, self.rgb) + prSuccess("Saved RGB to {}".format(rgb_path)) + + def save_depth(self, image_count, image_seq_unique, timestamp): + prWarning("Saving images here may suffer synchronization issues, use visualizer.py for lighter save") + depth_path = os.path.join( + self.save_dir_depth, + "{:08d}_seq_{:010d}_ts_{}.png".format( + image_count, image_seq_unique, timestamp + ), + ) + cv2.imwrite(depth_path, self.depth) + prSuccess("Saved depth to {}".format(depth_path)) + + def save_output_image(self, image_count, image_seq_unique, timestamp): + prWarning("Saving images here may suffer synchronization issues, use visualizer.py for lighter save") + results_path = os.path.join( + self.save_dir_result, + "{:08d}_seq_{:010d}_ts_{}.png".format( + image_count, image_seq_unique, timestamp + ), + ) + cv2.imwrite(results_path, self.vis_img) + prSuccess("Saved result to {}".format(results_path)) + + def save_pcl(self, image_count, image_seq_unique, timestamp): + prWarning("Saving images here may suffer synchronization issues, use visualizer.py for lighter save") + pcl_path = os.path.join( + self.save_dir_pcl_bin, + "{:08d}_seq_{:010d}_ts_{}.bin".format( + image_count, image_seq_unique, timestamp + ), + ) + self.pcl_array_xyz.tofile(pcl_path) + prSuccess("Saved pcl to {}".format(pcl_path)) + + def plot_mmdet_bbox(self, mmdet_results, array_shape): + for c in range(len(mmdet_results)): + if len(mmdet_results[c]) > 0: + for bi in range(mmdet_results[c].shape[0]): + if mmdet_results[c][bi, 4] > self.args.bbox_thr: + bbox = ( + mmdet_results[c][bi, :4] + .copy() + .astype(np.int32) + ) + bbox_ints = [ + int(bbox[0]), + int(bbox[1]), + int(bbox[2]), + int(bbox[3]), + ] + pt1 = ( + min( + max(0, bbox_ints[0]), + array_shape[1], + ), + min( + max(0, bbox_ints[1]), + array_shape[0], + ), + ) + pt2 = ( + min( + max(0, bbox_ints[2]), + array_shape[1], + ), + min( + max(0, bbox_ints[3]), + array_shape[0], + ), + ) + cv2.rectangle( + self.vis_img, pt1, pt2, (255, 255, 255), 1 + ) + cv2.putText( + self.vis_img, + "{:s} ({:.0f}%)".format( + YOLO_COCO_80_CLASSES[c], + mmdet_results[c][bi, 4] * 100, + ), + pt1, + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 1, + ) + + def plot_mmdet_person_bbox(self, idx, bbox, array_shape): + bbox_ints = [ + int(bbox[0]), + int(bbox[1]), + int(bbox[2]), + int(bbox[3]), + ] + pt1 = ( + min(max(0, 
bbox_ints[0]), array_shape[1]), + min(max(0, bbox_ints[1]), array_shape[0]), + ) + pt2 = ( + min(max(0, bbox_ints[2]), array_shape[1]), + min(max(0, bbox_ints[3]), array_shape[0]), + ) + color = RANDOM_COLORS[idx] + color_tuple = (int(color[0]), int(color[1]), int(color[2])) + + cv2.rectangle(self.vis_img, pt1, pt2, color_tuple, 2) + + def process_keypoints(self, keypoints, depth_array, idx): + body_center_joints = ( + [] + ) # to store center of lsho, rsho, lhip, rhip in pixels + color = RANDOM_COLORS[idx] + color_tuple = (int(color[0]), int(color[1]), int(color[2])) + + for j in range(keypoints.shape[0]): + + kp = keypoints[j, :] + confidence = int(kp[2] * 255) + confidence_color = ( + self.confidence_cmap[min(255, confidence)] * 255 + ).astype(np.uint8) + + if ( + kp[2] > self.args.kpt_thr + and kp[0] > 0 + and kp[1] > 0 + and kp[0] < depth_array.shape[1] + and kp[1] < depth_array.shape[0] + ): + + if (j == 5) or (j == 6) or (j == 11) or (j == 12): + # one keypoint of the torso + body_center_joints.append(kp) + + if not self.args.no_show: + # kp_color_tuple = (int(confidence_color[0]), int(confidence_color[1]), int(confidence_color[2])) + cv2.circle( + self.vis_img, + (int(kp[0]), int(kp[1])), + 2, + color_tuple, + thickness=3, + ) + + # if wrists, find depth and pose + + if j == 10: + # right wrist + depth_wrist = depth_array[int(kp[1]), int(kp[0])] + pose_wrist = self.pcl_array_xyz[ + int(kp[1]), int(kp[0]), : + ] + self.tracks_in_current_image[idx][ + "right_wrist_depth" + ] = depth_wrist + self.tracks_in_current_image[idx][ + "right_wrist_pose" + ] = pose_wrist + if not self.light_display and not self.args.no_show: + cv2.drawMarker( + self.vis_img, + (int(kp[0]), int(kp[1])), + color=color_tuple, + thickness=3, + markerType=cv2.MARKER_CROSS, + line_type=cv2.LINE_AA, + markerSize=16, + ) + cv2.putText( + self.vis_img, + "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format( + depth_wrist / 10, + pose_wrist[0], + pose_wrist[1], + pose_wrist[2], + ), + (int(kp[0]), int(kp[1])), + cv2.FONT_HERSHEY_SIMPLEX, + 0.8, + (255, 0, 255), + 2, + ) + + elif j == 9: + # left wrist + depth_wrist = depth_array[int(kp[1]), int(kp[0])] + pose_wrist = self.pcl_array_xyz[ + int(kp[1]), int(kp[0]), : + ] + self.tracks_in_current_image[idx][ + "left_wrist_depth" + ] = depth_wrist + self.tracks_in_current_image[idx][ + "left_wrist_pose" + ] = pose_wrist + if not self.light_display and not self.args.no_show: + cv2.drawMarker( + self.vis_img, + (int(kp[0]), int(kp[1])), + color=color_tuple, + thickness=3, + markerType=cv2.MARKER_CROSS, + line_type=cv2.LINE_AA, + markerSize=16, + ) + cv2.putText( + self.vis_img, + "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format( + depth_wrist / 10, + pose_wrist[0], + pose_wrist[1], + pose_wrist[2], + ), + (int(kp[0]), int(kp[1])), + cv2.FONT_HERSHEY_SIMPLEX, + 0.8, + (255, 0, 255), + 2, + ) + + return body_center_joints + + def get_depth_and_poses_of_torso(self, depth_array, lsho, rsho, lhip, rhip, idx): + + color = RANDOM_COLORS[idx] + color_tuple = (int(color[0]), int(color[1]), int(color[2])) + + # find 4 points between lsho and rhip and 4 points between rsho and lhip to find something more precise + seg_steps = [0.0, 0.25, 0.50, 0.75, 1.0] + depths_torso = [] + poses_torso = [] + for step in seg_steps: + + p1 = step * lsho + (1 - step) * rhip + if ( + p1[0] < depth_array.shape[1] + and p1[1] < depth_array.shape[0] + ): + depth_p1 = depth_array[int(p1[1]), int(p1[0])] + pose_p1 = self.pcl_array_xyz[ + int(p1[1]), int(p1[0]), : + ] + if depth_p1 > 0: + depths_torso.append(depth_p1) + 
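                # depth_array holds the raw mono16 depth in millimetres, while the
                # matching pcl_array_xyz entry is the metric (x, y, z) point in metres;
                # appending them together keeps the two lists index-aligned for the
                # averaging done by the caller.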
poses_torso.append(pose_p1) + + p2 = step * rsho + (1 - step) * lhip + if ( + p2[0] < depth_array.shape[1] + and p2[1] < depth_array.shape[0] + ): + depth_p2 = depth_array[int(p2[1]), int(p2[0])] + pose_p2 = self.pcl_array_xyz[ + int(p2[1]), int(p2[0]), : + ] + if depth_p2 > 0: + depths_torso.append(depth_p2) + poses_torso.append(pose_p2) + + if not self.args.no_show: + # draw to check + cv2.drawMarker( + self.vis_img, + (int(p1[0]), int(p1[1])), + color=color_tuple, + thickness=1, + markerType=cv2.MARKER_DIAMOND, + line_type=cv2.LINE_AA, + markerSize=8, + ) + cv2.drawMarker( + self.vis_img, + (int(p2[0]), int(p2[1])), + color=color_tuple, + thickness=1, + markerType=cv2.MARKER_DIAMOND, + line_type=cv2.LINE_AA, + markerSize=8, + ) + + return depths_torso, poses_torso + + def plot_body_pose_data(self, body_center, depth_body, pose_body, idx): + + color = RANDOM_COLORS[idx] + color_tuple = (int(color[0]), int(color[1]), int(color[2])) + + cv2.drawMarker( + self.vis_img, + body_center, + color = color_tuple, + thickness=3, + markerType=cv2.MARKER_TILTED_CROSS, + line_type=cv2.LINE_AA, + markerSize=16, + ) + cv2.putText( + self.vis_img, + "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format( + depth_body / 10, + pose_body[0], + pose_body[1], + pose_body[2], + ), + (int(body_center[0]), int(body_center[1])), + cv2.FONT_HERSHEY_SIMPLEX, + 0.8, + (0, 255, 0), + 3, + ) + + def plot_skeleton_2d(self, keypoints, idx): + + color = RANDOM_COLORS[idx] + color_tuple = (int(color[0]), int(color[1]), int(color[2])) + + for limb in COCO17_JOINTS_LIMBS: + start = keypoints[limb[0], :] + end = keypoints[limb[1], :] + start_point = (int(start[0]), int(start[1])) + end_point = (int(end[0]), int(end[1])) + if (start[2] > self.args.kpt_thr) and ( + end[2] > self.args.kpt_thr + ): + cv2.line( + self.vis_img, + start_point, + end_point, + color = color_tuple, + thickness=3, + ) + + def plot_det_text_info(self, pose_closest): + if pose_closest is not None: + cv2.putText( + self.vis_img, + "{:.2f} {:.2f} {:.2f}".format( + pose_closest[0], pose_closest[1], pose_closest[2] + ), + (30, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1.2, + (255, 255, 255), + 5, + ) + cv2.putText( + self.vis_img, + "{:.2f} {:.2f} {:.2f}".format( + pose_closest[0], pose_closest[1], pose_closest[2] + ), + (30, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1.2, + (0, 0, 0), + 3, + ) + else: + cv2.putText( + self.vis_img, + "No tracks with pose found", + (30, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1.2, + (255, 255, 255), + 5, + ) + cv2.putText( + self.vis_img, + "No tracks with pose found", + (30, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1.2, + (0, 0, 0), + 3, + ) + + def start(self): + + while not rospy.is_shutdown(): + + if self.is_ready(): + + image_count = self.current_image_count + image_seq_unique = self.rgb_current_seq + now = datetime.now() + timestamp = now.strftime("%Y_%m_%d_%H_%M_%S_%f") + + if self.args.save or self.args.light_save: + self.save_rgb(image_count, image_seq_unique, timestamp) + + rgb_array = np.array(self.rgb) + + if self.args.save: + self.save_depth(image_count, image_seq_unique, timestamp) + + depth_array = np.array(self.depth) + depth_array[depth_array > self.depth_array_max_threshold] = ( + self.depth_array_max_threshold + ) + + assert depth_array.shape[0] == rgb_array.shape[0] + assert depth_array.shape[1] == rgb_array.shape[1] + + # Process RGB array + if self.last_inferred_seq < self.rgb_current_seq: + + current_frame_processing = self.rgb_current_seq + current_timestamp = self.rgb_timestamp + current_frame_id = self.rgb_frame_id + prInfo("Do inference 
on frame {}".format(current_frame_processing)) + + # keep old poses for tracking + pose_results_last = self.pose_results + + tic = time.time() + mmdet_results = inference_detector( + self.det_model, rgb_array + ) # list of detection rectangle i.e [(x1,y1,x2,y2), ...] + tac = time.time() + prInfo( + "Detection in {:.4f} sec (frame {}, number of human detection {})".format( + tac - tic, current_frame_processing, len(mmdet_results[0]) + ) + ) + + # keep the person class bounding boxes. + person_results = process_mmdet_results( + mmdet_results, self.args.det_cat_id + ) + + tic = time.time() + # test a single image, with a list of bboxes. + self.pose_results, returned_outputs = inference_top_down_pose_model( + self.pose_model, + rgb_array, + person_results, + bbox_thr=self.args.bbox_thr, + format="xyxy", + dataset=self.dataset, + dataset_info=self.dataset_info, + return_heatmap=self.return_heatmap, + outputs=None, + ) + tac = time.time() + prInfo("Poses in {:.4f} sec".format(tac - tic)) + + # get track id for each person instance + self.pose_results, self.next_id = get_track_id( + self.pose_results, + pose_results_last, + self.next_id, + use_oks=False, + tracking_thr=self.args.tracking_thr, + use_one_euro=self.args.euro, + fps=10, + ) + + # produce an output image + if not self.args.no_show: + self.vis_img = rgb_array.copy() + + if self.display_all_detection and not self.args.no_show: + self.plot_mmdet_bbox(mmdet_results, depth_array.shape) + + #### post processing #### + + self.tracks_in_current_image = {} + + for res in self.pose_results: + + # for each instance + + bbox = res["bbox"] + keypoints = res["keypoints"] + idx = res["track_id"] % 255 + + self.tracks_in_current_image[idx] = { + "right_wrist_depth": None, + "right_wrist_pose": None, + "left_wrist_depth": None, + "left_wrist_pose": None, + "depth_center": None, + "pose_center": None, + "pose_from": None, + } + + # Draw bounding bbox + bbox = bbox.astype(np.int32) + + if not self.args.no_show: + self.plot_mmdet_person_bbox(idx, bbox, depth_array.shape) + + # return the list of body center joints and also fill self.tracks_in_current_image[idx] + body_center_joints = self.process_keypoints(keypoints, depth_array, idx) + + # find the body center + if len(body_center_joints) == 4: + # if we managed to find the 4 points of the torso, search on the torso + body_center_joints = np.array( + body_center_joints + ) # lsho, rsho, lhip, rhip + lsho = body_center_joints[0, :] + rsho = body_center_joints[1, :] + lhip = body_center_joints[2, :] + rhip = body_center_joints[3, :] + + depths_torso, poses_torso = self.get_depth_and_poses_of_torso(depth_array, lsho, rsho, lhip, rhip, idx) + + if len(depths_torso) > 3: + # at least 4 points to average decently + depth_body = np.array(depths_torso).mean() + pose_body = np.array(poses_torso).mean(axis=0) + self.tracks_in_current_image[idx][ + "depth_center" + ] = depth_body # mm + self.tracks_in_current_image[idx][ + "pose_center" + ] = pose_body # m + self.tracks_in_current_image[idx]["pose_from"] = "torso" + + # just for drawing + body_center = np.mean(body_center_joints, axis=0) + # Draw center of body + body_center = (int(body_center[0]), int(body_center[1])) + + if not self.light_display and not self.args.no_show: + self.plot_body_pose_data(body_center, depth_body, pose_body, idx) + + else: + # if we did not managed to find the 4 points of the torso, search in the bbox + prWarning( + "Can't use body center from shoulders and hips for track {} : do nothing".format( + idx + ) + ) + + # draw skeleton + if 
not self.args.no_show: + self.plot_skeleton_2d(keypoints, idx) + + min_depth = 1e6 # mm + min_depth_idx = -1 + for idx, track_info in self.tracks_in_current_image.items(): + depth = track_info["depth_center"] + if depth is not None: + if depth < min_depth: + min_depth = depth + min_depth_idx = idx + + if min_depth_idx != -1: + pose_closest = self.tracks_in_current_image[min_depth_idx][ + "pose_center" + ] + prInfo( + "Using track {} as it is the closest".format(min_depth_idx) + ) + tf_msg = TransformStamped() + tf_msg.child_frame_id = args.namespace + "/human" + tf_msg.header.seq = current_frame_processing + tf_msg.header.stamp = current_timestamp + tf_msg.header.frame_id = current_frame_id + # adapt to robot camera frame convention on the robot + tf_msg.transform.translation.x = pose_closest[2] + tf_msg.transform.translation.y = -pose_closest[0] + tf_msg.transform.translation.z = -pose_closest[1] + + angle = np.arctan( + tf_msg.transform.translation.y + / tf_msg.transform.translation.x + ) + + # Rotate to have 'human' x axis looking towards the robot + rot = Rotation() + rot.DoRotZ(angle) + rot.DoRotY(np.pi) + qx, qy, qz, qw = rot.GetQuaternion() + + tf_msg.transform.rotation.x = qx + tf_msg.transform.rotation.y = qy + tf_msg.transform.rotation.z = qz + tf_msg.transform.rotation.w = qw + + dist = np.sqrt( + tf_msg.transform.translation.x**2 + tf_msg.transform.translation.y**2 + tf_msg.transform.translation.z**2 + ) + if dist < self.args.max_distance: # meters + self.goal_pub.publish(tf_msg) + prSuccess( + "Publishing coordinates {:.2f} {:.2f} {:.2f}".format( + pose_closest[0], pose_closest[1], pose_closest[2] + ) + ) + + self.tf_br.sendTransform(tf_msg) + + if not self.args.no_show: + self.plot_det_text_info(pose_closest) + + else: + + if not self.args.no_show: + self.plot_det_text_info(None) + + + self.last_inferred_seq = current_frame_processing + + if self.args.save and not self.args.no_show: + self.save_output_image(image_count, image_seq_unique, timestamp) + + else: + prWarning( + "No inference because the current RGB frame has already been processed last_inferred_seq {} vs rgb_current_seq {}".format( + self.last_inferred_seq, self.rgb_current_seq + ) + ) + + if not self.args.no_show: + depth_array_disp = depth_array.copy() + depth_array_disp[depth_array_disp > 3000] = 3000 + depth_array_norm = ((depth_array_disp - depth_array_disp.min())) / ( + depth_array_disp.max() - depth_array_disp.min() + ) + depth_array_norm = depth_array_norm * 255 + depth_array_norm = depth_array_norm.astype(np.uint8) + depth_array_norm_colored = ( + self.depth_cmap[depth_array_norm] * 255 + ).astype(np.uint8) + + if self.args.save or self.args.light_save: + self.save_pcl(image_count, image_seq_unique, timestamp) + + if self.vis_img is not None: + full_display_array = np.zeros( + (rgb_array.shape[0] * 2, rgb_array.shape[1], 3), dtype=np.uint8 + ) + full_display_array[: rgb_array.shape[0], :, :] = self.vis_img + full_display_array[rgb_array.shape[0] :, :, :] = ( + depth_array_norm_colored + ) + + if not self.args.no_show: + cv2.imshow("RGBD window", full_display_array) + cv2.waitKey(3) + + else: + print("Images are None !") + + self.loop_rate.sleep() + + +if __name__ == "__main__": + + ## Parser with params + parser = ArgumentParser() + parser.add_argument( + "--det_config", + type=str, + default="./configs/detection/yolov3_d53_320_273e_coco.py", + help="Config file for detection | default = %(default)s", + ) + parser.add_argument( + "--det_checkpoint", + type=str, + 
default="./models/yolov3_d53_320_273e_coco-421362b6.pth", + help="Checkpoint file for detection | default = %(default)s", + ) + parser.add_argument( + "--pose_config", + type=str, + default="./configs/pose/ViTPose_small_coco_256x192.py", + help="Config file for pose | default = %(default)s", + ) + parser.add_argument( + "--pose_checkpoint", + type=str, + default="./models/vitpose_small.pth", + help="Checkpoint file for pose | default = %(default)s", + ) + parser.add_argument( + "--device", + default="cuda:0", + help="Device used for inference | default = %(default)s", + ) + parser.add_argument( + "--det_cat_id", + type=int, + default=1, + help="Category id for bounding box detection model (person) | default = %(default)s", + ) + parser.add_argument( + "--bbox_thr", + type=float, + default=0.3, + help="Bounding box score threshold | default = %(default)s", + ) + parser.add_argument( + "--kpt_thr", + type=float, + default=0.3, + help="Keypoint score threshold | default = %(default)s", + ) + parser.add_argument( + "--tracking_thr", + type=float, + default=0.3, + help="Tracking threshold | default = %(default)s", + ) + parser.add_argument( + "--euro", action="store_true", help="Using One_Euro_Filter for smoothing" + ) + # parser.add_argument('--rgb_topic', default = "orbbec/rgb", type=str, help='ROS topic for RGB image') + # parser.add_argument('--depth_topic', default = "orbbec/depth", type=str, help='ROS topic for depth image') + # parser.add_argument('--pcl_topic', default = "orbbec/pcl", type=str, help='ROS topic for pcl') + parser.add_argument( + "--namespace", + default="orbbec_head", + type=str, + help="ROS topic namespace for rgb, depth, pcl | default = %(default)s", + ) + parser.add_argument( + "--no_show", + action="store_true", + default=False, + help="whether to show visualizations | default = %(default)s", + ) + parser.add_argument( + "--save", + action="store_true", + default=False, + help="whether to save images (rgb and d and predictions and pcl) | default = %(default)s", + ) + parser.add_argument( + "--flip", + action="store_true", + default=True, + help="whether to flip images | default = %(default)s", + ) + parser.add_argument( + "--light_save", + action="store_true", + default=False, + help="whether to save only rgb and pcl (not optimized use the light_save of visualizer for optimized saving) | default = %(default)s", + ) + parser.add_argument( + "--display_all_detection", + "-dad", + action="store_true", + default=False, + help="whether to display all detections or only human | default = %(default)s", + ) + parser.add_argument( + "--light_display", + "-ld", + action="store_true", + default=False, + help="whether to display only skeletons | default = %(default)s", + ) + parser.add_argument("--fps", type=int, default=10, help="Node and recording fps") + parser.add_argument( + "--depth_cmap", + default="jet", + type=str, + help="mpl colormap for depth image | default = %(default)s", + ) + parser.add_argument( + "--max_distance", + type=float, + default=2.5, + help="Maximum distance allowed for publishing human pose | default = %(default)s", + ) + + args = parser.parse_args() + + assert has_mmdet, "Please install mmdet to run the demo." 
+ assert args.det_config is not None + assert args.det_checkpoint is not None + + prInfo("Loaded with args : {}".format(args)) + + rospy.init_node("python_orbbec_inference", anonymous=True) + my_node = InferenceNodeRGBD(args) + my_node.start() + cv2.destroyAllWindows() diff --git a/rgbd_detect_3d_dir.py b/rgbd_detect_3d_dir.py new file mode 100644 index 0000000..bbbc26e --- /dev/null +++ b/rgbd_detect_3d_dir.py @@ -0,0 +1,2025 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# TODO : sort imports + +# mmdet and mmpose import +from mmpose.apis import ( + get_track_id, + inference_top_down_pose_model, + init_pose_model, + process_mmdet_results, +) +from mmpose.datasets import DatasetInfo + +try: + from mmdet.apis import inference_detector, init_detector + + has_mmdet = True +except (ImportError, ModuleNotFoundError): + has_mmdet = False + +# ros related import +import rospy +from sensor_msgs.msg import Image, PointCloud2 +from geometry_msgs.msg import TransformStamped +from cv_bridge import CvBridge +import tf2_ros + +# other import +import cv2 +import os +import matplotlib.pyplot as plt +from argparse import ArgumentParser +from datetime import datetime +import time +import json +import warnings +import numpy as np +from PyKDL import Rotation +import copy +# import imageio +from PIL import Image as PILImage + +from utils import * + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader + +try: + has_mb = True + # motion bert import + from lib.utils.tools import * + from lib.utils.learning import * + from lib.utils.utils_data import flip_data + from lib.data.dataset_wild import WildDetDataset + from lib.utils.vismo import render_and_save +except: + has_mb = False + prWarning("No MotionBERT import, fail") + +try: + # gafa import (optional) + has_gafa = True + from gafa_utils import body_transform, head_transform, head_transform_rest, normalize_img, body_transform_from_bb, normalize_img_torch, head_transform_face + from gafa.models.gazenet import GazeNet +except: + has_gafa = False + prWarning("No GAFA import, fail") + from gafa_utils import body_transform, head_transform, head_transform_rest, normalize_img, body_transform_from_bb, normalize_img_torch, head_transform_face + +# 6D Rep import +try: + has_sixdrep = True + from sixdrep.util import FaceDetector, compute_euler + from sixdrep.utils import sixdreptransform +except: + has_sixdrep = False + prWarning("No 6D Rep import, fail") + + +# gaze estimation simple models import +try: + has_gaze_est = True + from gaze_estimation.models import resnet18, mobilenet_v2, mobileone_s0 + from gaze_estimation.utils import pre_process + from gaze_estimation.models import SCRFD +except: + has_gaze_est = False + prWarning("No GazeEst import, fail") + + +# remove numpy scientific notation +np.set_printoptions(suppress=True) + + +class InferenceNodeRGBD(object): + def __init__(self, args): + + # init args + self.args = args + + # init detector and pose + self.det_model = init_detector( + args.det_config, args.det_checkpoint, device=args.device.lower() + ) + + self.pose_model = init_pose_model( + args.pose_config, args.pose_checkpoint, device=args.device.lower() + ) + + # if enabled, init MotionBERT + if self.args.use_mb: + # init 3d MotionBERT model + prInfo('Initialiazing 3D Pose Lifter {}'.format(args.mb_checkpoint)) + mb_3d_args = get_config(args.mb_3d_config) + self.motionbert_3d_model = load_backbone(mb_3d_args) + if torch.cuda.is_available(): + self.motionbert_3d_model = 
nn.DataParallel(self.motionbert_3d_model) + self.motionbert_3d_model = self.motionbert_3d_model.cuda() + else: + prError("Expect cuda to be available but is_available returned false") + exit(0) + + prInfo('Loading checkpoint {}'.format(args.mb_checkpoint)) + mb_checkpoint = torch.load(args.mb_checkpoint, map_location=lambda storage, loc: storage) + self.motionbert_3d_model.load_state_dict(mb_checkpoint['model_pos'], strict=True) + self.motionbert_3d_model.eval() + prInfo('Loaded motionbert_3d_model') + # no need for the whole WildDetDataset stuff, just manually make the input trajectories for the tracks + + # if enabled, init GAFA + if self.args.use_gafa: + self.gafa_model = GazeNet(n_frames=self.args.gafa_n_frames) + self.gafa_model.load_state_dict(torch.load( + self.args.gafa_checkpoint)) #, map_location=torch.device("cpu"))['state_dict']) + + self.gafa_model.cuda() + self.gafa_model.eval() + + prInfo( + "Loaded GAFA model from {}".format( + self.args.gafa_checkpoint)) + + # if enabled, init gaze resnet + if self.args.use_gaze_resnet: + self.face_detector = SCRFD(model_path="./gaze_estimation/weights/det_10g.onnx") + self.gaze_estimation_model = resnet18(pretrained = False, num_classes = 90) + state_dict = torch.load("./gaze_estimation/weights/resnet18.pt", map_location=args.device.lower()) + self.gaze_estimation_model.load_state_dict(state_dict) + self.gaze_estimation_model.to(args.device.lower()) + self.gaze_estimation_model.eval() + prInfo('Loaded ResNet18 for gaze estimation') + + # if enabled, init 6DRep + if self.args.use_six_d_rep: + + self.sixdrep_model = torch.load(f='./sixdrep/weights/best.pt', map_location='cuda') + self.sixdrep_model = self.sixdrep_model['model'].float().fuse() + self.sixdrep_detector = FaceDetector('./sixdrep/weights/detection.onnx') + + self.sixdrep_model.half() + self.sixdrep_model.eval() + + + # dataset for detection and pose + self.dataset = self.pose_model.cfg.data["test"]["type"] + self.dataset_info = self.pose_model.cfg.data["test"].get( + "self.dataset_info", None + ) + if self.dataset_info is None: + warnings.warn( + "Please set `self.dataset_info` in the config." 
+ "Check https://github.com/open-mmlab/mmpose/pull/663 for details.", + DeprecationWarning, + ) + else: + self.dataset_info = DatasetInfo(self.dataset_info) + + self.return_heatmap = False + + # variables to keep tracks along time or in the current frame + self.next_id = 0 + self.pose_results = [] + self.tracks_in_current_image = {} + self.tracks = {} # all the tracks along time, we need to keep and history with some data + + # shared variables for the received images and pcl + self.rgb = None # Image frame + self.depth = None # Image frame + + self.pcl_array_rgb = None + self.pcl_array_xyz = None + + # viewing options + self.depth_array_max_threshold = 20000 + self.depth_cmap = get_mpl_colormap(args.depth_cmap) + self.confidence_cmap = get_mpl_colormap("viridis") + self.vis_img = None # output image RGB + detections + self.view_all_classes_dets = True + self.display_all_detection = args.display_all_detection + self.light_display = args.light_display + + # counter for the incoming frames + self.pcl_current_seq = -1 + self.rgb_current_seq = -1 + self.last_inferred_seq = -1 + self.depth_current_seq = -1 + self.current_image_count = 0 + self.rgb_frame_id = None # received from ROS image + + # CV Bridge for receiving frames + self.br = CvBridge() + + # Set ROS node rate + prInfo("Setting node rate to {} fps".format(args.fps)) + self.loop_rate = rospy.Rate(args.fps) + + # create the output path + now = datetime.now() + timestamp = now.strftime("%Y_%m_%d_%H_%M_%S") + self.save_dir = os.path.join("output", "record_{:s}".format(timestamp)) + self.metadata = os.path.join(self.save_dir, "metadata.json") + self.save_dir_rgb = os.path.join(self.save_dir, "rgb") + self.save_dir_depth = os.path.join(self.save_dir, "depth") + self.save_dir_result = os.path.join(self.save_dir, "output") + self.save_dir_pcl_bin = os.path.join(self.save_dir, "pcl") + + if args.save or args.light_save: + prInfo( + "Saving to {}/[rgb][depth][depth_color][output][pcl]".format( + self.save_dir + ) + ) + if not os.path.exists(self.save_dir): + prInfo( + "Creating directories to {}/[rgb][depth][depth_color][output][pcl]".format( + self.save_dir + ) + ) + os.makedirs(self.save_dir) + os.makedirs(self.save_dir_rgb) + os.makedirs(self.save_dir_pcl_bin) + + if args.save: + os.makedirs(self.save_dir_depth) + os.makedirs(self.save_dir_result) + + args_dic = vars(args) + with open(self.metadata, "w") as fp: + json.dump(args_dic, fp) + + prSuccess( + "Created directories to {}/[rgb][depth][depth_color][output][pcl]".format( + self.save_dir + ) + ) + time.sleep(1) + + # ROS publishers + self.goal_pub = rospy.Publisher( + args.namespace + "/human", TransformStamped, queue_size=1 + ) + + self.tf_br = tf2_ros.TransformBroadcaster() + + # ROS subscribers + rgb_topic = args.namespace + "/rgb" + depth_topic = args.namespace + "/depth" + pcl_topic = args.namespace + "/pcl" + prInfo("Subscribing to {} for RGB".format(rgb_topic)) + rospy.Subscriber(rgb_topic, Image, self.callback_rgb) + prInfo("Subscribing to {} for depth".format(depth_topic)) + rospy.Subscriber(depth_topic, Image, self.callback_depth) + prInfo("Subscribing to {} for PCL".format(pcl_topic)) + rospy.Subscriber(pcl_topic, PointCloud2, self.callback_pcl) + + + def callback_pcl(self, msg): + if self.args.flip: + pcl_array = np.frombuffer(msg.data, dtype=np.float32).reshape( + (msg.height, msg.width, -1) + )[::-1, ::-1, :] + else: + pcl_array = np.frombuffer(msg.data, dtype=np.float32).reshape( + (msg.height, msg.width, -1) + ) + + # pcl_array = pcl_array[::-1, :, :] + 
self.pcl_array_xyz = pcl_array[:, :, :3] + # self.pcl_array_rgb = pcl_array[:,:,3:] + self.pcl_current_seq = msg.header.seq + # rospy.loginfo('pcl received ({})...'.format(msg.header.seq)) + + def callback_rgb(self, msg): + if self.rgb_frame_id != msg.header.frame_id: + self.rgb_frame_id = msg.header.frame_id + if self.args.flip: + self.rgb = cv2.flip(self.br.imgmsg_to_cv2(msg, "bgr8"), -1) + else: + self.rgb = self.br.imgmsg_to_cv2(msg, "bgr8") + + # self.rgb = cv2.rotate(self.rgb, cv2.ROTATE_180) + self.rgb_current_seq = msg.header.seq + # rospy.loginfo('RGB received ({})...'.format(msg.header.seq)) + self.rgb_timestamp = msg.header.stamp + + def callback_depth(self, msg): + if self.args.flip: + self.depth = cv2.flip(self.br.imgmsg_to_cv2(msg, "mono16"), -1) + else: + self.depth = self.br.imgmsg_to_cv2(msg, "mono16") + + self.depth_current_seq = msg.header.seq + # rospy.loginfo('Depth received ({})...'.format(msg.header.seq)) + + def is_ready(self): + ready = ( + (self.rgb is not None) + and (self.depth is not None) + and (self.pcl_array_xyz is not None) + ) + return ready + + @timeit + def save_rgb(self, image_count, image_seq_unique, timestamp): + prWarning("Saving images here may suffer synchronization issues, use visualizer.py for lighter save") + rgb_path = os.path.join( + self.save_dir_rgb, + "{:08d}_seq_{:010d}_ts_{}.png".format( + image_count, image_seq_unique, timestamp + ), + ) + cv2.imwrite(rgb_path, self.rgb) + prSuccess("Saved RGB to {}".format(rgb_path)) + + @timeit + def save_depth(self, image_count, image_seq_unique, timestamp): + prWarning("Saving images here may suffer synchronization issues, use visualizer.py for lighter save") + depth_path = os.path.join( + self.save_dir_depth, + "{:08d}_seq_{:010d}_ts_{}.png".format( + image_count, image_seq_unique, timestamp + ), + ) + cv2.imwrite(depth_path, self.depth) + prSuccess("Saved depth to {}".format(depth_path)) + + @timeit + def save_output_image(self, image_count, image_seq_unique, timestamp): + prWarning("Saving images here may suffer synchronization issues, use visualizer.py for lighter save") + results_path = os.path.join( + self.save_dir_result, + "{:08d}_seq_{:010d}_ts_{}.png".format( + image_count, image_seq_unique, timestamp + ), + ) + cv2.imwrite(results_path, self.vis_img) + prSuccess("Saved result to {}".format(results_path)) + + @timeit + def save_pcl(self, image_count, image_seq_unique, timestamp): + prWarning("Saving images here may suffer synchronization issues, use visualizer.py for lighter save") + pcl_path = os.path.join( + self.save_dir_pcl_bin, + "{:08d}_seq_{:010d}_ts_{}.bin".format( + image_count, image_seq_unique, timestamp + ), + ) + self.pcl_array_xyz.tofile(pcl_path) + prSuccess("Saved pcl to {}".format(pcl_path)) + + @timeit + def plot_mmdet_bbox(self, mmdet_results, array_shape): + for c in range(len(mmdet_results)): + if len(mmdet_results[c]) > 0: + for bi in range(mmdet_results[c].shape[0]): + if mmdet_results[c][bi, 4] > self.args.bbox_thr: + bbox = ( + mmdet_results[c][bi, :4] + .copy() + .astype(np.int32) + ) + bbox_ints = [ + int(bbox[0]), + int(bbox[1]), + int(bbox[2]), + int(bbox[3]), + ] + pt1 = ( + min( + max(0, bbox_ints[0]), + array_shape[1], + ), + min( + max(0, bbox_ints[1]), + array_shape[0], + ), + ) + pt2 = ( + min( + max(0, bbox_ints[2]), + array_shape[1], + ), + min( + max(0, bbox_ints[3]), + array_shape[0], + ), + ) + cv2.rectangle( + self.vis_img, pt1, pt2, (255, 255, 255), 1 + ) + cv2.putText( + self.vis_img, + "{:s} ({:.0f}%)".format( + YOLO_COCO_80_CLASSES[c], + 
mmdet_results[c][bi, 4] * 100, + ), + pt1, + cv2.FONT_HERSHEY_SIMPLEX, + 0.5 * TEXT_SCALE, + (255, 255, 255), + 1, + ) + + @timeit + def plot_xyxy_person_bbox(self, idx, bbox, array_shape, track, poses_torso = None): + bbox_ints = [ + int(bbox[0]), + int(bbox[1]), + int(bbox[2]), + int(bbox[3]), + ] + pt1 = ( + min(max(0, bbox_ints[0]), array_shape[1]), + min(max(0, bbox_ints[1]), array_shape[0]), + ) + pt2 = ( + min(max(0, bbox_ints[2]), array_shape[1]), + min(max(0, bbox_ints[3]), array_shape[0]), + ) + color = RANDOM_COLORS[idx] + # color_tuple = (int(color[0]), int(color[1]), int(color[2])) + color_tuple = (255,255,255) + + # yolo score + score = bbox[4] + + # current gaze + if len(track["gaze_yaw_rad"]) > 0: + yaw_g = int(np.rad2deg(track["gaze_yaw_rad"][-1])) + pitch_g = int(np.rad2deg(track["gaze_pitch_rad"][-1])) + if yaw_g == 180 or pitch_g == 180: + yaw_g = "Unk" + pitch_g = "Unk" + else: + yaw_g = "Unk" + pitch_g = "Unk" + + # curent depth + if len(track["depth_face"]) > 0: + depth_f = track["depth_face"][-1] + else: + depth_f = "Unk" + + # position + if poses_torso is not None: + pose_body = np.array(poses_torso).mean(axis=0) + pose_body_n = pose_body.copy() + if type(pose_body) == np.ndarray: + pose_body_n[0] = pose_body[2] + pose_body_n[1] = -pose_body[0] + pose_body_n[2] = -pose_body[1] + else: + pose_body_n = ["Unk", "Unk", "Unk"] + else: + pose_body_n = ["Unk", "Unk", "Unk"] + + # attention + heat_count = 0 + history = 20 + length = min(len(track["depth_face"]), history) + for i in range(length): + index = len(track["depth_face"]) - i - 1 + yaw = np.rad2deg(track["gaze_yaw_rad"][index]) + pitch = np.rad2deg(track["gaze_pitch_rad"][index]) + depth = track["depth_face"][index] + + thresh = (int(depth / 1000) + 1) * 3 # 3 deg per meter + if np.abs(yaw) < thresh and pitch < 0: + heat_count += 1 + # suppose we are looking down + + attention_score = int( min((heat_count * 2 / history), 1) * 100) + + draw_bbox_with_corners(self.vis_img, bbox_ints, color = color_tuple, thickness = 5, proportion = 0.2) + + text = "Person : {}% | Attention : {}%".format(score, attention_score) + if poses_torso is not None and type(pose_body) == np.ndarray: + text2 = "Yaw = {} | Pitch = {} | pos = ({:.2f}, {:.2f}, {:.2f})".format(yaw_g, pitch_g, pose_body_n[0], pose_body_n[1], pose_body_n[2]) + + cv2.putText( + self.vis_img, + text, + (bbox_ints[0], bbox_ints[1] - 30), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5 * TEXT_SCALE, + color_tuple, + 1, + ) + + if poses_torso is not None and type(pose_body) == np.ndarray: + cv2.putText( + self.vis_img, + text2, + (bbox_ints[0], bbox_ints[1] - 15), + cv2.FONT_HERSHEY_SIMPLEX, + 0.35 * TEXT_SCALE, + color_tuple, + 1, + ) + + # cv2.rectangle(self.vis_img, pt1, pt2, color_tuple, 2) + + @timeit + def process_keypoints(self, keypoints, depth_array, idx): + body_center_joints = ( + [] + ) # to store center of lsho, rsho, lhip, rhip in pixels + color = RANDOM_COLORS[idx] + # color_tuple = (int(color[0]), int(color[1]), int(color[2])) + color_tuple = (255,255,255) + + for j in range(keypoints.shape[0]): + + kp = keypoints[j, :] + confidence = int(kp[2] * 255) + confidence_color = ( + self.confidence_cmap[min(255, confidence)] * 255 + ).astype(np.uint8) + + if ( + kp[2] > self.args.kpt_thr + and kp[0] > 0 + and kp[1] > 0 + and kp[0] < depth_array.shape[1] + and kp[1] < depth_array.shape[0] + ): + + if (j == 5) or (j == 6) or (j == 11) or (j == 12): + # one keypoint of the torso + body_center_joints.append(kp) + + if not self.args.no_show and not self.args.light_display: + # 
kp_color_tuple = (int(confidence_color[0]), int(confidence_color[1]), int(confidence_color[2])) + cv2.circle( + self.vis_img, + (int(kp[0]), int(kp[1])), + 2, + color_tuple, + thickness=3, + ) + + # if wrists, find depth and pose + + if j == 10: + # right wrist + depth_wrist = depth_array[int(kp[1]), int(kp[0])] + pose_wrist = self.pcl_array_xyz[ + int(kp[1]), int(kp[0]), : + ] + self.tracks_in_current_image[idx][ + "right_wrist_depth" + ] = depth_wrist + self.tracks_in_current_image[idx][ + "right_wrist_pose" + ] = pose_wrist + if not self.light_display and not self.args.no_show: + cv2.drawMarker( + self.vis_img, + (int(kp[0]), int(kp[1])), + color=color_tuple, + thickness=3, + markerType=cv2.MARKER_CROSS, + line_type=cv2.LINE_AA, + markerSize=8, + ) + # cv2.putText( + # self.vis_img, + # "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format( + # depth_wrist / 10, + # pose_wrist[0], + # pose_wrist[1], + # pose_wrist[2], + # ), + # (int(kp[0]), int(kp[1])), + # cv2.FONT_HERSHEY_SIMPLEX, + # 0.5 * TEXT_SCALE, + # (255,255,255), + # 2, + # ) + # cv2.putText( + # self.vis_img, + # "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format( + # depth_wrist / 10, + # pose_wrist[0], + # pose_wrist[1], + # pose_wrist[2], + # ), + # (int(kp[0]), int(kp[1])), + # cv2.FONT_HERSHEY_SIMPLEX, + # 0.5 * TEXT_SCALE, + # color_tuple, + # 1, + # ) + + elif j == 9: + # left wrist + depth_wrist = depth_array[int(kp[1]), int(kp[0])] + pose_wrist = self.pcl_array_xyz[ + int(kp[1]), int(kp[0]), : + ] + self.tracks_in_current_image[idx][ + "left_wrist_depth" + ] = depth_wrist + self.tracks_in_current_image[idx][ + "left_wrist_pose" + ] = pose_wrist + if not self.light_display and not self.args.no_show: + cv2.drawMarker( + self.vis_img, + (int(kp[0]), int(kp[1])), + color=color_tuple, + thickness=3, + markerType=cv2.MARKER_CROSS, + line_type=cv2.LINE_AA, + markerSize=8, + ) + # cv2.putText( + # self.vis_img, + # "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format( + # depth_wrist / 10, + # pose_wrist[0], + # pose_wrist[1], + # pose_wrist[2], + # ), + # (int(kp[0]), int(kp[1])), + # cv2.FONT_HERSHEY_SIMPLEX, + # 0.5 * TEXT_SCALE, + # (255,255,255), + # 2, + # ) + # cv2.putText( + # self.vis_img, + # "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format( + # depth_wrist / 10, + # pose_wrist[0], + # pose_wrist[1], + # pose_wrist[2], + # ), + # (int(kp[0]), int(kp[1])), + # cv2.FONT_HERSHEY_SIMPLEX, + # 0.5 * TEXT_SCALE, + # color_tuple, + # 1, + # ) + + return body_center_joints + + @timeit + def get_depth_and_poses_of_torso(self, depth_array, lsho, rsho, lhip, rhip, idx): + + color = RANDOM_COLORS[idx] + # color_tuple = (int(color[0]), int(color[1]), int(color[2])) + color_tuple = (255,255,255) + + # find 4 points between lsho and rhip and 4 points between rsho and lhip to find something more precise + seg_steps = [0.0, 0.25, 0.50, 0.75, 1.0] + depths_torso = [] + poses_torso = [] + for step in seg_steps: + + p1 = step * lsho + (1 - step) * rhip + if ( + p1[0] < depth_array.shape[1] + and p1[1] < depth_array.shape[0] + ): + depth_p1 = depth_array[int(p1[1]), int(p1[0])] + pose_p1 = self.pcl_array_xyz[ + int(p1[1]), int(p1[0]), : + ] + if depth_p1 > 0: + depths_torso.append(depth_p1) + poses_torso.append(pose_p1) + + p2 = step * rsho + (1 - step) * lhip + if ( + p2[0] < depth_array.shape[1] + and p2[1] < depth_array.shape[0] + ): + depth_p2 = depth_array[int(p2[1]), int(p2[0])] + pose_p2 = self.pcl_array_xyz[ + int(p2[1]), int(p2[0]), : + ] + if depth_p2 > 0: + depths_torso.append(depth_p2) + poses_torso.append(pose_p2) + + if not self.args.no_show: + # draw to check + 
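# diamond markers show the sampled torso points used below for the averaged depth/pose +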
cv2.drawMarker( + self.vis_img, + (int(p1[0]), int(p1[1])), + color=color_tuple, + thickness=1, + markerType=cv2.MARKER_DIAMOND, + line_type=cv2.LINE_AA, + markerSize=8, + ) + cv2.drawMarker( + self.vis_img, + (int(p2[0]), int(p2[1])), + color=color_tuple, + thickness=1, + markerType=cv2.MARKER_DIAMOND, + line_type=cv2.LINE_AA, + markerSize=8, + ) + + return depths_torso, poses_torso + + @timeit + def plot_body_pose_data(self, body_center, depth_body, pose_body, idx): + + color = RANDOM_COLORS[idx] + # color_tuple = (int(color[0]), int(color[1]), int(color[2])) + color_tuple = (255,255,255) + + cv2.drawMarker( + self.vis_img, + body_center, + color = color_tuple, + thickness=1, + markerType=cv2.MARKER_TILTED_CROSS, + line_type=cv2.LINE_AA, + markerSize=16, + ) + # cv2.putText( + # self.vis_img, + # "{:.0f}cm | {:.2f} {:.2f} {:.2f}".format( + # depth_body / 10, + # pose_body[0], + # pose_body[1], + # pose_body[2], + # ), + # (int(body_center[0]), int(body_center[1])), + # cv2.FONT_HERSHEY_SIMPLEX, + # 0.8 * TEXT_SCALE, + # (0, 255, 0), + # 3, + # ) + cv2.putText( + self.vis_img, + "{:.0f}cm".format( + depth_body / 10 + ), + (int(body_center[0]), int(body_center[1])), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5 * TEXT_SCALE, + (255, 255, 255), + 1, + ) + + @timeit + def plot_skeleton_2d(self, keypoints, idx): + + color = RANDOM_COLORS[idx] + # color_tuple = (int(color[0]), int(color[1]), int(color[2])) + color_tuple = (255,255,255) + + for limb in COCO17_JOINTS_LIMBS: + start = keypoints[limb[0], :] + end = keypoints[limb[1], :] + start_point = (int(start[0]), int(start[1])) + end_point = (int(end[0]), int(end[1])) + if (start[2] > self.args.kpt_thr) and ( + end[2] > self.args.kpt_thr + ): + cv2.line( + self.vis_img, + start_point, + end_point, + color = color_tuple, + thickness=1, + ) + @timeit + def plot_det_text_info(self, pose_closest): + if pose_closest is not None: + cv2.putText( + self.vis_img, + "{:.2f} {:.2f} {:.2f}".format( + pose_closest[0], pose_closest[1], pose_closest[2] + ), + (30, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1.2 * TEXT_SCALE, + (255, 255, 255), + 5, + ) + cv2.putText( + self.vis_img, + "{:.2f} {:.2f} {:.2f}".format( + pose_closest[0], pose_closest[1], pose_closest[2] + ), + (30, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1.2 * TEXT_SCALE, + (0, 0, 0), + 3, + ) + else: + cv2.putText( + self.vis_img, + "No tracks with pose found", + (30, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1.2 * TEXT_SCALE, + (255, 255, 255), + 5, + ) + cv2.putText( + self.vis_img, + "No tracks with pose found", + (30, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 1.2 * TEXT_SCALE, + (0, 0, 0), + 3, + ) + + @timeit + def plot_gaze_text_info(self, gaze_res, head_outputs, body_outputs, head_bb_abs, idx): + prediction = gaze_res['direction'] + kappa = gaze_res['kappa'][0, -1].item() + prediction_body = body_outputs['direction'] + prediction_head = head_outputs['direction'] + + prediction_show = prediction.clone().cpu().detach().numpy()[0, -1, :] + prediction_show_body = prediction_body.clone().cpu().detach().numpy()[0, -1, :] + prediction_show_head = prediction_head.clone().cpu().detach().numpy()[0, -1, :] + + prediction_show_norm = prediction_show / np.linalg.norm(prediction_show) + prediction_show_norm_body = prediction_show_body / np.linalg.norm(prediction_show_body) + prediction_show_norm_head = prediction_show_head / np.linalg.norm(prediction_show_head) + + cv2.putText( + self.vis_img, + "Gaze {:.2f} {:.2f} {:.2f} ({:.2f})".format( + prediction_show_norm[0], prediction_show_norm[1], prediction_show_norm[2], kappa + ), + (30, 70), + 
cv2.FONT_HERSHEY_SIMPLEX, + 1 * TEXT_SCALE, + (255, 255, 255), + 5, + ) + + cv2.putText( + self.vis_img, + "Gaze {:.2f} {:.2f} {:.2f} ({:.2f})".format( + prediction_show_norm[0], prediction_show_norm[1], prediction_show_norm[2], kappa + ), + (30, 70), + cv2.FONT_HERSHEY_SIMPLEX, + 1 * TEXT_SCALE, + (0, 255, 0), + 3, + ) + + + @timeit + def plot_gaze_and_body_dir(self, gaze_res, head_outputs, body_outputs, head_bb_abs, body_bbox): + head_bb_abs[2] += head_bb_abs[0] + head_bb_abs[3] += head_bb_abs[1] + + prediction = gaze_res['direction'] + prediction_body = body_outputs['direction'] + prediction_head = head_outputs['direction'] + + prediction_show = prediction.clone().cpu().detach().numpy()[0, -1, :] + prediction_show_body = prediction_body.clone().cpu().detach().numpy()[0, -1, :] + prediction_show_head = prediction_head.clone().cpu().detach().numpy()[0, -1, :] + + prediction_show_norm = prediction_show / np.linalg.norm(prediction_show) + prediction_show_norm_body = prediction_show_body / np.linalg.norm(prediction_show_body) + prediction_show_norm_head = prediction_show_head / np.linalg.norm(prediction_show_head) + + gaze_dir_2d = prediction_show_norm[0:2] + body_dir_2d = prediction_show_norm_body[0:2] + head_dir_2d = prediction_show_norm_head[0:2] + + body_center = (int((body_bbox[0] + body_bbox[2]) / 2), int((body_bbox[1] + body_bbox[3]) / 2)) + head_center = (int(head_bb_abs[0] / 2 + head_bb_abs[2] / 2), int(head_bb_abs[1] / 2 + head_bb_abs[3] / 2)) + + des = (head_center[0] + int(gaze_dir_2d[0]*150), int(head_center[1] + gaze_dir_2d[1]*150)) + des_body = (body_center[0] + int(body_dir_2d[0]*150), int(body_center[1] + body_dir_2d[1]*150)) + des_head = (head_center[0] + int(head_dir_2d[0]*150), int(head_center[1] + head_dir_2d[1]*150)) + + cv2.arrowedLine(self.vis_img, head_center, des, (0, 255, 0), 3, tipLength=0.3) + cv2.arrowedLine(self.vis_img, body_center, des_body, (0, 255, 255), 3, tipLength=0.3) + cv2.arrowedLine(self.vis_img, head_center, des_head, (255, 255, 255), 3, tipLength=0.3) + + + @timeit + def plot_gaze_from_pitch_yaw(self, pitch, yaw, head_bb_abs, idx, keypoints): + + # color = RANDOM_COLORS[idx] + # color_tuple = (int(color[0]), int(color[1]), int(color[2])) + color_tuple = (0,0,255) + + head_bb_abs[2] += head_bb_abs[0] + head_bb_abs[3] += head_bb_abs[1] + + prediction_show = np.zeros(3) + prediction_show[0] = -np.sin(pitch) * np.cos(yaw) + prediction_show[1] = -np.sin(yaw) + prediction_show[2] = 999 + + # prediction_show_norm = prediction_show / np.linalg.norm(prediction_show) + + gaze_dir_2d = prediction_show[0:2] + + # head_center = (int(head_bb_abs[0] / 2 + head_bb_abs[2] / 2), int(head_bb_abs[1] / 2 + head_bb_abs[3] / 2)) + head_center = (int(keypoints[1,0] / 2 + keypoints[2,0] / 2), int(keypoints[1,1] / 2 + keypoints[2,1] / 2)) + + des = (head_center[0] + int(gaze_dir_2d[0]*150), int(head_center[1] + gaze_dir_2d[1]*150)) + + # cv2.arrowedLine(self.vis_img, head_center, des, (255,255,255), 3, tipLength=0.3) + cv2.arrowedLine(self.vis_img, head_center, des, color_tuple, 2, tipLength=0.1) + cv2.circle(self.vis_img, head_center, 5, color = color_tuple, thickness=-1) + + @timeit + def plot_gaze_angle_info(self, pitch, yaw, head_bb, idx): + color = RANDOM_COLORS[idx] + color_tuple = (int(color[0]), int(color[1]), int(color[2])) + + cv2.putText( + self.vis_img, + "{:.2f} {:.2f} deg".format( + pitch, yaw + ), + (head_bb[0] + 30, head_bb[1] + 30), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5 * TEXT_SCALE, + (255,255,255), + 2, + ) + cv2.putText( + self.vis_img, + "{:.2f} {:.2f} 
deg".format( + pitch, yaw + ), + (head_bb[0] + 30, head_bb[1] + 30), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5 * TEXT_SCALE, + color_tuple, + 1, + ) + + @timeit + def get_gafa_input_from_current_image(self, image, keypoints, body_yolo_bbox): + + body_yolo_bbox_int = {} + body_yolo_bbox_int["u"] = int(body_yolo_bbox[0]) + body_yolo_bbox_int["v"] = int(body_yolo_bbox[1]) + body_yolo_bbox_int["w"] = int(body_yolo_bbox[2] - body_yolo_bbox[0]) + body_yolo_bbox_int["h"] = int(body_yolo_bbox[3] - body_yolo_bbox[1]) + + # use torch instead of PIL because faster conversion + # image_pil = PILImage.fromarray(image) + image_torch = torch.from_numpy(image.copy()).moveaxis(2, 0) + + item = { + "image": image_torch, + "keypoints": keypoints[:, :2], + } + + # get head bb in pixels + head_trans = head_transform(item) + head_bb = head_trans['bb'] + head_bb = np.array([head_bb['u'], head_bb['v'], head_bb['w'], head_bb['h']]).astype(np.float32) + + # get body bb in pixels + # body_trans = body_transform(item) + body_trans = body_transform(item) + body_bb = body_trans['bb'] + body_bb = np.array([body_bb['u'], body_bb['v'], body_bb['w'], body_bb['h']]) + body_image = body_trans['image'] # keep as tensor + + # change head bb to relative to body bb + head_bb_abs = head_bb.copy() + + head_bb[0] -= body_bb[0] + head_bb[1] -= body_bb[1] + + head_bb[0] = head_bb[0] / body_bb[2] + head_bb[1] = head_bb[1] / body_bb[3] + head_bb[2] = head_bb[2] / body_bb[2] + head_bb[3] = head_bb[3] / body_bb[3] + + # store body center + norm_body_center = (body_bb[[0, 1]] + body_bb[[2, 3]] / 2) / body_bb[[2,3]] + + # normalize image + # img = normalize_img(image = body_image)['image'] # with albumnentations normalization + # img = img.transpose(2, 0, 1) # with albumnentations normalization + img = normalize_img_torch((body_image.float())/255) # ith torchvision normalization, to float and in range [0-1] before normalization + + assert(img.shape[0] == 3) + assert(img.shape[1] == 256) + assert(img.shape[2] == 192) + + # create mask of head bounding box + head_mask = np.zeros((1, img.shape[1], img.shape[2])) + head_bb_int = head_bb.copy() + head_bb_int[[0, 2]] *= img.shape[2] + head_bb_int[[1, 3]] *= img.shape[1] + head_bb_int[2] += head_bb_int[0] + head_bb_int[3] += head_bb_int[1] + head_bb_int = head_bb_int.astype(np.int64) + head_bb_int[head_bb_int < 0] = 0 + + head_mask[:, head_bb_int[1]:head_bb_int[3], head_bb_int[0]:head_bb_int[2]] = 1 + + return img, head_mask, norm_body_center, head_bb_abs + + @timeit + def plot_overlay_face_attention(self, track, head_bbox): + x_min, y_min, x_max, y_max = map(int, head_bbox[:4]) + + valid_depths = [] + valid_yaws = [] + valid_pitchs = [] + + heat_count = 0 + history = 1 + length = min(len(track["depth_face"]), history) + for i in range(length): + index = len(track["depth_face"]) - i - 1 + yaw = np.rad2deg(track["gaze_yaw_rad"][index]) + pitch = np.rad2deg(track["gaze_pitch_rad"][index]) + depth = track["depth_face"][index] + + thresh = (int(depth / 1000) + 1) * 5 # 5 deg per meter + if np.abs(yaw) < thresh and np.abs(pitch) < thresh: + heat_count += 1 + + cv2.putText( + self.vis_img, + "{:d}".format( + heat_count + ), + (x_min - 30, y_min + 30), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5 * TEXT_SCALE, + (255,0,255), + 2, + ) + + overlay_img = self.vis_img.copy() + cv2.rectangle(overlay_img, (x_min,y_min), (x_max,y_max), color = (0,255,0), thickness = -1) + strength = (heat_count / history) * 0.75 + self.vis_img = cv2.addWeighted(self.vis_img,(1-strength),overlay_img,strength,0) + + @timeit + def 
plot_overlay_face_attention_6d(self, track, head_bbox, keypoints): + x_min, y_min, x_max, y_max = map(int, head_bbox[:4]) + + valid_depths = [] + valid_yaws = [] + valid_pitchs = [] + + heat_count = 0 + history = 20 + length = min(len(track["depth_face"]), history) + for i in range(length): + index = len(track["depth_face"]) - i - 1 + yaw = np.rad2deg(track["gaze_yaw_rad"][index]) + pitch = np.rad2deg(track["gaze_pitch_rad"][index]) + depth = track["depth_face"][index] + + thresh = (int(depth / 1000) + 1) * 5 # 5 deg per meter + if np.abs(yaw) < thresh and pitch < 0: + heat_count += 1 + # suppose we are looking down + + + # cv2.putText( + # self.vis_img, + # "{:d}".format( + # heat_count + # ), + # (x_min - 30, y_min + 30), + # cv2.FONT_HERSHEY_SIMPLEX, + # 0.5 * TEXT_SCALE, + # (255,0,255), + # 2, + # ) + + overlay_img = self.vis_img.copy() + + nose = keypoints[0,:2] + leye = keypoints[1,:2] + reye = keypoints[2,:2] + + colorval = min(((heat_count * 2) / history), 1.0) + strength = 0.5 + (heat_count / history) * 0.5 #(heat_count / history) + cmap = get_mpl_colormap("Reds") + color = (cmap[int(colorval * 255)] * 255) + color_tuple = (int(color[0]), int(color[1]), int(color[2])) + + # radius = np.linalg.norm(reye - leye) / 4 + # cv2.circle(overlay_img, (int(leye[0]), int(leye[1])), int(radius), color = color_tuple, thickness = -1) + # cv2.circle(self.vis_img, (int(leye[0]), int(leye[1])), int(radius), color = (0, 0, 0), thickness = 1) + + # cv2.circle(overlay_img, (int(reye[0]), int(reye[1])), int(radius), color = color_tuple, thickness = -1) + # cv2.circle(self.vis_img, (int(reye[0]), int(reye[1])), int(radius), color = (0, 0, 0), thickness = 1) + + ellipse_center = (int(leye[0] / 2 + reye[0] / 2), int(leye[1] / 2 + reye[1] / 2)) + ellipse_height = int(nose[1] - (leye[1] / 2 + reye[1] / 2)) + ellipse_width = int((leye[0] - reye[0]) * 1.1) + if ellipse_width > 0 and ellipse_height > 0: + cv2.ellipse(overlay_img, ellipse_center, (ellipse_width, ellipse_height), 0, 0, 360, color_tuple, 3) + + # cv2.rectangle(overlay_img, (x_min,y_min), (x_max,y_max), color = (0,255,0), thickness = -1) + # cv2.rectangle(self.vis_img, (x_min,y_min), (x_max,y_max), color = (255,255,255), thickness = 1) + + self.vis_img = cv2.addWeighted(self.vis_img,(1-strength),overlay_img,strength,0) + + + def start(self): + + while not rospy.is_shutdown(): + + if self.is_ready(): + + image_count = self.current_image_count + image_seq_unique = self.rgb_current_seq + now = datetime.now() + timestamp = now.strftime("%Y_%m_%d_%H_%M_%S_%f") + + if self.args.save or self.args.light_save: + self.save_rgb(image_count, image_seq_unique, timestamp) + + rgb_array = self.rgb.copy() + + if self.args.save: + self.save_depth(image_count, image_seq_unique, timestamp) + + depth_array = np.array(self.depth) + depth_array[depth_array > self.depth_array_max_threshold] = ( + self.depth_array_max_threshold + ) + + assert depth_array.shape[0] == rgb_array.shape[0] + assert depth_array.shape[1] == rgb_array.shape[1] + + # Process RGB array + if self.last_inferred_seq < self.rgb_current_seq: + + current_frame_processing = self.rgb_current_seq + current_timestamp = self.rgb_timestamp + current_frame_id = self.rgb_frame_id + prInfo("Do inference on frame {}".format(current_frame_processing)) + + # keep old poses for tracking + pose_results_last = self.pose_results + + tic = time.time() + mmdet_results = inference_detector( + self.det_model, rgb_array + ) # list of detection rectangle i.e [(x1,y1,x2,y2), ...] 
+ tac = time.time() + prTimer("YOLO detection", tic, tac) + + # keep the person class bounding boxes. + person_results = process_mmdet_results( + mmdet_results, self.args.det_cat_id + ) + + new_persons = [] + for person in person_results: + bbox = person["bbox"] + pt1 = (max(0, min(bbox[0], depth_array.shape[1]-1)), max(0,min(bbox[1], depth_array.shape[0]-1)) ) + pt2 = (max(0, min(bbox[2], depth_array.shape[1]-1)), max(0,min(bbox[3], depth_array.shape[0]-1)) ) + + # depth1 = depth_array[int(pt1[1]), int(pt1[0])] + # depth2 = depth_array[int(pt2[1]), int(pt2[0])] + # if depth1 > self.args.depth_limit_threshold or depth1 == 0 or depth2 > self.args.depth_limit_threshold or depth2 == 0: + # pass + # else: + if abs(pt1[0] - pt2[0]) > self.args.bb_min_threshold/2 or abs(pt1[1]-pt2[1]) > self.args.bb_min_threshold: + new_persons.append(person) + + person_results = new_persons + + tic = time.time() + # test a single image, with a list of bboxes. + self.pose_results, returned_outputs = inference_top_down_pose_model( + self.pose_model, + rgb_array, + person_results, + bbox_thr=self.args.bbox_thr, + format="xyxy", + dataset=self.dataset, + dataset_info=self.dataset_info, + return_heatmap=self.return_heatmap, + outputs=None, + ) + tac = time.time() + prTimer("ViTPose", tic, tac) + # get track id for each person instance + self.pose_results, self.next_id = get_track_id( + self.pose_results, + pose_results_last, + self.next_id, + use_oks=False, + tracking_thr=self.args.tracking_thr, + use_one_euro=self.args.euro, + fps=10, + ) + + # produce an output image + if not self.args.no_show: + self.vis_img = rgb_array.copy() + + if self.display_all_detection and not self.args.no_show: + self.plot_mmdet_bbox(mmdet_results, depth_array.shape) + + #### post processing, 3D lifting (if enabled) and gaze estimation (if enabled) #### + + # remove too old tracks + for idx, track in list(self.tracks.items()): + if abs(image_count - track["last_seen"]) > self.args.max_frames_remove_tracks: + prInfo("Removing track {}, not seen since frame {}, current is {}".format(idx, track["last_seen"], image_count)) + self.tracks.pop(idx) + + self.tracks_in_current_image = {} + + for res in self.pose_results: + + # for each instance + bbox = res["bbox"] + keypoints = res["keypoints"] + idx = res["track_id"] % 255 + + if idx in self.tracks_in_current_image.keys(): + prWarning("Track with idx {} (track_id {} from results) already in the current image, maybe because there are more than 255 detections in the image".format( + idx, res["track_id"] + )) + continue + + if idx not in self.tracks.keys(): + prInfo("Adding a new track with idx {}".format(idx)) + self.tracks[idx] = {} + self.tracks[idx]["last_seen"] = image_count + self.tracks[idx]["keypoints_2d"] = [] + self.tracks[idx]["images_crop"] = [] + self.tracks[idx]["head_masks"] = [] + self.tracks[idx]["norm_body_centers"] = [] + self.tracks[idx]["bboxes"] = [] + self.tracks[idx]["depth_face"] = [] + self.tracks[idx]["gaze_yaw_rad"] = [] + self.tracks[idx]["gaze_pitch_rad"] = [] + + # add keypoint to the current track + self.tracks[idx]["last_seen"] = image_count + self.tracks[idx]["keypoints_2d"].append(keypoints) + self.tracks[idx]["bboxes"].append(bbox) + + self.tracks_in_current_image[idx] = { + "right_wrist_depth": None, + "right_wrist_pose": None, + "left_wrist_depth": None, + "left_wrist_pose": None, + "depth_center": None, + "pose_center": None, + "pose_from": None, + "depth_face": None, + "gaze_yaw_rad": None, + "gaze_pitch_rad": None, + } + + # if history is long enough, 
process the trajectory with MotionBERT + if self.args.use_mb and len(self.tracks[idx]["keypoints_2d"]) >= self.args.mb_clip_len: + prInfo("Running MotionBERT for track {}".format(idx)) + + # prepare motion + motion = np.asarray(self.tracks[idx]["keypoints_2d"]) # T, 17, 3 + motion = motion[-self.args.mb_clip_len:, :, :] # keep only the required len + assert(motion.shape[1] == 17) + assert(motion.shape[2] == 3) + motion_h36 = coco2h36m(motion) # input is h36 format + motion_h36_scaled = crop_scale(motion_h36) # scale [1,1], normalize, crop + + with torch.no_grad(): + current_input = torch.Tensor(motion_h36_scaled).unsqueeze(0).cuda() + tic = time.time() + predicted_3d_pos = self.motionbert_3d_model(current_input) + tac = time.time() + prTimer("MotionBERT", tic, tac) + # root relative + predicted_3d_pos[:,:,0,:] = 0 # [1,T,17,3] + + predicted_3d_pos_np = predicted_3d_pos[0,-1,:,:].cpu().numpy() # keep only the last prediction + if "keypoints_3d" in self.tracks[idx].keys(): + self.tracks[idx]["keypoints_3d"].append(predicted_3d_pos_np) + else: + self.tracks[idx]["keypoints_3d"] = [predicted_3d_pos_np] * self.args.mb_clip_len # add fake padding at the begining so the lists align + + # print("len compare", idx, len(self.tracks[idx]["keypoints_3d"]), len(self.tracks[idx]["keypoints_2d"]), color = "yan") + + + # (run for every track or only closest ?) add input for gafa processing + # if for everyone should run in batch + if self.args.use_gafa and (len(self.tracks[idx]["images_crop"]) >= self.args.gafa_n_frames or self.args.gafa_no_history): + gafa_tic = time.time() + + # Make sure that the image is rgb and not bgr, may need conversion ! + # img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + # im_pil = Image.fromarray(img) + crop_img, head_mask, norm_body_center, head_bb_abs = self.get_gafa_input_from_current_image(rgb_array[:,:,::-1], keypoints, bbox) + + if self.args.gafa_no_history: + # no history : duplicate the last image + images = np.repeat(crop_img[np.newaxis, :, :, :], self.args.gafa_n_frames, axis = 0) # torch.Tensor of size [n_frames, 3, 256, 192] + head_masks = np.repeat(head_mask[np.newaxis, :, :, :], self.args.gafa_n_frames, axis = 0) # numpy.ndarray of size [n_frames, 3, 256, 192] + body_dvs = np.zeros((self.args.gafa_n_frames, 2)) # numpy.ndarray of size n_frames, 2 + + else: + # history : use the last saved n images + self.tracks[idx]["images_crop"].append(crop_img) + self.tracks[idx]["head_masks"].append(head_mask) + self.tracks[idx]["norm_body_centers"].append(norm_body_center) + + images = torch.stack(self.tracks[idx]["images_crop"][-self.args.gafa_n_frames:], dim = 0) # torch.Tensor of size n_frames, 3, 256, 192 + head_masks = np.asarray(self.tracks[idx]["head_masks"][-self.args.gafa_n_frames:]) # numpy.ndarray of size n_frames, 1, 256, 192 + norm_body_centers = np.asarray(self.tracks[idx]["norm_body_centers"][-self.args.gafa_n_frames:]) # numpy.ndarray of size n_frames, 2 + body_dvs = norm_body_centers - np.roll(norm_body_centers, shift=1, axis=0) # numpy.ndarray of size n_frames, 2 + + with torch.no_grad(): + # debug_dic = {} + + images = images.unsqueeze(0) #.cuda().float() + head_masks = torch.from_numpy(head_masks).unsqueeze(0) #.cuda().float() + body_dvs = torch.from_numpy(body_dvs).unsqueeze(0) #.cuda().float() + + # last_img = images[0, -1, : ,: ,:].clone() + # for i in range(7): + # images[0, i, : ,: ,:] = last_img + images = images.cuda().float() + + # last_mask = head_masks[0, -1, : ,: ,:].clone() + # for i in range(7): + # head_masks[0, i, : ,: ,:] = last_mask + 
head_masks = head_masks.cuda().float() + + # body_dvs = torch.zeros(body_dvs.shape) + body_dvs = body_dvs.cuda().float() + + # debug_dic["images"] = images.clone().cpu().numpy() + # debug_dic["head_masks"] = head_masks.clone().cpu().numpy() + # debug_dic["body_dvs"] = body_dvs.clone().cpu().numpy() + + tic = time.time() + gaze_res, head_outputs, body_outputs = self.gafa_model(images, head_masks, body_dvs) + tac = time.time() + prTimer("GAFA", tic, tac) + + # debug_dic["gaze_res"] = gaze_res["direction"].clone().cpu().numpy() + # debug_dic["head_outputs"] = head_outputs["direction"].clone().cpu().numpy() + # debug_dic["body_outputs"] = body_outputs["direction"].clone().cpu().numpy() + + # with open('./debug_dic.pickle', 'wb') as handle: + # pickle.dump(debug_dic, handle) + + # print("GAFA done", color = "green", background = "white") + + if not self.args.no_show: + self.plot_gaze_and_body_dir(gaze_res, head_outputs, body_outputs, head_bb_abs, bbox) + self.plot_xyxy_person_bbox(idx, head_bb_abs, depth_array.shape) + self.plot_gaze_text_info(gaze_res, head_outputs, body_outputs, head_bb_abs, idx) + + # For debug only + # plt.clf() + # plt.imshow(np.moveaxis(images.clone().cpu().numpy()[0,-1,:,:,:], 0, 2)) + # plt.title("Custom data") + # plt.pause(0.01) + + + gafa_tac = time.time() + prTimer("GAFA full", gafa_tic, gafa_tac) + + else: + if self.args.use_gafa and self.args.gafa_no_history: + prInfo("Did not add inputs because no GAFA history required") + elif self.args.use_gafa: + # do not accumulate if unused + crop_img, head_mask, norm_body_center, head_bb_abs = self.get_gafa_input_from_current_image(rgb_array[:,:,::-1], keypoints, bbox) + self.tracks[idx]["images_crop"].append(crop_img) + self.tracks[idx]["head_masks"].append(head_mask) + self.tracks[idx]["norm_body_centers"].append(norm_body_center) + prInfo("Didn't run GAFA yet, waiting for history") + + + if self.args.use_gaze_resnet: + with torch.no_grad(): + + # debug_dic = {"image_full": rgb_array} + + item = {"keypoints" : keypoints[:,:2]} + head_trans = head_transform_face(item) + head_bb = head_trans["bb"] + head_bb = np.array([head_bb['u'], head_bb['v'], head_bb['w'], head_bb['h']]).astype(np.int32) + + tic = time.time() + if self.args.use_face_detector: + face_bboxes, fd_kp = self.face_detector.detect(rgb_array) # or convert to bgr ?? ## only use the body detection so that we match easily... 
+ prWarning("Using face_detector does not provide any matching to the current idx of the frame, only using first detection !") + else: + face_bboxes = np.array([[head_bb[0],head_bb[1],head_bb[0]+head_bb[2],head_bb[1]+head_bb[3]]]) + tac = time.time() + prTimer("Face detetction", tic, tac) + + if (face_bboxes.shape[0] > 0): + + x_min, y_min, x_max, y_max = map(int, face_bboxes[0,:4]) + head_image = rgb_array[y_min:y_max, x_min:x_max] + + if (head_image.shape[0] > 10) and (head_image.shape[1] > 10): + head_image = pre_process(head_image) + + # For debug + # plt.clf() + # plt.imshow(np.moveaxis(head_image.clone().cpu().numpy()[0,:,:,:], 0, 2)) + # plt.title("Custom data") + # plt.pause(0.01) + + # debug_dic["image"] = head_image + + pitch, yaw = self.gaze_estimation_model(head_image) + + # debug_dic["pitch"] = pitch + # debug_dic["yaw"] = yaw + + # with open('debuig_dic.pkl', 'wb') as fp: + # pickle.dump(debug_dic, fp) + + # Softmax beofre sum + pitch_predicted, yaw_predicted = F.softmax(pitch, dim=1), F.softmax(yaw, dim=1) + + # Mapping from binned (0 to 90) to angles (-180 to 180) or (0 to 28) to angles (-42, 42) + idx_tensor = torch.arange(90, device=self.args.device.lower(), dtype=torch.float32) + + pitch_predicted = torch.sum(pitch_predicted * idx_tensor, dim=1) * 4 - 180 + yaw_predicted = torch.sum(yaw_predicted * idx_tensor, dim=1) * 4 - 180 + + pitch_predicted = pitch_predicted.cpu().numpy() + yaw_predicted = yaw_predicted.cpu().numpy() + + # Degrees to Radians + pitch_predicted_rad = np.radians(pitch_predicted) + yaw_predicted_rad = np.radians(yaw_predicted) + + self.tracks_in_current_image[idx]["gaze_pitch_rad"] = pitch_predicted_rad + self.tracks_in_current_image[idx]["gaze_yaw_rad"] = yaw_predicted_rad + self.tracks[idx]["gaze_pitch_rad"].append(pitch_predicted_rad) + self.tracks[idx]["gaze_yaw_rad"].append(yaw_predicted_rad) + + self.plot_gaze_from_pitch_yaw(pitch_predicted_rad[0], yaw_predicted_rad[0], head_bb, idx, keypoints) + self.plot_gaze_angle_info(pitch_predicted[0], yaw_predicted[0], head_bb, idx) + + # get face depth + nose = keypoints[0,:2].astype(np.uint32) + leye = keypoints[1,:2].astype(np.uint32) + reye = keypoints[2,:2].astype(np.uint32) + + depth_nose = depth_array[np.clip(nose[1], 0, depth_array.shape[0] - 1), np.clip(nose[0], 0, depth_array.shape[1] - 1)] + depth_leye = depth_array[np.clip(leye[1], 0, depth_array.shape[0] - 1), np.clip(leye[0], 0, depth_array.shape[1] - 1)] + depth_reye = depth_array[np.clip(reye[1], 0, depth_array.shape[0] - 1), np.clip(reye[0], 0, depth_array.shape[1] - 1)] + + depth_face = np.median([depth_nose, depth_leye, depth_reye]) + + self.tracks_in_current_image[idx]["depth_face"] = depth_face + self.tracks[idx]["depth_face"].append(depth_face) + + self.plot_overlay_face_attention(self.tracks[idx], face_bboxes[0,:4]) + + + + if self.args.use_six_d_rep: + with torch.no_grad(): + + # debug_dic = {"image_full": rgb_array} + + item = {"keypoints" : keypoints[:,:2]} + head_trans = head_transform(item) + head_bb = head_trans["bb"] + head_bb = np.array([head_bb['u'], head_bb['v'], head_bb['w'], head_bb['h']]).astype(np.int32) + + tic = time.time() + if self.args.use_face_detector: + face_bboxes = self.sixdrep_detector.detect(rgb_array, (640,640)) # or convert to bgr ?? ## only use the body detection so that we match easily... 
+ face_bboxes = face_bboxes.astype('int32') + prWarning("Using face_detector does not provide any matching to the current idx of the frame, only using first detection !") + else: + face_bboxes = np.array([[head_bb[0],head_bb[1],head_bb[0]+head_bb[2],head_bb[1]+head_bb[3]]]) + tac = time.time() + prTimer("Face detetction", tic, tac) + + facing_camera = ((keypoints[3,0] - keypoints[4,0]) > 20) + + if (face_bboxes.shape[0] > 0) and facing_camera: + x_min = face_bboxes[0,0] + y_min = face_bboxes[0,1] + x_max = face_bboxes[0,2] + y_max = face_bboxes[0,3] + box_w = abs(x_max - x_min) + box_h = abs(y_max - y_min) + + x_min = max(0, x_min - int(0.2 * box_h)) + y_min = max(0, y_min - int(0.2 * box_w)) + x_max = x_max + int(0.2 * box_h) + y_max = y_max + int(0.2 * box_w) + + head_image = rgb_array[y_min:y_max, x_min:x_max, :] + + if (head_image.shape[0] > 10) and (head_image.shape[1] > 10): + + + head_image = PILImage.fromarray(head_image) + head_image = head_image.convert('RGB') + head_image = sixdreptransform(head_image) + head_image = head_image.unsqueeze(0) + + head_image = head_image.cuda() + head_image = head_image.half() + + tic = time.time() + output = self.sixdrep_model(head_image) + tac = time.time() + prTimer("SixDRep", tic, tac) + + output = compute_euler(output) * 180 / np.pi + + p_output = output[:, 0].cpu() + y_output = output[:, 1].cpu() + r_output = output[:, 2].cpu() + + self.tracks_in_current_image[idx]["gaze_pitch_rad"] = np.deg2rad(p_output.item()) + self.tracks_in_current_image[idx]["gaze_yaw_rad"] = np.deg2rad(y_output.item()) + self.tracks[idx]["gaze_pitch_rad"].append(np.deg2rad(p_output.item())) + self.tracks[idx]["gaze_yaw_rad"].append(np.deg2rad(y_output.item())) + + self.plot_gaze_from_pitch_yaw(np.deg2rad(y_output.item()), np.deg2rad(p_output.item()), head_bb, idx, keypoints) # invert pitch compared to resnet + # self.plot_gaze_angle_info(y_output.item(), p_output.item(), head_bb, idx) # invert pitch compared to resnet + + # get face depth + nose = keypoints[0,:2].astype(np.uint32) + leye = keypoints[1,:2].astype(np.uint32) + reye = keypoints[2,:2].astype(np.uint32) + + depth_nose = depth_array[np.clip(nose[1], 0, depth_array.shape[0] - 1), np.clip(nose[0], 0, depth_array.shape[1] - 1)] + depth_leye = depth_array[np.clip(leye[1], 0, depth_array.shape[0] - 1), np.clip(leye[0], 0, depth_array.shape[1] - 1)] + depth_reye = depth_array[np.clip(reye[1], 0, depth_array.shape[0] - 1), np.clip(reye[0], 0, depth_array.shape[1] - 1)] + + depth_face = np.median([depth_nose, depth_leye, depth_reye]) + + self.tracks_in_current_image[idx]["depth_face"] = depth_face + self.tracks[idx]["depth_face"].append(depth_face) + + self.plot_overlay_face_attention_6d(self.tracks[idx], face_bboxes[0,:4], keypoints) + else: + self.tracks[idx]["gaze_pitch_rad"].append(np.deg2rad(180)) + self.tracks[idx]["gaze_yaw_rad"].append(np.deg2rad(180)) + + nose = keypoints[0,:2].astype(np.uint32) + leye = keypoints[1,:2].astype(np.uint32) + reye = keypoints[2,:2].astype(np.uint32) + + depth_nose = depth_array[np.clip(nose[1], 0, depth_array.shape[0] - 1), np.clip(nose[0], 0, depth_array.shape[1] - 1)] + depth_leye = depth_array[np.clip(leye[1], 0, depth_array.shape[0] - 1), np.clip(leye[0], 0, depth_array.shape[1] - 1)] + depth_reye = depth_array[np.clip(reye[1], 0, depth_array.shape[0] - 1), np.clip(reye[0], 0, depth_array.shape[1] - 1)] + depth_face = np.median([depth_nose, depth_leye, depth_reye]) + + self.tracks[idx]["depth_face"].append(depth_face) + + else: + 
self.tracks[idx]["gaze_pitch_rad"].append(np.deg2rad(180)) + self.tracks[idx]["gaze_yaw_rad"].append(np.deg2rad(180)) + + nose = keypoints[0,:2].astype(np.uint32) + leye = keypoints[1,:2].astype(np.uint32) + reye = keypoints[2,:2].astype(np.uint32) + + depth_nose = depth_array[np.clip(nose[1], 0, depth_array.shape[0] - 1), np.clip(nose[0], 0, depth_array.shape[1] - 1)] + depth_leye = depth_array[np.clip(leye[1], 0, depth_array.shape[0] - 1), np.clip(leye[0], 0, depth_array.shape[1] - 1)] + depth_reye = depth_array[np.clip(reye[1], 0, depth_array.shape[0] - 1), np.clip(reye[0], 0, depth_array.shape[1] - 1)] + depth_face = np.median([depth_nose, depth_leye, depth_reye]) + + self.tracks[idx]["depth_face"].append(depth_face) + + # Draw bb + bbox[4] *= 100 + bbox = bbox.astype(np.int32) + + if not self.args.no_show: + self.plot_xyxy_person_bbox(idx, bbox, depth_array.shape, self.tracks[idx]) + + # return the list of body center joints and also fill self.tracks_in_current_image[idx] + body_center_joints = self.process_keypoints(keypoints, depth_array, idx) + + # find the body center + if len(body_center_joints) == 4: + # if we managed to find the 4 points of the torso, search on the torso + body_center_joints = np.array( + body_center_joints + ) # lsho, rsho, lhip, rhip + lsho = body_center_joints[0, :] + rsho = body_center_joints[1, :] + lhip = body_center_joints[2, :] + rhip = body_center_joints[3, :] + + depths_torso, poses_torso = self.get_depth_and_poses_of_torso(depth_array, lsho, rsho, lhip, rhip, idx) + + # redraw bb with more info + if not self.args.no_show: + self.plot_xyxy_person_bbox(idx, bbox, depth_array.shape, self.tracks[idx], poses_torso) + + if len(depths_torso) > 3: + # at least 4 points to average decently + depth_body = np.array(depths_torso).mean() + pose_body = np.array(poses_torso).mean(axis=0) + self.tracks_in_current_image[idx][ + "depth_center" + ] = depth_body # mm + self.tracks_in_current_image[idx][ + "pose_center" + ] = pose_body # m + self.tracks_in_current_image[idx]["pose_from"] = "torso" + + # just for drawing + body_center = np.mean(body_center_joints, axis=0) + # Draw center of body + body_center = (int(body_center[0]), int(body_center[1])) + + if not self.light_display and not self.args.no_show: + self.plot_body_pose_data(body_center, depth_body, pose_body, idx) + + else: + # if we did not managed to find the 4 points of the torso, search in the bbox + prWarning( + "Can't use body center from shoulders and hips for track {} : do nothing".format( + idx + ) + ) + + # draw skeleton + if not self.args.no_show and not self.args.light_display: + self.plot_skeleton_2d(keypoints, idx) + + min_depth = 1e6 # mm + min_depth_idx = -1 + for idx, track_info in self.tracks_in_current_image.items(): + depth = track_info["depth_center"] + if depth is not None: + if depth < min_depth: + min_depth = depth + min_depth_idx = idx + + if min_depth_idx != -1: + pose_closest = self.tracks_in_current_image[min_depth_idx][ + "pose_center" + ] + yaw_closest_gaze = self.tracks_in_current_image[min_depth_idx]["gaze_yaw_rad"] + if yaw_closest_gaze is None: + yaw_closest = np.deg2rad(-180.0) + else: + yaw_closest = yaw_closest_gaze + prInfo( + "Using track {} as it is the closest".format(min_depth_idx) + ) + tf_msg = TransformStamped() + tf_msg.child_frame_id = args.namespace + "/human" + tf_msg.header.seq = current_frame_processing + tf_msg.header.stamp = current_timestamp + tf_msg.header.frame_id = current_frame_id + # adapt to robot camera frame convention on the robot + 
tf_msg.transform.translation.x = pose_closest[2] + tf_msg.transform.translation.y = -pose_closest[0] + tf_msg.transform.translation.z = -pose_closest[1] + + angle = np.arctan( + tf_msg.transform.translation.y + / tf_msg.transform.translation.x + ) + + # Rotate to have 'human' x axis looking towards the robot + rot = Rotation() + rot.DoRotZ(angle) + rot.DoRotY(np.pi) + qx, qy, qz, qw = rot.GetQuaternion() + + tf_msg.transform.rotation.x = qx + tf_msg.transform.rotation.y = qy + tf_msg.transform.rotation.z = qz + tf_msg.transform.rotation.w = qw + + + dist = np.sqrt( + tf_msg.transform.translation.x**2 + tf_msg.transform.translation.y**2 + tf_msg.transform.translation.z**2 + ) + + if dist < self.args.max_distance: # meters + self.goal_pub.publish(tf_msg) + prSuccess( + "Publishing coordinates {:.2f} {:.2f} {:.2f}".format( + pose_closest[0], pose_closest[1], pose_closest[2] + ) + ) + + self.tf_br.sendTransform(tf_msg) + + prSuccess( + "Publishing coordinates {:.2f} {:.2f} {:.2f} and yaw {:.2f}".format( + pose_closest[0], pose_closest[1], pose_closest[2], np.rad2deg(yaw_closest) + ) + ) + + self.tf_br.sendTransform(tf_msg) + + if not self.args.no_show: + # self.plot_det_text_info(pose_closest) + pass + + else: + + if not self.args.no_show: + # self.plot_det_text_info(None) + pass + + self.last_inferred_seq = current_frame_processing + + if self.args.save and not self.args.no_show: + self.save_output_image(image_count, image_seq_unique, timestamp) + + else: + prWarning( + "No inference because the current RGB frame has already been processed last_inferred_seq {} vs rgb_current_seq {}".format( + self.last_inferred_seq, self.rgb_current_seq + ) + ) + + if not self.args.no_show: + depth_array_disp = depth_array.copy() + depth_array_disp[depth_array_disp > 3000] = 3000 + depth_array_norm = ((depth_array_disp - depth_array_disp.min())) / ( + depth_array_disp.max() - depth_array_disp.min() + ) + depth_array_norm = depth_array_norm * 255 + depth_array_norm = depth_array_norm.astype(np.uint8) + depth_array_norm_colored = ( + self.depth_cmap[depth_array_norm] * 255 + ).astype(np.uint8) + + if self.args.save or self.args.light_save: + self.save_pcl(image_count, image_seq_unique, timestamp) + + if self.vis_img is not None: + full_display_array = np.zeros( + (rgb_array.shape[0] * 2, rgb_array.shape[1], 3), dtype=np.uint8 + ) + full_display_array[: rgb_array.shape[0], :, :] = self.vis_img + full_display_array[rgb_array.shape[0] :, :, :] = ( + depth_array_norm_colored + ) + + if not self.args.no_show: + cv2.imshow("RGBD window", full_display_array) + cv2.waitKey(3) + + else: + print("Images are None !") + + self.loop_rate.sleep() + + +if __name__ == "__main__": + + ## Parser with params + parser = ArgumentParser() + parser.add_argument( + "--det_config", + type=str, + default="./configs/detection/yolov3_d53_320_273e_coco.py", + help="Config file for detection | default = %(default)s", + ) + parser.add_argument( + "--det_checkpoint", + type=str, + default="./models/yolov3_d53_320_273e_coco-421362b6.pth", + help="Checkpoint file for detection | default = %(default)s", + ) + parser.add_argument( + "--pose_config", + type=str, + default="./configs/pose/ViTPose_small_coco_256x192.py", + help="Config file for pose | default = %(default)s", + ) + parser.add_argument( + "--pose_checkpoint", + type=str, + default="./models/vitpose_small.pth", + help="Checkpoint file for pose | default = %(default)s", + ) + parser.add_argument( + "--device", + default="cuda:0", + help="Device used for inference | default = 
%(default)s", + ) + parser.add_argument( + "--det_cat_id", + type=int, + default=1, + help="Category id for bounding box detection model (person) | default = %(default)s", + ) + parser.add_argument( + "--bbox_thr", + type=float, + default=0.3, + help="Bounding box score threshold | default = %(default)s", + ) + parser.add_argument( + "--kpt_thr", + type=float, + default=0.3, + help="Keypoint score threshold | default = %(default)s", + ) + parser.add_argument( + "--tracking_thr", + type=float, + default=0.3, + help="Tracking threshold | default = %(default)s", + ) + parser.add_argument( + "--euro", action="store_true", help="Using One_Euro_Filter for smoothing" + ) + # parser.add_argument('--rgb_topic', default = "orbbec/rgb", type=str, help='ROS topic for RGB image') + # parser.add_argument('--depth_topic', default = "orbbec/depth", type=str, help='ROS topic for depth image') + # parser.add_argument('--pcl_topic', default = "orbbec/pcl", type=str, help='ROS topic for pcl') + parser.add_argument( + "--namespace", + default="orbbec", + type=str, + help="ROS topic namespace for rgb, depth, pcl | default = %(default)s", + ) + parser.add_argument( + "--no_show", + action="store_true", + default=False, + help="whether to show visualizations | default = %(default)s", + ) + parser.add_argument( + "--save", + action="store_true", + default=False, + help="whether to save images (rgb and d and predictions and pcl) | default = %(default)s", + ) + parser.add_argument( + "--flip", + action="store_true", + default=False, + help="whether to flip images | default = %(default)s", + ) + parser.add_argument( + "--light_save", + action="store_true", + default=False, + help="whether to save only rgb and pcl (not optimized use the light_save of visualizer for optimized saving) | default = %(default)s", + ) + parser.add_argument( + "--display_all_detection", + "-dad", + action="store_true", + default=False, + help="whether to display all detections or only human | default = %(default)s", + ) + parser.add_argument( + "--light_display", + "-ld", + action="store_true", + default=False, + help="whether to display only skeletons | default = %(default)s", + ) + parser.add_argument("--fps", type=int, default=10, help="Node and recording fps") + parser.add_argument( + "--depth_cmap", + default="jet", + type=str, + help="mpl colormap for depth image | default = %(default)s", + ) + + parser.add_argument('--mb_3d_config', type=str, default = "./configs/pose3d/MB_ft_h36m.yaml", help='Config file for 3D poses | default = %(default)s') + parser.add_argument('--mb_checkpoint', type=str, default = "./checkpoint/pose3d/MB_train_h36m/best_epoch.bin", help='Checkpoint file for 3D poses | default = %(default)s') + parser.add_argument( + '--mb_clip_len', + type=int, + default=10, + help='Number of past frames to use for MotionBERT (default in model is 243) | default = %(default)s') + parser.add_argument( + '--max_frames_remove_tracks', + type=int, + default=2, + help='Number frames without the track present to keep going before removing a track | default = %(default)s') + parser.add_argument( + "--use_mb", + "-mb", + action="store_true", + default=False, + help="whether to use MotionBERT 3D Lifter | default = %(default)s", + ) + + parser.add_argument('--gafa_checkpoint', type=str, default = "./checkpoint/gafa/GazeNet_PyTorch.pt", help='Checkpoint file for 3D gaze estimation GAFA | default = %(default)s') + parser.add_argument( + '--gafa_n_frames', + type=int, + default=7, + help='Number of past frames to use for GAFA (default in 
model is 7) | default = %(default)s') + parser.add_argument( + "--use_gafa", + "-gafa", + action="store_true", + default=False, + help="whether to use GAFA 3D Gaze Estimation | default = %(default)s", + ) + parser.add_argument( + "--gafa_no_history", + "-gnh", + action="store_true", + default=False, + help="whether to use history in the GAFA sequence or fake it by copying last image | default = %(default)s", + ) + + parser.add_argument( + "--use_gaze_resnet", + "-resnet", + action="store_true", + default=False, + help="whether to use Gaze ResNet18 3D Gaze Estimation | default = %(default)s", + ) + parser.add_argument( + "--use_face_detector", + "-ufd", + action="store_true", + default=False, + help="whether to use Face Detector before gaze ResNet18 3D Gaze Estimation, or juste use bbox from keypoints | default = %(default)s", + ) + parser.add_argument( + "--use_six_d_rep", + "-sixdrep", + action="store_true", + default=False, + help="whether to use 6D rep head pose estimation instead of gaze estimation | default = %(default)s", + ) + parser.add_argument( + "--bb_min_threshold", + "-bbmt", + type=int, + default=0, + help="Minimum height of bb in pixels | default = %(default)s", + ) + parser.add_argument( + "--max_distance", + type=float, + default=2.5, + help="Maximum distance allowed for publishing human pose | default = %(default)s", + ) + + args = parser.parse_args() + + assert has_mmdet, "Please install mmdet to run the demo." + assert args.det_config is not None + assert args.det_checkpoint is not None + + if args.use_mb: + assert(has_mb), "Option --use_mb requires MotionBERT install" + + if args.use_gafa: + assert(args.use_gaze_resnet == False), "Option --use_gafa and --use_gaze_resnet are not compatible" + assert(args.use_six_d_rep == False), "Option --use_gafa and --use_six_d_rep are not compatible" + + if args.use_gaze_resnet: + assert(args.use_gafa == False), "Option --use_gaze_resnet and --use_gafa are not compatible" + assert(args.use_six_d_rep == False), "Option --use_gaze_resnet and --use_six_d_rep are not compatible" + + if args.use_six_d_rep: + assert(args.use_gaze_resnet == False), "Option --use_six_d_rep and --use_gaze_resnet are not compatible" + assert(args.use_gafa == False), "Option --use_six_d_rep and --use_gafa are not compatible" + + if args.use_gafa: + assert(has_gafa), "Option --use_gafa requires GAFA install" + + if args.use_six_d_rep: + assert(has_sixdrep), "Option --use_six_d_rep requires 6D Rep install" + + if args.use_gaze_resnet: + assert(has_gaze_est), "Option --use_gaze_resnet requires Gaze Estimation" + + prInfo("Loaded with args : {}".format(args)) + + rospy.init_node("python_orbbec_inference", anonymous=True) + my_node = InferenceNodeRGBD(args) + my_node.start() + cv2.destroyAllWindows() diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..13f12b8 --- /dev/null +++ b/run.sh @@ -0,0 +1 @@ +python rgbd_detect_3d_dir.py --flip -bbmt 200 --namespace orbbec --max_distance 2.5 -sixdrep \ No newline at end of file diff --git a/sixdrep/util.py b/sixdrep/util.py new file mode 100644 index 0000000..5f078be --- /dev/null +++ b/sixdrep/util.py @@ -0,0 +1,442 @@ +import math +import os +import random + +import cv2 +import numpy +import torch +from PIL import Image +from PIL import ImageEnhance + + +def setup_seed(): + """ + Setup random seed. 
+ """ + random.seed(0) + numpy.random.seed(0) + torch.manual_seed(0) + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + + +def setup_multi_processes(): + """ + Setup multi-processing environment variables. + """ + import cv2 + from os import environ + from platform import system + + # set multiprocess start method as `fork` to speed up the training + if system() != 'Windows': + torch.multiprocessing.set_start_method('fork', force=True) + + # disable opencv multithreading to avoid system being overloaded + cv2.setNumThreads(0) + + # setup OMP threads + if 'OMP_NUM_THREADS' not in environ: + environ['OMP_NUM_THREADS'] = '1' + + # setup MKL threads + if 'MKL_NUM_THREADS' not in environ: + environ['MKL_NUM_THREADS'] = '1' + + +def plot_lr(args, optimizer, scheduler): + import copy + from matplotlib import pyplot + + optimizer = copy.copy(optimizer) + scheduler = copy.copy(scheduler) + + y = [] + for epoch in range(args.epochs): + y.append(optimizer.param_groups[-1]['lr']) + scheduler.step(epoch + 1, optimizer) + + pyplot.plot(y, '.-', label='LR') + pyplot.xlabel('epoch') + pyplot.ylabel('LR') + pyplot.grid() + pyplot.xlim(0, args.epochs) + pyplot.ylim(0) + pyplot.savefig('./weights/lr.png', dpi=200) + pyplot.close() + + +def strip_optimizer(filename): + x = torch.load(filename, map_location=torch.device('cpu')) + x['model'].half() # to FP16 + for p in x['model'].parameters(): + p.requires_grad = False + torch.save(x, filename) + + +def resample(): + return random.choice((Image.BILINEAR, Image.BICUBIC)) + + +def load_weights(model, ckpt): + dst = model.state_dict() + src = torch.load(ckpt)['model'] + src = src.cpu().float().state_dict() + + ckpt = {} + for k, v in src.items(): + if k in dst and v.shape == dst[k].shape: + ckpt[k] = v + model.load_state_dict(state_dict=ckpt, strict=False) + return model + + +def compute_euler(matrices): + shape = matrices.shape + sy = matrices[:, 0, 0] * matrices[:, 0, 0] + matrices[:, 1, 0] * matrices[:, 1, 0] + sy = torch.sqrt(sy) + singular = (sy < 1E-6).float() + + x = torch.atan2(matrices[:, 2, 1], matrices[:, 2, 2]) + y = torch.atan2(-matrices[:, 2, 0], sy) + z = torch.atan2(matrices[:, 1, 0], matrices[:, 0, 0]) + + xs = torch.atan2(-matrices[:, 1, 2], matrices[:, 1, 1]) + ys = torch.atan2(-matrices[:, 2, 0], sy) + zs = torch.zeros_like(z) + + device = matrices.device + out_euler = torch.zeros(shape[0], 3, device=device) + out_euler[:, 0] = x * (1 - singular) + xs * singular + out_euler[:, 1] = y * (1 - singular) + ys * singular + out_euler[:, 2] = z * (1 - singular) + zs * singular + return out_euler + + +def params(model, lr): + return [{'params': model.p1.parameters(), 'lr': lr}, + {'params': model.p2.parameters(), 'lr': lr}, + {'params': model.p3.parameters(), 'lr': lr}, + {'params': model.p4.parameters(), 'lr': lr}, + {'params': model.p5.parameters(), 'lr': lr}, + {'params': model.fc.parameters(), 'lr': lr * 10}] + + +class Resize: + def __init__(self, size: int): + self.size = size + + def __call__(self, image): + size = self.size + i, j, h, w = self.params(image.size) + image = image.crop((j, i, j + w, i + h)) + return image.resize([size, size], resample()) + + @staticmethod + def params(size): + scale = (0.8, 1.0) + ratio = (3. / 4., 4. / 3.) 
+ for _ in range(10): + target_area = random.uniform(*scale) * size[0] * size[1] + aspect_ratio = math.exp(random.uniform(*(math.log(ratio[0]), math.log(ratio[1])))) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if w <= size[0] and h <= size[1]: + i = random.randint(0, size[1] - h) + j = random.randint(0, size[0] - w) + return i, j, h, w + + if (size[0] / size[1]) < min(ratio): + w = size[0] + h = int(round(w / min(ratio))) + elif (size[0] / size[1]) > max(ratio): + h = size[1] + w = int(round(h * max(ratio))) + else: + w = size[0] + h = size[1] + i = (size[1] - h) // 2 + j = (size[0] - w) // 2 + return i, j, h, w + + +class ColorJitter: + def __init__(self, + p: float = 0.5, + brightness: float = 0.1, + saturation: float = 0.1, + contrast: float = 0.1): + self.brightness = (1 - brightness, 1 + brightness) + self.saturation = (1 - saturation, 1 + saturation) + self.contrast = (1 - contrast, 1 + contrast) + self.indices = [0, 1, 2] + self.p = p + + def __call__(self, image): + if random.random() > self.p: + return image + + b = random.uniform(self.brightness[0], self.brightness[1]) + s = random.uniform(self.saturation[0], self.saturation[1]) + c = random.uniform(self.contrast[0], self.contrast[1]) + + random.shuffle(self.indices) + + for i in self.indices: + if i == 0: + image = ImageEnhance.Brightness(image).enhance(b) # brightness + elif i == 1: + image = ImageEnhance.Contrast(image).enhance(c) # contrast + elif i == 2: + image = ImageEnhance.Color(image).enhance(s) # saturation + + return image + + +class AverageMeter: + def __init__(self): + self.num = 0 + self.sum = 0 + self.avg = 0 + + def update(self, v, n): + self.num = self.num + n + self.sum = self.sum + v * n + self.avg = self.sum / self.num + + +def plot_pose_cube(image, yaw, pitch, roll, tdx=None, tdy=None, size=150.): + p = pitch * numpy.pi / 180 + y = -(yaw * numpy.pi / 180) + r = roll * numpy.pi / 180 + if (tdx is not None) and (tdy is not None): + face_x = tdx - 0.50 * size + face_y = tdy - 0.50 * size + else: + height, width = image.shape[:2] + face_x = width / 2 - 0.5 * size + face_y = height / 2 - 0.5 * size + + x1 = size * (math.cos(y) * math.cos(r)) + face_x + y1 = size * (math.cos(p) * math.sin(r) + math.cos(r) * math.sin(p) * math.sin(y)) + face_y + x2 = size * (-math.cos(y) * math.sin(r)) + face_x + y2 = size * (math.cos(p) * math.cos(r) - math.sin(p) * math.sin(y) * math.sin(r)) + face_y + x3 = size * (math.sin(y)) + face_x + y3 = size * (-math.cos(y) * math.sin(p)) + face_y + + # Draw base in red + cv2.line(image, (int(face_x), int(face_y)), (int(x1), int(y1)), (0, 0, 255), 3) + cv2.line(image, (int(face_x), int(face_y)), (int(x2), int(y2)), (0, 0, 255), 3) + cv2.line(image, (int(x2), int(y2)), (int(x2 + x1 - face_x), int(y2 + y1 - face_y)), (0, 0, 255), 3) + cv2.line(image, (int(x1), int(y1)), (int(x1 + x2 - face_x), int(y1 + y2 - face_y)), (0, 0, 255), 3) + # Draw pillars in blue + cv2.line(image, (int(face_x), int(face_y)), (int(x3), int(y3)), (255, 0, 0), 2) + cv2.line(image, (int(x1), int(y1)), (int(x1 + x3 - face_x), int(y1 + y3 - face_y)), (255, 0, 0), 2) + cv2.line(image, (int(x2), int(y2)), (int(x2 + x3 - face_x), int(y2 + y3 - face_y)), (255, 0, 0), 2) + cv2.line(image, (int(x2 + x1 - face_x), int(y2 + y1 - face_y)), + (int(x3 + x1 + x2 - 2 * face_x), int(y3 + y2 + y1 - 2 * face_y)), (255, 0, 0), 2) + # Draw top in green + cv2.line(image, (int(x3 + x1 - face_x), int(y3 + y1 - face_y)), + (int(x3 + x1 + x2 - 2 * face_x), int(y3 + y2 + 
y1 - 2 * face_y)), (0, 255, 0), 2) + cv2.line(image, (int(x2 + x3 - face_x), int(y2 + y3 - face_y)), + (int(x3 + x1 + x2 - 2 * face_x), int(y3 + y2 + y1 - 2 * face_y)), (0, 255, 0), 2) + cv2.line(image, (int(x3), int(y3)), (int(x3 + x1 - face_x), int(y3 + y1 - face_y)), (0, 255, 0), 2) + cv2.line(image, (int(x3), int(y3)), (int(x3 + x2 - face_x), int(y3 + y2 - face_y)), (0, 255, 0), 2) + + return image + + +def distance2box(points, distance, max_shape=None): + x1 = points[:, 0] - distance[:, 0] + y1 = points[:, 1] - distance[:, 1] + x2 = points[:, 0] + distance[:, 2] + y2 = points[:, 1] + distance[:, 3] + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1]) + y1 = y1.clamp(min=0, max=max_shape[0]) + x2 = x2.clamp(min=0, max=max_shape[1]) + y2 = y2.clamp(min=0, max=max_shape[0]) + return numpy.stack([x1, y1, x2, y2], axis=-1) + + +def distance2kps(points, distance, max_shape=None): + outputs = [] + for i in range(0, distance.shape[1], 2): + p_x = points[:, i % 2] + distance[:, i] + p_y = points[:, i % 2 + 1] + distance[:, i + 1] + if max_shape is not None: + p_x = p_x.clamp(min=0, max=max_shape[1]) + p_y = p_y.clamp(min=0, max=max_shape[0]) + outputs.append(p_x) + outputs.append(p_y) + return numpy.stack(outputs, axis=-1) + + +class FaceDetector: + def __init__(self, onnx_path=None, session=None): + from onnxruntime import InferenceSession + self.session = session + + self.batched = False + if self.session is None: + assert onnx_path is not None + assert os.path.exists(onnx_path) + self.session = InferenceSession(onnx_path, + providers=['CUDAExecutionProvider']) + self.nms_thresh = 0.4 + self.center_cache = {} + input_cfg = self.session.get_inputs()[0] + input_shape = input_cfg.shape + if isinstance(input_shape[2], str): + self.input_size = None + else: + self.input_size = tuple(input_shape[2:4][::-1]) + input_name = input_cfg.name + outputs = self.session.get_outputs() + if len(outputs[0].shape) == 3: + self.batched = True + output_names = [] + for output in outputs: + output_names.append(output.name) + self.input_name = input_name + self.output_names = output_names + self.use_kps = False + self._num_anchors = 1 + if len(outputs) == 6: + self.fmc = 3 + self._feat_stride_fpn = [8, 16, 32] + self._num_anchors = 2 + elif len(outputs) == 9: + self.fmc = 3 + self._feat_stride_fpn = [8, 16, 32] + self._num_anchors = 2 + self.use_kps = True + elif len(outputs) == 10: + self.fmc = 5 + self._feat_stride_fpn = [8, 16, 32, 64, 128] + self._num_anchors = 1 + elif len(outputs) == 15: + self.fmc = 5 + self._feat_stride_fpn = [8, 16, 32, 64, 128] + self._num_anchors = 1 + self.use_kps = True + + def forward(self, x, score_thresh): + scores_list = [] + bboxes_list = [] + points_list = [] + input_size = tuple(x.shape[0:2][::-1]) + blob = cv2.dnn.blobFromImage(x, + 1.0 / 128, + input_size, + (127.5, 127.5, 127.5), swapRB=True) + outputs = self.session.run(self.output_names, {self.input_name: blob}) + input_height = blob.shape[2] + input_width = blob.shape[3] + fmc = self.fmc + for idx, stride in enumerate(self._feat_stride_fpn): + if self.batched: + scores = outputs[idx][0] + boxes = outputs[idx + fmc][0] + boxes = boxes * stride + else: + scores = outputs[idx] + boxes = outputs[idx + fmc] + boxes = boxes * stride + + height = input_height // stride + width = input_width // stride + key = (height, width, stride) + if key in self.center_cache: + anchor_centers = self.center_cache[key] + else: + anchor_centers = numpy.stack(numpy.mgrid[:height, :width][::-1], axis=-1) + anchor_centers = 
anchor_centers.astype(numpy.float32) + + anchor_centers = (anchor_centers * stride).reshape((-1, 2)) + if self._num_anchors > 1: + anchor_centers = numpy.stack([anchor_centers] * self._num_anchors, axis=1) + anchor_centers = anchor_centers.reshape((-1, 2)) + if len(self.center_cache) < 100: + self.center_cache[key] = anchor_centers + + pos_indices = numpy.where(scores >= score_thresh)[0] + bboxes = distance2box(anchor_centers, boxes) + pos_scores = scores[pos_indices] + pos_bboxes = bboxes[pos_indices] + scores_list.append(pos_scores) + bboxes_list.append(pos_bboxes) + return scores_list, bboxes_list + + def detect(self, image, input_size=None, score_threshold=0.5, max_num=0, metric='default'): + assert input_size is not None or self.input_size is not None + input_size = self.input_size if input_size is None else input_size + image_ratio = float(image.shape[0]) / image.shape[1] + model_ratio = float(input_size[1]) / input_size[0] + if image_ratio > model_ratio: + new_height = input_size[1] + new_width = int(new_height / image_ratio) + else: + new_width = input_size[0] + new_height = int(new_width * image_ratio) + det_scale = float(new_height) / image.shape[0] + resized_img = cv2.resize(image, (new_width, new_height)) + det_img = numpy.zeros((input_size[1], input_size[0], 3), dtype=numpy.uint8) + det_img[:new_height, :new_width, :] = resized_img + + scores_list, bboxes_list = self.forward(det_img, score_threshold) + + scores = numpy.vstack(scores_list) + scores_ravel = scores.ravel() + order = scores_ravel.argsort()[::-1] + bboxes = numpy.vstack(bboxes_list) / det_scale + pre_det = numpy.hstack((bboxes, scores)).astype(numpy.float32, copy=False) + pre_det = pre_det[order, :] + keep = self.nms(pre_det) + det = pre_det[keep, :] + if 0 < max_num < det.shape[0]: + area = (det[:, 2] - det[:, 0]) * (det[:, 3] - det[:, 1]) + img_center = image.shape[0] // 2, image.shape[1] // 2 + offsets = numpy.vstack([(det[:, 0] + det[:, 2]) / 2 - img_center[1], + (det[:, 1] + det[:, 3]) / 2 - img_center[0]]) + offset_dist_squared = numpy.sum(numpy.power(offsets, 2.0), 0) + if metric == 'max': + values = area + else: + values = area - offset_dist_squared * 2.0 # some extra weight on the centering + index = numpy.argsort(values)[::-1] # some extra weight on the centering + index = index[0:max_num] + det = det[index, :] + return det + + def nms(self, outputs): + thresh = self.nms_thresh + x1 = outputs[:, 0] + y1 = outputs[:, 1] + x2 = outputs[:, 2] + y2 = outputs[:, 3] + scores = outputs[:, 4] + + order = scores.argsort()[::-1] + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = numpy.maximum(x1[i], x1[order[1:]]) + yy1 = numpy.maximum(y1[i], y1[order[1:]]) + xx2 = numpy.minimum(x2[i], x2[order[1:]]) + yy2 = numpy.minimum(y2[i], y2[order[1:]]) + + w = numpy.maximum(0.0, xx2 - xx1 + 1) + h = numpy.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + indices = numpy.where(ovr <= thresh)[0] + order = order[indices + 1] + + return keep diff --git a/sixdrep/utils.py b/sixdrep/utils.py new file mode 100644 index 0000000..b6366b6 --- /dev/null +++ b/sixdrep/utils.py @@ -0,0 +1,8 @@ +from torch.utils import data +from torchvision import transforms + +normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) +sixdreptransform = transforms.Compose([transforms.Resize(224 + 32), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize]) \ No newline at end of file diff 
--git a/utils.py b/utils.py new file mode 100644 index 0000000..67885e6 --- /dev/null +++ b/utils.py @@ -0,0 +1,501 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +import os +import numpy as np +import matplotlib.pyplot as plt +from print_color import print +import copy + +from functools import wraps +import time +import cv2 + +TEXT_SCALE = 1.0 + +def get_mpl_colormap(cmap_name): + cmap = plt.get_cmap(cmap_name) + + # Initialize the matplotlib color map + sm = plt.cm.ScalarMappable(cmap=cmap) + + # Obtain linear color range + color_range = sm.to_rgba(np.linspace(0, 1, 256), bytes=True)[:, 2::-1] + + return color_range.reshape(256, 3).astype(np.float32) / 255.0 + + +def prSuccess(text): + print(text, tag = "ok", tag_color = "green", color = "white") + +def prInfo(text): + print(text, tag = "info", tag_color = "cyan", color = "white") + +def prTimer(text, tic, tac): + print("{} {:.0f} ms".format(text, (tac-tic)*1000), tag = "timer", tag_color = "purple", color = "white") + +def prInfoBold(text): + print(text, tag = "info", tag_color = "cyan", color = "white", format = "bold") + +def prDebug(text): + print(text, tag = "debug", tag_color = "red", background = "white", color = "white") + +def prWarning(text): + print(text, tag = "warning", tag_color = "yellow", color = "white") + +def prError(text): + print(text, tag = "error", tag_color = "red", color = "white") + + +def draw_bbox_with_corners(image, bbox, color=(0, 255, 0), thickness=2, proportion=0.2): + x_min, y_min, x_max, y_max = map(int, bbox[:4]) + + width = x_max - x_min + height = y_max - y_min + + corner_length = int(proportion * min(width, height)) + + # Draw the rectangle + cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color, 1) + + # Top-left corner + cv2.line(image, (x_min, y_min), (x_min + corner_length, y_min), color, thickness) + cv2.line(image, (x_min, y_min), (x_min, y_min + corner_length), color, thickness) + + # Top-right corner + cv2.line(image, (x_max, y_min), (x_max - corner_length, y_min), color, thickness) + cv2.line(image, (x_max, y_min), (x_max, y_min + corner_length), color, thickness) + + # Bottom-left corner + cv2.line(image, (x_min, y_max), (x_min, y_max - corner_length), color, thickness) + cv2.line(image, (x_min, y_max), (x_min + corner_length, y_max), color, thickness) + + # Bottom-right corner + cv2.line(image, (x_max, y_max), (x_max, y_max - corner_length), color, thickness) + cv2.line(image, (x_max, y_max), (x_max - corner_length, y_max), color, thickness) + +YOLO_COCO_80_CLASSES = [ +"person", +"bicycle", +"car", +"motorbike", +"aeroplane", +"bus", +"train", +"truck", +"boat", +"traffic light", +"fire hydrant", +"stop sign", +"parking meter", +"bench", +"bird", +"cat", +"dog", +"horse", +"sheep", +"cow", +"elephant", +"bear", +"zebra", +"giraffe", +"backpack", +"umbrella", +"handbag", +"tie", +"suitcase", +"frisbee", +"skis", +"snowboard", +"sports ball", +"kite", +"baseball bat", +"baseball glove", +"skateboard", +"surfboard", +"tennis racket", +"bottle", +"wine glass", +"cup", +"fork", +"knife", +"spoon", +"bowl", +"banana", +"apple", +"sandwich", +"orange", +"broccoli", +"carrot", +"hot dog", +"pizza", +"donut", +"cake", +"chair", +"sofa", +"pottedplant", +"bed", +"diningtable", +"toilet", +"tvmonitor", +"laptop", +"mouse", +"remote", +"keyboard", +"cell phone", +"microwave", +"oven", +"toaster", +"sink", +"refrigerator", +"book", +"clock", +"vase", +"scissors", +"teddy bear", +"hair drier", +"toothbrush"] + +COCO17_JOINTS_LIMBS = [[0,1], [0,2], [1,2], [1,3], [4,2], [3,5], [4,6], 
[5,7],[6,8],[7,9],[8,10], [5,6],[11,12], [5,11],[6,12],[11,13],[12,14],[13,15],[14,16]] + +RANDOM_COLORS = np.array([ + [205, 150, 194], + [ 17, 155, 211], + [162, 121, 186], + [194, 242, 27], + [248, 79, 81], + [134, 159, 164], + [163, 7, 30], + [ 93, 9, 121], + [ 95, 54, 131], + [ 77, 23, 22], + [ 43, 17, 191], + [ 34, 198, 162], + [ 53, 5, 221], + [ 37, 74, 55], + [ 88, 204, 179], + [200, 84, 192], + [ 71, 75, 96], + [ 5, 250, 149], + [ 9, 216, 221], + [ 54, 115, 69], + [109, 92, 97], + [186, 191, 222], + [ 14, 41, 194], + [ 75, 246, 175], + [135, 112, 74], + [ 18, 185, 33], + [236, 129, 68], + [ 58, 226, 186], + [ 56, 63, 90], + [231, 40, 251], + [222, 112, 249], + [ 77, 37, 189], + [137, 94, 131], + [170, 233, 53], + [235, 29, 21], + [ 66, 96, 46], + [ 62, 29, 142], + [ 12, 193, 90], + [224, 151, 242], + [132, 221, 176], + [ 94, 75, 130], + [157, 220, 166], + [156, 47, 225], + [ 76, 176, 108], + [186, 189, 33], + [139, 223, 78], + [ 98, 169, 49], + [ 39, 154, 71], + [ 49, 191, 100], + [128, 170, 25], + [ 90, 127, 185], + [180, 213, 170], + [ 53, 153, 220], + [109, 211, 12], + [ 72, 125, 73], + [126, 220, 193], + [238, 38, 220], + [ 77, 76, 46], + [254, 186, 161], + [126, 226, 187], + [190, 142, 14], + [132, 146, 254], + [ 34, 39, 219], + [ 78, 114, 127], + [248, 145, 165], + [145, 64, 10], + [237, 84, 14], + [ 18, 245, 229], + [246, 40, 125], + [187, 210, 10], + [128, 197, 159], + [152, 179, 221], + [ 18, 159, 88], + [ 17, 205, 133], + [243, 111, 152], + [ 86, 60, 202], + [178, 71, 105], + [ 49, 141, 244], + [238, 169, 59], + [ 91, 190, 81], + [194, 113, 124], + [209, 214, 138], + [ 61, 251, 148], + [113, 75, 124], + [182, 147, 1], + [ 86, 119, 160], + [ 12, 253, 136], + [149, 38, 41], + [183, 161, 19], + [153, 4, 68], + [195, 147, 156], + [165, 30, 189], + [ 82, 55, 244], + [ 33, 25, 248], + [ 71, 193, 228], + [244, 37, 174], + [203, 6, 202], + [118, 209, 136], + [248, 144, 49], + [ 8, 145, 128], + [164, 24, 0], + [ 97, 196, 92], + [243, 146, 179], + [ 77, 144, 104], + [134, 63, 50], + [108, 155, 104], + [200, 124, 251], + [ 70, 35, 156], + [115, 57, 148], + [249, 236, 2], + [119, 245, 43], + [ 49, 101, 88], + [ 27, 188, 88], + [225, 20, 89], + [ 94, 249, 118], + [ 1, 150, 65], + [161, 77, 221], + [144, 227, 134], + [ 28, 231, 69], + [165, 141, 223], + [134, 124, 162], + [151, 18, 210], + [ 15, 39, 228], + [ 88, 192, 62], + [179, 36, 209], + [ 99, 11, 191], + [145, 76, 117], + [183, 212, 247], + [ 10, 52, 119], + [154, 218, 200], + [194, 227, 179], + [ 9, 73, 9], + [ 66, 19, 65], + [ 62, 201, 224], + [ 18, 100, 101], + [ 4, 29, 246], + [ 94, 47, 167], + [ 57, 85, 162], + [196, 245, 113], + [234, 87, 229], + [ 30, 199, 34], + [ 41, 216, 200], + [ 93, 155, 214], + [236, 132, 87], + [193, 191, 13], + [222, 140, 102], + [ 50, 194, 63], + [244, 103, 90], + [ 63, 234, 10], + [ 45, 138, 147], + [107, 11, 164], + [ 93, 196, 79], + [ 85, 20, 227], + [ 2, 74, 5], + [155, 243, 68], + [133, 102, 92], + [ 85, 27, 104], + [ 73, 69, 71], + [176, 159, 175], + [124, 113, 197], + [102, 221, 40], + [167, 164, 166], + [214, 8, 43], + [183, 139, 224], + [130, 21, 83], + [172, 11, 186], + [199, 183, 201], + [180, 166, 98], + [ 28, 22, 177], + [ 4, 227, 64], + [131, 2, 95], + [ 2, 164, 73], + [ 89, 247, 7], + [235, 93, 169], + [ 51, 230, 61], + [144, 144, 234], + [157, 22, 89], + [ 0, 48, 113], + [207, 63, 161], + [200, 3, 166], + [ 25, 92, 209], + [243, 201, 247], + [117, 78, 126], + [229, 99, 105], + [ 52, 184, 198], + [ 29, 127, 174], + [251, 113, 46], + [220, 148, 28], + [ 18, 228, 18], + [216, 178, 
17], + [ 78, 54, 148], + [223, 253, 150], + [105, 69, 50], + [229, 162, 35], + [140, 47, 200], + [103, 195, 216], + [169, 23, 47], + [ 73, 208, 20], + [ 53, 184, 113], + [225, 211, 40], + [135, 163, 142], + [243, 236, 67], + [ 14, 20, 61], + [ 11, 27, 107], + [ 24, 145, 99], + [155, 150, 243], + [254, 153, 114], + [ 91, 182, 222], + [ 71, 216, 39], + [ 9, 55, 216], + [144, 1, 144], + [163, 166, 208], + [149, 53, 64], + [230, 45, 52], + [171, 157, 2], + [191, 43, 172], + [180, 84, 131], + [ 8, 40, 88], + [155, 63, 149], + [196, 150, 149], + [123, 219, 46], + [ 9, 63, 186], + [ 19, 54, 155], + [ 25, 43, 88], + [140, 174, 131], + [ 23, 158, 90], + [152, 141, 207], + [ 28, 160, 67], + [ 17, 54, 220], + [ 12, 186, 7], + [129, 17, 94], + [221, 84, 128], + [142, 172, 202], + [161, 214, 106], + [ 75, 208, 229], + [140, 39, 192], + [183, 116, 110], + [ 73, 104, 186], + [152, 191, 227], + [254, 1, 97], + [193, 189, 73], + [187, 108, 152], + [ 86, 224, 29], + [212, 192, 223], + [130, 109, 55], + [149, 130, 121], + [ 70, 125, 16], + [203, 54, 194], + [ 23, 91, 249], + [ 43, 73, 5], + [ 5, 165, 112], + [189, 148, 214], + [170, 56, 203], + [ 69, 45, 90], + [ 27, 169, 222], + [187, 80, 33] +]) + + + +def crop_scale(motion, scale_range=[1, 1]): + ''' + For input of MotionBERT + Motion: [(M), T, 17, 3]. + Normalize to [-1, 1] + ''' + result = copy.deepcopy(motion) + valid_coords = motion[motion[..., 2]!=0][:,:2] + if len(valid_coords) < 4: + return np.zeros(motion.shape) + xmin = min(valid_coords[:,0]) + xmax = max(valid_coords[:,0]) + ymin = min(valid_coords[:,1]) + ymax = max(valid_coords[:,1]) + ratio = np.random.uniform(low=scale_range[0], high=scale_range[1], size=1)[0] + scale = max(xmax-xmin, ymax-ymin) * ratio + if scale==0: + return np.zeros(motion.shape) + xs = (xmin+xmax-scale) / 2 + ys = (ymin+ymax-scale) / 2 + result[...,:2] = (motion[..., :2]- [xs,ys]) / scale + result[...,:2] = (result[..., :2] - 0.5) * 2 + result = np.clip(result, -1, 1) + return result + + +def coco2h36m(x): + ''' + Input: x ((M )x T x V x C) + + COCO: {0-nose 1-Leye 2-Reye 3-Lear 4Rear 5-Lsho 6-Rsho 7-Lelb 8-Relb 9-Lwri 10-Rwri 11-Lhip 12-Rhip 13-Lkne 14-Rkne 15-Lank 16-Rank} + + H36M: + 0: 'root', + 1: 'rhip', + 2: 'rkne', + 3: 'rank', + 4: 'lhip', + 5: 'lkne', + 6: 'lank', + 7: 'belly', + 8: 'neck', + 9: 'nose', + 10: 'head', + 11: 'lsho', + 12: 'lelb', + 13: 'lwri', + 14: 'rsho', + 15: 'relb', + 16: 'rwri' + ''' + y = np.zeros(x.shape) + y[:,0,:] = (x[:,11,:] + x[:,12,:]) * 0.5 + y[:,1,:] = x[:,12,:] + y[:,2,:] = x[:,14,:] + y[:,3,:] = x[:,16,:] + y[:,4,:] = x[:,11,:] + y[:,5,:] = x[:,13,:] + y[:,6,:] = x[:,15,:] + y[:,8,:] = (x[:,5,:] + x[:,6,:]) * 0.5 + y[:,7,:] = (y[:,0,:] + y[:,8,:]) * 0.5 + y[:,9,:] = x[:,0,:] + y[:,10,:] = (x[:,1,:] + x[:,2,:]) * 0.5 + y[:,11,:] = x[:,5,:] + y[:,12,:] = x[:,7,:] + y[:,13,:] = x[:,9,:] + y[:,14,:] = x[:,6,:] + y[:,15,:] = x[:,8,:] + y[:,16,:] = x[:,10,:] + return y + + +def timeit(func): + @wraps(func) + def wrapper_function(*args, **kwargs): + tic = time.time() + res = func(*args, **kwargs) + tac = time.time() + print("{} {:.0f} ms".format(func.__name__, (tac-tic)*1000), tag = "timer", tag_color = "purple", color = "white") + return res + return wrapper_function diff --git a/visualizer.py b/visualizer.py new file mode 100755 index 0000000..4794746 --- /dev/null +++ b/visualizer.py @@ -0,0 +1,279 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +import rospy +from sensor_msgs.msg import Image, PointCloud2 +from geometry_msgs.msg import Pose, Point +from cv_bridge import 
CvBridge +import cv2 +import os +import numpy as np +import matplotlib.pyplot as plt +from argparse import ArgumentParser +from datetime import datetime +import time +import json +from utils import * + +# remove numpy scientific notation +np.set_printoptions(suppress=True) + +class VisualizerNode(object): + def __init__(self, args): + + self.args = args + + self.rgb = None # Image frame + self.depth = None # Image frame + + self.pcl_array_rgb = None + self.pcl_array_xyz = None + + self.depth_cmap = get_mpl_colormap(args.depth_cmap) + self.depth_array_max_threshold = 3000 + + self.pcl_current_seq = -1 + self.rgb_current_seq = -1 + self.depth_current_seq = -1 + self.current_image_count = 0 + + self.br = CvBridge() + + prInfo("Setting node rate to {} fps".format(args.fps)) + self.loop_rate = rospy.Rate(args.fps) + + # make the output path + now = datetime.now() + timestamp = now.strftime("%Y_%m_%d_%H_%M_%S") + self.save_dir = os.path.join("output", "record_{:s}".format(timestamp)) + self.metadata = os.path.join(self.save_dir, "metadata.json") + self.save_dir_rgb = os.path.join(self.save_dir, "rgb") + self.save_dir_depth = os.path.join(self.save_dir, "depth") + self.save_dir_depth_color = os.path.join(self.save_dir, "depth_color") + self.save_dir_pcl_bin = os.path.join(self.save_dir, "pcl") + + if args.save or args.light_save: + prInfo("Saving to {}/[rgb][depth][depth_color]".format(self.save_dir)) + if not os.path.exists(self.save_dir): + prInfo("Creating directories to {}/[rgb][depth][depth_color]".format(self.save_dir)) + os.makedirs(self.save_dir) + os.makedirs(self.save_dir_rgb) + if not args.no_pcl: + os.makedirs(self.save_dir_pcl_bin) + + if not args.no_depth and args.save: + os.makedirs(self.save_dir_depth) + os.makedirs(self.save_dir_depth_color) + + args_dic = vars(args) + with open(self.metadata, 'w') as fp: + json.dump(args_dic, fp) + + prSuccess("Created directories to {}/[rgb][depth][depth_color][pcl]".format(self.save_dir)) + time.sleep(1) + + + # Subscribers + prInfo("Subscribing to {} for RGB".format(args.rgb_topic)) + self.rgb_sub = rospy.Subscriber(args.rgb_topic, Image, self.callback_rgb) + + if args.no_pcl: + prWarning("No PCL subscriber because option --no_pcl is enabled") + else: + prInfo("Subscribing to {} for PCL".format(args.pcl_topic)) + self.pcl_sub = rospy.Subscriber(args.pcl_topic, PointCloud2, self.callback_pcl) + + if args.no_depth: + prWarning("No depth subscriber because option --no_depth is enabled") + else: + prInfo("Subscribing to {} for depth".format(args.depth_topic)) + self.depth_sub = rospy.Subscriber(args.depth_topic,Image, self.callback_depth) + + def callback_pcl(self, msg): + pcl_array = np.frombuffer(msg.data, dtype=np.float32).reshape((msg.height, msg.width, -1)) + self.pcl_array_xyz = pcl_array[:,:,:3] + self.pcl_array_rgb = pcl_array[:,:,3:] + self.pcl_current_seq = msg.header.seq + rospy.loginfo('pcl received ({})...'.format(msg.header.seq)) + + def callback_rgb(self, msg): + self.rgb = self.br.imgmsg_to_cv2(msg, "bgr8") + self.rgb_current_seq = msg.header.seq + rospy.loginfo('RGB received ({})...'.format(msg.header.seq)) + + def callback_depth(self, msg): + self.depth = self.br.imgmsg_to_cv2(msg, "mono16") + self.depth_current_seq = msg.header.seq + rospy.loginfo('Depth received ({})...'.format(msg.header.seq)) + + def is_ready(self): + ready = (self.rgb is not None) and (self.args.no_depth or self.depth is not None) and (self.args.no_pcl or self.pcl_array_xyz is not None) + return ready + + def start(self): + + if self.args.light_save: + # 
create dict for saving afterwards and avoid losing time + saving_pcl = {} + saving_rgb = {} + + while not rospy.is_shutdown(): + + if self.is_ready(): + + image_count = self.current_image_count + image_seq_unique = self.rgb_current_seq + now = datetime.now() + timestamp = now.strftime("%Y_%m_%d_%H_%M_%S_%f") + + if self.args.save or self.args.light_save: + rgb_path = os.path.join(self.save_dir_rgb, "{:08d}_seq_{:010d}_ts_{}.png".format(image_count, image_seq_unique, timestamp)) + if self.args.save: + cv2.imwrite(rgb_path, self.rgb) + prSuccess("Saved RGB to {}".format(rgb_path)) + else: + saving_rgb[rgb_path] = self.rgb + + rgb_array = np.asarray(self.rgb) + + if not self.args.no_show: + full_display_height = rgb_array.shape[0] if self.args.no_depth else rgb_array.shape[0] * 2 + full_display_width = rgb_array.shape[1] if self.args.no_pcl else rgb_array.shape[1] * 2 + full_display_array = np.zeros((full_display_height, full_display_width, 3), dtype = np.uint8) + + full_display_array[:rgb_array.shape[0], :rgb_array.shape[1] ,:] = rgb_array + + if self.args.no_depth: + depth_array = None + else: + + if self.args.save: + depth_path = os.path.join(self.save_dir_depth, "{:08d}_seq_{:010d}_ts_{}.png".format(image_count, image_seq_unique, timestamp)) + cv2.imwrite(depth_path, self.depth) + prSuccess("Saved depth to {}".format(depth_path)) + + depth_array = np.asarray(self.depth) + depth_array[depth_array > self.depth_array_max_threshold] = self.depth_array_max_threshold + + depth_array_disp = depth_array.copy() + depth_array_disp[depth_array_disp > 3000] = 3000 + depth_array_norm = ((depth_array_disp - depth_array_disp.min())) / (depth_array_disp.max() - depth_array_disp.min()) + # depth_array_norm = ((depth_array - depth_array.min())) / (depth_array.max() - depth_array.min()) + depth_array_norm = depth_array_norm * 255 + depth_array_norm = depth_array_norm.astype(np.uint8) + depth_array_norm_colored = (self.depth_cmap[depth_array_norm] * 255).astype(np.uint8) + + if self.args.save: + depth_color_path = os.path.join(self.save_dir_depth_color, "{:08d}_seq_{:010d}_ts_{}.png".format(image_count, image_seq_unique, timestamp)) + cv2.imwrite(depth_color_path, depth_array_norm_colored) + prSuccess("Saved depth color (scaled) to {}".format(depth_color_path)) + + if not self.args.no_show: + full_display_array[rgb_array.shape[0]:, :rgb_array.shape[1] ,:] = depth_array_norm_colored + + if self.args.no_pcl: + pcl_rgb_norm = None + pcl_xyz_norm = None + else: + if self.args.save or self.args.light_save: + pcl_path = os.path.join(self.save_dir_pcl_bin, "{:08d}_seq_{:010d}_ts_{}.bin".format(image_count, image_seq_unique, timestamp)) + + if self.args.save: + self.pcl_array_xyz.tofile(pcl_path) + prSuccess("Saved pcl to {}".format(pcl_path)) + elif self.args.light_save: + saving_pcl[pcl_path] = self.pcl_array_xyz + + if not self.args.no_show: + pcl_rgb_color = (self.pcl_array_rgb * 255).astype(np.uint8) + max_dist = 3.0 # 3m in any dimension + min_dist = -3.0 # 3m in any dimension + pcl_xyz_crop = self.pcl_array_xyz.copy() + pcl_xyz_crop[pcl_xyz_crop > max_dist] = max_dist + pcl_xyz_crop[pcl_xyz_crop < min_dist] = min_dist + pcl_dist_norm = (pcl_xyz_crop - min_dist) / (max_dist - min_dist) + pcl_dist_color = (pcl_dist_norm * 255).astype(np.uint8) + full_display_array[rgb_array.shape[0]:, rgb_array.shape[1]: ,:] = pcl_rgb_color[:,:,::-1] + full_display_array[:rgb_array.shape[0], rgb_array.shape[1]: ,:] = pcl_dist_color + + if not self.args.no_show: + #format(self.rgb_current_seq, self.depth_current_seq, 
self.pcl_current_seq)
+                    cv2.imshow("RGBD window", full_display_array)
+                    cv2.waitKey(3)
+
+                self.current_image_count += 1
+
+                if self.current_image_count > 1000 and self.args.light_save:
+                    prWarning("Collected 1000 images, closing the subscribers and saving everything!")
+                    self.rgb_sub.unregister()
+                    if not self.args.no_pcl:
+                        self.pcl_sub.unregister()
+                    if not self.args.no_depth:
+                        self.depth_sub.unregister()
+                    break
+                elif self.args.light_save:
+                    prInfo("Collected image {} / 1000 before closing".format(self.current_image_count))
+
+            else:
+                rospy.logwarn("Not displaying/saving images because inputs are not initialized yet (rgb, depth or pcl)")
+
+            self.loop_rate.sleep()
+
+        if self.args.light_save:
+
+            prWarning("Please wait while the buffered images and pcl are saved!")
+
+            if not self.args.no_pcl:
+                for key, value in saving_pcl.items():
+                    value.tofile(key)
+                    prSuccess("Saved pcl to {}".format(key))
+
+            for key, value in saving_rgb.items():
+                cv2.imwrite(key, value)
+                prSuccess("Saved rgb to {}".format(key))
+
+if __name__ == '__main__':
+
+    ## Parser with params
+    parser = ArgumentParser()
+    parser.add_argument('--rgb_topic', default="orbbec/rgb", type=str, help='ROS topic for RGB image')
+    parser.add_argument('--depth_topic', default="orbbec/depth", type=str, help='ROS topic for depth image')
+    parser.add_argument('--pcl_topic', default="orbbec/pcl", type=str, help='ROS topic for pcl')
+    parser.add_argument(
+        '--no_depth',
+        action='store_true',
+        default=False,
+        help='Do not use depth subscriber / recorder / visualizer')
+    parser.add_argument(
+        '--no_pcl',
+        action='store_true',
+        default=False,
+        help='Do not use pcl subscriber / recorder / visualizer')
+    parser.add_argument(
+        '--no_show',
+        action='store_true',
+        default=False,
+        help='Do not show the visualization window')
+    parser.add_argument(
+        '--save',
+        action='store_true',
+        default=False,
+        help='Save rgb, depth and pcl data')
+    parser.add_argument(
+        '--light_save',
+        action='store_true',
+        default=False,
+        help='Save only rgb and pcl (buffered and written at shutdown)')
+    parser.add_argument(
+        '--fps',
+        type=int,
+        default=30,
+        help='Node and recording fps')
+    parser.add_argument('--depth_cmap', default="jet", type=str, help='mpl colormap for depth image')
+
+    args = parser.parse_args()
+    prInfo("Loaded with args: {}".format(args))
+
+    rospy.init_node("python_orbbec_vis_save", anonymous=True)
+    my_node = VisualizerNode(args)
+    my_node.start()
+    cv2.destroyAllWindows()
\ No newline at end of file