Commit 6ff0ee1
committed
DarkVision dataset and configs added. ImageNetVid Data streaming code added that uses Zip files only
1 parent dd55ecc commit 6ff0ee1

File tree: 209 files changed, +23376 −726 lines changed
Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
# dataset settings
dataset_type = 'DarkVisionDataset'
data_ann_root = '../dataset_annotations/darkvision/high_illumination_3.2/annotations/'
data_root = '../dataset_annotations/darkvision/high_illumination_3.2/videos/'

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadMultiImagesFromFile'),
    dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True),
    dict(type='SeqResize', img_scale=(640, 640), keep_ratio=False),
    dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
    dict(type='SeqNormalize', **img_norm_cfg),
    dict(type='SeqPad', size_divisor=32),
    dict(
        type='VideoCollect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_instance_ids']),
    dict(type='ConcatVideoReferences'),
    dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
]
test_pipeline = [
    dict(type='LoadMultiImagesFromFile'),
    dict(type='SeqResize', img_scale=(640, 640), keep_ratio=False),
    dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.0),
    dict(type='SeqNormalize', **img_norm_cfg),
    dict(type='SeqPad', size_divisor=32),
    dict(
        type='VideoCollect',
        keys=['img'],
        meta_keys=('num_left_ref_imgs', 'frame_stride')),
    dict(type='ConcatVideoReferences'),
    dict(type='MultiImagesToTensor', ref_prefix='ref'),
    dict(type='ToList')
]


# One-off code to combine all JSONs for the train and test splits,
# respectively. It was executed once to generate the hard-coded path lists
# below and is kept commented out.
# import os
# train_ann_files = [
#     os.path.join(data_ann_root, 'train', f)
#     for f in os.listdir(os.path.join(data_ann_root, 'train'))]
# train_img_files = [
#     os.path.join(data_root, 'train', d, 'tif-low-light')
#     for d in os.listdir(os.path.join(data_root, 'train'))]
# # Now the same thing for test:
# test_ann_files = [
#     os.path.join(data_ann_root, 'test', f)
#     for f in os.listdir(os.path.join(data_ann_root, 'test'))]
# test_img_files = [
#     os.path.join(data_root, 'test', d, 'tif-low-light')
#     for d in os.listdir(os.path.join(data_root, 'test'))]
#
# print(train_ann_files)
# print('\n\n', train_img_files)
# One-off path-list generation ENDS HERE


train_ann_files = [
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0014.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0019.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0003.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0012.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0022.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0025.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0002.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0007.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0021.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0017.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0005.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0004.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0010.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0001.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0016.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0011.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0026.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0023.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0013.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0020.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0018.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0009.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0006.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0015.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0008.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/train/0024.json'
]
# print(train_ann_files)


test_ann_files = [
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/test/0027.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/test/0030.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/test/0031.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/test/0029.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/test/0032.json',
    '../dataset_annotations/darkvision/high_illumination_3.2/annotations/test/0028.json'
]
# Single-video override kept for debugging:
# test_ann_files = ['../dataset_annotations/darkvision/high_illumination_3.2/annotations/test/0032.json']
# test_img_files = ['../dataset_annotations/darkvision/high_illumination_3.2/videos/test/0032/tif-low-light']
train_img_files = [
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0011/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0007/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0010/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0009/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0013/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0015/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0001/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0018/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0017/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0004/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0005/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0002/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0023/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0026/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0021/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0014/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0019/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0006/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0022/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0008/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0024/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0003/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0012/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0025/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0020/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/train/0016/tif-low-light'
]
test_img_files = [
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/test/0028/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/test/0030/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/test/0031/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/test/0032/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/test/0027/tif-low-light',
    '../dataset_annotations/darkvision/high_illumination_3.2/videos/test/0029/tif-low-light'
]

train_ann_files.sort()
train_img_files.sort()
test_ann_files.sort()
test_img_files.sort()
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=1,
    train=dict(
        type='DarkVisionDataset',
        load_as_video=False,
        ann_file=train_ann_files,
        img_prefix=train_img_files,
        ref_img_sampler=dict(
            num_ref_imgs=2,
            frame_range=9,
            filter_key_img=True,
            method='bilateral_uniform'),
        pipeline=train_pipeline),
    val=dict(
        type='DarkVisionDataset',
        load_as_video=False,
        ann_file=test_ann_files,
        img_prefix=test_img_files,
        ref_img_sampler=dict(
            num_ref_imgs=30,
            frame_range=[-15, 15],
            stride=1,
            method='test_with_fix_stride'),
        pipeline=test_pipeline,
        test_mode=True),
    test=dict(
        type='DarkVisionDataset',
        load_as_video=False,
        ann_file=test_ann_files,
        img_prefix=test_img_files,
        ref_img_sampler=dict(
            num_ref_imgs=30,
            frame_range=[-15, 15],
            stride=1,
            method='test_with_fix_stride'),
        pipeline=test_pipeline,
        test_mode=True))
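
The hard-coded lists above simply mirror the on-disk layout (annotations/<split>/NNNN.json next to videos/<split>/NNNN/tif-low-light), so they could also be built at load time. A minimal sketch under that assumed layout; split_files is a hypothetical helper, not part of this commit:

import os

def split_files(ann_root, vid_root, split):
    # Pair each per-video annotation JSON with its low-light frame directory.
    ann_files = sorted(
        os.path.join(ann_root, split, f)
        for f in os.listdir(os.path.join(ann_root, split)))
    img_files = sorted(
        os.path.join(vid_root, split, d, 'tif-low-light')
        for d in os.listdir(os.path.join(vid_root, split)))
    return ann_files, img_files

train_ann_files, train_img_files = split_files(data_ann_root, data_root, 'train')
test_ann_files, test_img_files = split_files(data_ann_root, data_root, 'test')

Sorting both lists keeps annotation i aligned with video directory i, which the config relies on since ann_file and img_prefix are parallel lists.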

configs/_base_/datasets/imagenet_vid_fgfa_style.py

Lines changed: 13 additions & 10 deletions
@@ -1,6 +1,9 @@
 # dataset settings
 dataset_type = 'ImagenetVIDDataset'
-data_root = 'data/ILSVRC/'
+# data_ann_root = '/ds-av/public_datasets/imagenet/pre/ILSVRC2015/COCO-Annotations/'
+data_ann_root = '../dataset_annotations/imagenetVID_2015/'
+data_root = '../dataset_annotations/imagenetVID_2015/Data/'
+# data_root = '/ds-av/public_datasets/imagenet/raw/Data/'
 img_norm_cfg = dict(
     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
 train_pipeline = [
@@ -32,12 +35,12 @@
 ]
 data = dict(
     samples_per_gpu=1,
-    workers_per_gpu=2,
+    workers_per_gpu=1,
     train=[
         dict(
             type=dataset_type,
-            ann_file=data_root + 'annotations/imagenet_vid_train.json',
-            img_prefix=data_root + 'Data/VID',
+            ann_file=data_ann_root + 'imagenet_vid_train.json',
+            img_prefix=data_root + 'VID/',
             ref_img_sampler=dict(
                 num_ref_imgs=2,
                 frame_range=9,
@@ -47,8 +50,8 @@
         dict(
             type=dataset_type,
             load_as_video=False,
-            ann_file=data_root + 'annotations/imagenet_det_30plus1cls.json',
-            img_prefix=data_root + 'Data/DET',
+            ann_file=data_ann_root + 'imagenet_det_30plus1cls.json',
+            img_prefix=data_root + 'DET',
             ref_img_sampler=dict(
                 num_ref_imgs=2,
                 frame_range=0,
@@ -58,8 +61,8 @@
     ],
     val=dict(
         type=dataset_type,
-        ann_file=data_root + 'annotations/imagenet_vid_val.json',
-        img_prefix=data_root + 'Data/VID',
+        ann_file=data_ann_root + 'imagenet_vid_val.json',
+        img_prefix=data_root + 'VID',
         ref_img_sampler=dict(
             num_ref_imgs=30,
             frame_range=[-15, 15],
@@ -69,8 +72,8 @@
         test_mode=True),
     test=dict(
         type=dataset_type,
-        ann_file=data_root + 'annotations/imagenet_vid_val.json',
-        img_prefix=data_root + 'Data/VID',
+        ann_file=data_ann_root + 'imagenet_vid_val.json',
+        img_prefix=data_root + 'VID',
         ref_img_sampler=dict(
             num_ref_imgs=30,
             frame_range=[-15, 15],
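
Since this diff only remaps path roots, a quick existence check catches a wrong data_ann_root or data_root before a long data-loading run fails. A minimal sketch under the remapped layout above; check_paths is a hypothetical helper:

import os

def check_paths(paths):
    # Report any annotation file or image prefix that does not resolve.
    for p in paths:
        if not os.path.exists(p):
            print('missing:', p)

check_paths([
    data_ann_root + 'imagenet_vid_train.json',
    data_ann_root + 'imagenet_det_30plus1cls.json',
    data_ann_root + 'imagenet_vid_val.json',
    data_root + 'VID/',
    data_root + 'DET',
])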
Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
# This config differs from the standard imagenet_vid_fgfa_style config in
# several ways:
# - Zip file paths are given for the video frames instead of data directories.
# - The LoadMultiImagesFromZipFile pipeline step is used, which was created
#   specifically for streaming data in loading.py.
# - dataset_type is ImagenetVIDDatasetStreaming, which was created
#   specifically for streaming data in imagenet_vid_datastream.py.


# dataset settings
dataset_type = 'ImagenetVIDDatasetStreaming'
data_root = '/ds-av/public_datasets/imagenet/original/preprocessed/ILSVRC2015/'


img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadMultiImagesFromZipFile'),
    dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True),
    dict(type='SeqResize', img_scale=(1000, 600), keep_ratio=True),
    dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
    dict(type='SeqNormalize', **img_norm_cfg),
    dict(type='SeqPad', size_divisor=16),
    dict(
        type='VideoCollect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_instance_ids']),
    dict(type='ConcatVideoReferences'),
    dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
]
test_pipeline = [
    dict(type='LoadMultiImagesFromZipFile'),
    dict(type='SeqResize', img_scale=(1000, 600), keep_ratio=True),
    dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.0),
    dict(type='SeqNormalize', **img_norm_cfg),
    dict(type='SeqPad', size_divisor=16),
    dict(
        type='VideoCollect',
        keys=['img'],
        meta_keys=('num_left_ref_imgs', 'frame_stride')),
    dict(type='ConcatVideoReferences'),
    dict(type='MultiImagesToTensor', ref_prefix='ref'),
    dict(type='ToList')
]
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=1,
    train=[
        dict(
            type=dataset_type,
            ann_file=data_root + 'COCO-Annotations/imagenet_vid_train.json',
            img_prefix=data_root + 'VID/train.zip',
            ref_img_sampler=dict(
                num_ref_imgs=2,
                frame_range=9,
                filter_key_img=True,
                method='bilateral_uniform'),
            pipeline=train_pipeline),
        dict(
            type=dataset_type,
            load_as_video=False,
            ann_file=data_root + 'COCO-Annotations/imagenet_det_30plus1cls.json',
            img_prefix=data_root + 'DET/DET_train.zip',
            ref_img_sampler=dict(
                num_ref_imgs=2,
                frame_range=0,
                filter_key_img=False,
                method='bilateral_uniform'),
            pipeline=train_pipeline)
    ],
    val=dict(
        type=dataset_type,
        ann_file=data_root + 'COCO-Annotations/imagenet_vid_val.json',
        img_prefix=data_root + 'VID/val.zip',
        ref_img_sampler=dict(
            num_ref_imgs=30,
            frame_range=[-15, 15],
            stride=1,
            method='test_with_fix_stride'),
        pipeline=test_pipeline,
        test_mode=True),
    test=dict(
        type=dataset_type,
        ann_file=data_root + 'COCO-Annotations/imagenet_vid_val.json',
        img_prefix=data_root + 'VID/val.zip',
        ref_img_sampler=dict(
            num_ref_imgs=30,
            frame_range=[-15, 15],
            stride=1,
            method='test_with_fix_stride'),
        pipeline=test_pipeline,
        test_mode=True))
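
The header comment above notes that LoadMultiImagesFromZipFile streams frames straight out of train.zip / val.zip instead of unpacked directories. Its actual implementation lives in loading.py and is not part of this diff; the sketch below only illustrates the core mechanism (decoding a zip member with mmcv), and every name in it is hypothetical:

import zipfile
import mmcv

class ZipImageReader:
    """Decode frames directly from a zip archive instead of loose files."""

    def __init__(self, zip_path):
        # Keep the archive open; reopening it for every frame would
        # dominate I/O time.
        self._zf = zipfile.ZipFile(zip_path, 'r')

    def read(self, member_name):
        # Read the compressed bytes of one frame and decode to a BGR array.
        img_bytes = self._zf.read(member_name)
        return mmcv.imfrombytes(img_bytes, flag='color')

reader = ZipImageReader(data_root + 'VID/val.zip')
# frame = reader.read('...')  # member path inside the archive

One practical constraint: an open ZipFile handle cannot be shared across forked DataLoader workers, so a real transform has to open the archive lazily inside each worker, which may also be why workers_per_gpu is kept at 1 throughout this commit.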

configs/_base_/default_runtime.py

Lines changed: 2 additions & 0 deletions
@@ -20,3 +20,5 @@
 opencv_num_threads = 0
 # set multi-process start method as `fork` to speed up the training
 mp_start_method = 'fork'
+
+auto_scale_lr = dict(enable=True, base_batch_size=16)
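
auto_scale_lr enables MMDetection's linear learning-rate scaling: the configured lr is multiplied by the ratio of the actual total batch size (num_gpus * samples_per_gpu) to base_batch_size. A worked example with hypothetical numbers:

# With samples_per_gpu=1 as in the configs above, 8 GPUs give a total batch of 8.
base_batch_size = 16
total_batch_size = 8 * 1                      # hypothetical 8-GPU run
scale = total_batch_size / base_batch_size    # 0.5
scaled_lr = 2e-4 * scale                      # a hypothetical base lr of 2e-4 becomes 1e-4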
Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
model = dict(
    detector=dict(
        type='DeformableDETR',
        backbone=dict(
            type='ResNet',
            depth=50,
            num_stages=4,
            out_indices=(1, 2, 3),
            frozen_stages=1,
            norm_cfg=dict(type='BN', requires_grad=False),
            norm_eval=True,
            style='pytorch',
            init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
        neck=dict(
            type='ChannelMapper',
            in_channels=[512, 1024, 2048],
            kernel_size=1,
            out_channels=256,
            act_cfg=None,
            norm_cfg=dict(type='GN', num_groups=32),
            num_outs=4),
        bbox_head=dict(
            type='DeformableDETRHead',
            num_query=300,
            num_classes=30,
            in_channels=2048,
            sync_cls_avg_factor=True,
            as_two_stage=False,
            transformer=dict(
                type='DeformableDetrTransformer',
                encoder=dict(
                    type='DetrTransformerEncoder',
                    num_layers=6,
                    transformerlayers=dict(
                        type='BaseTransformerLayer',
                        attn_cfgs=dict(
                            type='MultiScaleDeformableAttention', embed_dims=256),
                        feedforward_channels=1024,
                        ffn_dropout=0.1,
                        operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
                decoder=dict(
                    type='DeformableDetrTransformerDecoder',
                    num_layers=6,
                    return_intermediate=True,
                    transformerlayers=dict(
                        type='DetrTransformerDecoderLayer',
                        attn_cfgs=[
                            dict(
                                type='MultiheadAttention',
                                embed_dims=256,
                                num_heads=8,
                                dropout=0.1),
                            dict(
                                type='MultiScaleDeformableAttention',
                                embed_dims=256)
                        ],
                        feedforward_channels=1024,
                        ffn_dropout=0.1,
                        operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                         'ffn', 'norm')))),
            positional_encoding=dict(
                type='SinePositionalEncoding',
                num_feats=128,
                normalize=True,
                offset=-0.5),
            loss_cls=dict(
                type='FocalLoss',
                use_sigmoid=True,
                gamma=2.0,
                alpha=0.25,
                loss_weight=2.0),
            loss_bbox=dict(type='L1Loss', loss_weight=5.0),
            loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
        # training and testing settings
        train_cfg=dict(
            assigner=dict(
                type='HungarianAssigner',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
                iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
        test_cfg=dict(max_per_img=100)))
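
As a quick check that this model config composes, the nested detector dict can be built on its own through MMDetection's registry. A minimal sketch; the config path below is hypothetical:

from mmcv import Config
from mmdet.models import build_detector

# Load the config and instantiate only the detector sub-config.
cfg = Config.fromfile('configs/_base_/models/deformable_detr.py')  # hypothetical path
detector = build_detector(cfg.model.detector)
print(type(detector).__name__)  # expected: DeformableDETR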
