diff --git a/Makefile b/Makefile index f664a727489..2c5fdb6697d 100644 --- a/Makefile +++ b/Makefile @@ -85,7 +85,7 @@ LDFLAGS+= -L/usr/local/cudnn/lib64 -lcudnn endif endif -OBJ=http_stream.o gemm.o utils.o cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o reorg_old_layer.o super.o voxel.o tree.o +OBJ=http_stream.o gemm.o utils.o cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o reorg_old_layer.o super.o voxel.o tree.o yolo_layer.o upsample_layer.o ifeq ($(GPU), 1) LDFLAGS+= -lstdc++ OBJ+=convolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o network_kernels.o avgpool_layer_kernels.o diff --git a/build/darknet/darknet.vcxproj b/build/darknet/darknet.vcxproj index e8896edaa82..de0293ec82c 100644 --- a/build/darknet/darknet.vcxproj +++ b/build/darknet/darknet.vcxproj @@ -227,10 +227,12 @@ + + 
@@ -279,7 +281,9 @@ + + diff --git a/build/darknet/darknet_no_gpu.vcxproj b/build/darknet/darknet_no_gpu.vcxproj index e1c4c37f65a..9ce9b36a0e3 100644 --- a/build/darknet/darknet_no_gpu.vcxproj +++ b/build/darknet/darknet_no_gpu.vcxproj @@ -224,10 +224,12 @@ + + @@ -276,7 +278,9 @@ + + diff --git a/build/darknet/x64/cfg/yolov3.cfg b/build/darknet/x64/cfg/yolov3.cfg new file mode 100644 index 00000000000..5f3ab621302 --- /dev/null +++ b/build/darknet/x64/cfg/yolov3.cfg @@ -0,0 +1,789 @@ +[net] +# Testing +batch=1 +subdivisions=1 +# Training +# batch=64 +# subdivisions=16 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 
+activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] 
+batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 
+size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +###################### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 6,7,8 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 61 + + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + 
+[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 3,4,5 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 36 + + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 0,1,2 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + diff --git a/build/darknet/x64/darknet_demo_mjpeg_stream.cmd b/build/darknet/x64/darknet_demo_mjpeg_stream.cmd index 044b5b782a2..43148da97bc 100644 --- a/build/darknet/x64/darknet_demo_mjpeg_stream.cmd +++ b/build/darknet/x64/darknet_demo_mjpeg_stream.cmd @@ -1,7 +1,7 @@ rem Run this file and then open URL in Chrome/Firefox: rem http://localhost:8090 rem Or 
open: http://ip-address:8090 -darknet.exe detector demo data/voc.data yolo-voc.cfg yolo-voc.weights test.mp4 -i 0 -http_port 8090 +darknet.exe detector demo data/voc.data yolo-voc.cfg yolo-voc.weights test.mp4 -i 0 -http_port 8090 -dont_show pause \ No newline at end of file diff --git a/build/darknet/x64/darknet_yolo_v3.cmd b/build/darknet/x64/darknet_yolo_v3.cmd new file mode 100644 index 00000000000..a2fa22005d7 --- /dev/null +++ b/build/darknet/x64/darknet_yolo_v3.cmd @@ -0,0 +1,5 @@ + +darknet.exe detector test data/coco.data yolov3.cfg yolov3.weights -i 0 -thresh 0.25 dogr.jpg + + +pause \ No newline at end of file diff --git a/build/darknet/x64/yolov3.cfg b/build/darknet/x64/yolov3.cfg new file mode 100644 index 00000000000..5f3ab621302 --- /dev/null +++ b/build/darknet/x64/yolov3.cfg @@ -0,0 +1,789 @@ +[net] +# Testing +batch=1 +subdivisions=1 +# Training +# batch=64 +# subdivisions=16 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + 
+[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 
+filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 
+activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +###################### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 6,7,8 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + +[route] +layers = -4 + +[convolutional] 
+batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 61 + + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 3,4,5 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 36 + + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 0,1,2 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + diff --git 
a/build/darknet/yolo_cpp_dll.vcxproj b/build/darknet/yolo_cpp_dll.vcxproj index b68c5b4c3c3..c33be37f244 100644 --- a/build/darknet/yolo_cpp_dll.vcxproj +++ b/build/darknet/yolo_cpp_dll.vcxproj @@ -229,10 +229,12 @@ + + @@ -283,7 +285,9 @@ + + diff --git a/build/darknet/yolo_cpp_dll_no_gpu.vcxproj b/build/darknet/yolo_cpp_dll_no_gpu.vcxproj index c1d1d30e9e1..56753c45480 100644 --- a/build/darknet/yolo_cpp_dll_no_gpu.vcxproj +++ b/build/darknet/yolo_cpp_dll_no_gpu.vcxproj @@ -213,10 +213,12 @@ + + @@ -266,7 +268,9 @@ + + diff --git a/cfg/yolov3.cfg b/cfg/yolov3.cfg new file mode 100644 index 00000000000..5f3ab621302 --- /dev/null +++ b/cfg/yolov3.cfg @@ -0,0 +1,789 @@ +[net] +# Testing +batch=1 +subdivisions=1 +# Training +# batch=64 +# subdivisions=16 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample 
+ +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 
+filters=512 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 
+activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +###################### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 6,7,8 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 61 + + + +[convolutional] 
+batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 3,4,5 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 36 + + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 0,1,2 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + diff --git a/image_yolov3.sh b/image_yolov3.sh new file mode 100644 index 00000000000..49cc5eb66db --- /dev/null +++ b/image_yolov3.sh @@ -0,0 +1,6 @@ + + +./darknet detector test 
./cfg/coco.data ./cfg/yolov3.cfg ./yolov3.weights data/dog.jpg -i 0 -thresh 0.25 + + + diff --git a/src/blas.c b/src/blas.c index 6d565e96737..4ff0b8343ab 100644 --- a/src/blas.c +++ b/src/blas.c @@ -291,3 +291,19 @@ void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, i } } +void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out) +{ + int i, j, k, b; + for (b = 0; b < batch; ++b) { + for (k = 0; k < c; ++k) { + for (j = 0; j < h*stride; ++j) { + for (i = 0; i < w*stride; ++i) { + int in_index = b*w*h*c + k*w*h + (j / stride)*w + i / stride; + int out_index = b*w*h*c*stride*stride + k*w*h*stride*stride + j*w*stride + i; + if (forward) out[out_index] = scale*in[in_index]; + else in[in_index] += scale*out[out_index]; + } + } + } + } +} \ No newline at end of file diff --git a/src/blas.h b/src/blas.h index e1bfbf01e8d..c40422ac46e 100644 --- a/src/blas.h +++ b/src/blas.h @@ -36,6 +36,7 @@ void l2_cpu(int n, float *pred, float *truth, float *delta, float *error); void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c); void softmax(float *input, int n, float temp, float *output, int stride); +void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out); #ifdef GPU #include "cuda.h" @@ -84,5 +85,7 @@ void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out); +void upsample_gpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out); + #endif #endif diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu index 97b59779298..1edbbbde950 100644 --- a/src/blas_kernels.cu +++ b/src/blas_kernels.cu @@ -784,3 +784,34 @@ extern "C" void softmax_gpu(float *input, int n, int offset, int groups, float t check_error(cudaPeekAtLastError()); } + +__global__ void upsample_kernel(size_t N, float 
*x, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
+{
+ size_t i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+ if (i >= N) return;
+ int out_index = i;
+ int out_w = i % (w*stride);
+ i = i / (w*stride);
+ int out_h = i % (h*stride);
+ i = i / (h*stride);
+ int out_c = i%c;
+ i = i / c;
+ int b = i%batch;
+
+ int in_w = out_w / stride;
+ int in_h = out_h / stride;
+ int in_c = out_c;
+
+ int in_index = b*w*h*c + in_c*w*h + in_h*w + in_w;
+
+
+ if (forward) out[out_index] += scale * x[in_index];
+ else atomicAdd(x + in_index, scale * out[out_index]);
+}
+
+extern "C" void upsample_gpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
+{
+ size_t size = w*h*c*batch*stride*stride;
+ upsample_kernel << <cuda_gridsize(size), BLOCK >> >(size, in, w, h, c, batch, stride, forward, scale, out);
+ check_error(cudaPeekAtLastError());
+}
\ No newline at end of file
diff --git a/src/box.c b/src/box.c
index 1dc12c0614f..e02685d68e5 100644
--- a/src/box.c
+++ b/src/box.c
@@ -276,6 +276,92 @@ void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh
 free(s);
 }
 
+int nms_comparator_v3(const void *pa, const void *pb)
+{
+ detection a = *(detection *)pa;
+ detection b = *(detection *)pb;
+ float diff = 0;
+ if (b.sort_class >= 0) {
+ diff = a.prob[b.sort_class] - b.prob[b.sort_class];
+ }
+ else {
+ diff = a.objectness - b.objectness;
+ }
+ if (diff < 0) return 1;
+ else if (diff > 0) return -1;
+ return 0;
+}
+
+void do_nms_obj_v3(detection *dets, int total, int classes, float thresh)
+{
+ int i, j, k;
+ k = total - 1;
+ for (i = 0; i <= k; ++i) {
+ if (dets[i].objectness == 0) {
+ detection swap = dets[i];
+ dets[i] = dets[k];
+ dets[k] = swap;
+ --k;
+ --i;
+ }
+ }
+ total = k + 1;
+
+ for (i = 0; i < total; ++i) {
+ dets[i].sort_class = -1;
+ }
+
+ qsort(dets, total, sizeof(detection), nms_comparator_v3);
+ for (i = 0; i < total; ++i) {
+ if (dets[i].objectness == 0) continue;
+ box a = 
dets[i].bbox; + for (j = i + 1; j < total; ++j) { + if (dets[j].objectness == 0) continue; + box b = dets[j].bbox; + if (box_iou(a, b) > thresh) { + dets[j].objectness = 0; + for (k = 0; k < classes; ++k) { + dets[j].prob[k] = 0; + } + } + } + } +} + +void do_nms_sort_v3(detection *dets, int total, int classes, float thresh) +{ + int i, j, k; + k = total - 1; + for (i = 0; i <= k; ++i) { + if (dets[i].objectness == 0) { + detection swap = dets[i]; + dets[i] = dets[k]; + dets[k] = swap; + --k; + --i; + } + } + total = k + 1; + + for (k = 0; k < classes; ++k) { + for (i = 0; i < total; ++i) { + dets[i].sort_class = k; + } + qsort(dets, total, sizeof(detection), nms_comparator_v3); + for (i = 0; i < total; ++i) { + //printf(" k = %d, \t i = %d \n", k, i); + if (dets[i].prob[k] == 0) continue; + box a = dets[i].bbox; + for (j = i + 1; j < total; ++j) { + box b = dets[j].bbox; + if (box_iou(a, b) > thresh) { + dets[j].prob[k] = 0; + } + } + } + } +} + void do_nms(box *boxes, float **probs, int total, int classes, float thresh) { int i, j, k; diff --git a/src/box.h b/src/box.h index a5f8cee3cb0..c023e20eb08 100644 --- a/src/box.h +++ b/src/box.h @@ -9,12 +9,23 @@ typedef struct{ float dx, dy, dw, dh; } dbox; +typedef struct detection { + box bbox; + int classes; + float *prob; + float *mask; + float objectness; + int sort_class; +} detection; + box float_to_box(float *f); float box_iou(box a, box b); float box_rmse(box a, box b); dbox diou(box a, box b); void do_nms(box *boxes, float **probs, int total, int classes, float thresh); void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh); +void do_nms_sort_v3(detection *dets, int total, int classes, float thresh); +void do_nms_obj_v3(detection *dets, int total, int classes, float thresh); box decode_box(box b, box anchor); box encode_box(box b, box anchor); diff --git a/src/demo.c b/src/demo.c index e0a6ed35269..3b4e92daeb8 100644 --- a/src/demo.c +++ b/src/demo.c @@ -50,6 +50,7 @@ static 
IplImage* ipl_images[FRAMES]; static float *avg; void draw_detections_cv(IplImage* show_img, int num, float thresh, box *boxes, float **probs, char **names, image **alphabet, int classes); +void draw_detections_cv_v3(IplImage* show_img, detection *dets, int num, float thresh, char **names, image **alphabet, int classes); void show_image_cv_ipl(IplImage *disp, const char *name); image get_image_from_stream_resize(CvCapture *cap, int w, int h, IplImage** in_img, int use_webcam); IplImage* in_img; @@ -77,7 +78,7 @@ void *fetch_in_thread(void *ptr) void *detect_in_thread(void *ptr) { - float nms = .4; + float nms = .45; // 0.4F layer l = net.layers[net.n-1]; float *X = det_s.data; @@ -88,6 +89,7 @@ void *detect_in_thread(void *ptr) l.output = avg; free_image(det_s); + /* if(l.type == DETECTION){ get_detection_boxes(l, 1, 1, demo_thresh, probs, boxes, 0); } else if (l.type == REGION){ @@ -96,6 +98,12 @@ void *detect_in_thread(void *ptr) error("Last layer must produce detections\n"); } if (nms > 0) do_nms(boxes, probs, l.w*l.h*l.n, l.classes, nms); + */ + int letter = 0; + int nboxes = 0; + detection *dets = get_network_boxes(&net, det.w, det.h, demo_thresh, demo_thresh, 0, 1, &nboxes, letter); + if (nms) do_nms_obj_v3(dets, nboxes, l.classes, nms); + printf("\033[2J"); printf("\033[1;1H"); printf("\nFPS:%.1f\n",fps); @@ -108,7 +116,9 @@ void *detect_in_thread(void *ptr) demo_index = (demo_index + 1)%FRAMES; //draw_detections(det, l.w*l.h*l.n, demo_thresh, boxes, probs, demo_names, demo_alphabet, demo_classes); - draw_detections_cv(det_img, l.w*l.h*l.n, demo_thresh, boxes, probs, demo_names, demo_alphabet, demo_classes); + draw_detections_cv_v3(det_img, dets, nboxes, demo_thresh, demo_names, demo_alphabet, demo_classes); + //draw_detections_cv(det_img, l.w*l.h*l.n, demo_thresh, boxes, probs, demo_names, demo_alphabet, demo_classes); + free(dets); return 0; } @@ -122,7 +132,7 @@ double get_wall_time() return (double)time.tv_sec + (double)time.tv_usec * .000001; } -void 
demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, +void demo(char *cfgfile, char *weightfile, float thresh, float hier_thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, char *out_filename, int http_stream_port, int dont_show) { //skip = frame_skip; @@ -303,7 +313,7 @@ void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const ch } } #else -void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, char *out_filename, int http_stream_port, int dont_show) +void demo(char *cfgfile, char *weightfile, float thresh, float hier_thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, char *out_filename, int http_stream_port, int dont_show) { fprintf(stderr, "Demo needs OpenCV for webcam images.\n"); } diff --git a/src/detection_layer.c b/src/detection_layer.c index fd5a4198311..0a1c1070af8 100644 --- a/src/detection_layer.c +++ b/src/detection_layer.c @@ -285,3 +285,31 @@ void backward_detection_layer_gpu(detection_layer l, network_state state) } #endif +void get_detection_detections(layer l, int w, int h, float thresh, detection *dets) +{ + int i, j, n; + float *predictions = l.output; + //int per_cell = 5*num+classes; + for (i = 0; i < l.side*l.side; ++i) { + int row = i / l.side; + int col = i % l.side; + for (n = 0; n < l.n; ++n) { + int index = i*l.n + n; + int p_index = l.side*l.side*l.classes + i*l.n + n; + float scale = predictions[p_index]; + int box_index = l.side*l.side*(l.classes + l.n) + (i*l.n + n) * 4; + box b; + b.x = (predictions[box_index + 0] + col) / l.side * w; + b.y = (predictions[box_index + 1] + row) / l.side * h; + b.w = pow(predictions[box_index + 2], (l.sqrt ? 2 : 1)) * w; + b.h = pow(predictions[box_index + 3], (l.sqrt ? 
2 : 1)) * h; + dets[index].bbox = b; + dets[index].objectness = scale; + for (j = 0; j < l.classes; ++j) { + int class_index = i*l.classes; + float prob = scale*predictions[class_index + j]; + dets[index].prob[j] = (prob > thresh) ? prob : 0; + } + } + } +} \ No newline at end of file diff --git a/src/detection_layer.h b/src/detection_layer.h index e847a094ccf..9d2da9282fe 100644 --- a/src/detection_layer.h +++ b/src/detection_layer.h @@ -10,6 +10,7 @@ detection_layer make_detection_layer(int batch, int inputs, int n, int size, int void forward_detection_layer(const detection_layer l, network_state state); void backward_detection_layer(const detection_layer l, network_state state); void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness); +void get_detection_detections(layer l, int w, int h, float thresh, detection *dets); #ifdef GPU void forward_detection_layer_gpu(const detection_layer l, network_state state); diff --git a/src/detector.c b/src/detector.c index 3dfbce677e0..9581e5c8c69 100644 --- a/src/detector.c +++ b/src/detector.c @@ -1000,7 +1000,7 @@ void calc_anchors(char *datacfg, int num_of_clusters, int final_width, int final } #endif // OPENCV -void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, int dont_show) +void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh, int dont_show) { list *options = read_data_cfg(datacfg); char *name_list = option_find_str(options, "names", "data/names.list"); @@ -1017,7 +1017,7 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam char buff[256]; char *input = buff; int j; - float nms=.4; + float nms=.45; // 0.4F while(1){ if(filename){ strncpy(input, filename, 256); @@ -1030,21 +1030,27 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam strtok(input, "\n"); } image im = load_image_color(input,0,0); - 
image sized = resize_image(im, net.w, net.h); - //image sized = letterbox_image(im, net.w, net.h); + int letter = 0; + //image sized = resize_image(im, net.w, net.h); + image sized = letterbox_image(im, net.w, net.h); letter = 1; layer l = net.layers[net.n-1]; - box *boxes = calloc(l.w*l.h*l.n, sizeof(box)); - float **probs = calloc(l.w*l.h*l.n, sizeof(float *)); - for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float *)); + //box *boxes = calloc(l.w*l.h*l.n, sizeof(box)); + //float **probs = calloc(l.w*l.h*l.n, sizeof(float *)); + //for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float *)); float *X = sized.data; time=clock(); network_predict(net, X); printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); - get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, 0); - if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, l.classes, nms); - draw_detections(im, l.w*l.h*l.n, thresh, boxes, probs, names, alphabet, l.classes); + //get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, 0); + // if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, l.classes, nms); + //draw_detections(im, l.w*l.h*l.n, thresh, boxes, probs, names, alphabet, l.classes); + int nboxes = 0; + detection *dets = get_network_boxes(&net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes, letter); + if (nms) do_nms_sort_v3(dets, nboxes, l.classes, nms); + draw_detections_v3(im, dets, nboxes, thresh, names, alphabet, l.classes); + free_detections(dets, nboxes); save_image(im, "predictions"); if (!dont_show) { show_image(im, "predictions"); @@ -1052,8 +1058,8 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam free_image(im); free_image(sized); - free(boxes); - free_ptrs((void **)probs, l.w*l.h*l.n); + //free(boxes); + //free_ptrs((void **)probs, l.w*l.h*l.n); #ifdef OPENCV if (!dont_show) { cvWaitKey(0); @@ -1071,7 +1077,8 @@ void run_detector(int argc, char **argv) int http_stream_port = find_int_arg(argc, argv, "-http_port", -1); char 
*out_filename = find_char_arg(argc, argv, "-out_filename", 0); char *prefix = find_char_arg(argc, argv, "-prefix", 0); - float thresh = find_float_arg(argc, argv, "-thresh", .24); + float thresh = find_float_arg(argc, argv, "-thresh", .25); // 0.24 + float hier_thresh = find_float_arg(argc, argv, "-hier", .5); int cam_index = find_int_arg(argc, argv, "-c", 0); int frame_skip = find_int_arg(argc, argv, "-s", 0); int num_of_clusters = find_int_arg(argc, argv, "-num_of_clusters", 5); @@ -1112,7 +1119,7 @@ void run_detector(int argc, char **argv) if(weights) if (weights[strlen(weights) - 1] == 0x0d) weights[strlen(weights) - 1] = 0; char *filename = (argc > 6) ? argv[6]: 0; - if(0==strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, dont_show); + if(0==strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, hier_thresh, dont_show); else if(0==strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear, dont_show); else if(0==strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights); else if(0==strcmp(argv[2], "recall")) validate_detector_recall(datacfg, cfg, weights); @@ -1125,7 +1132,7 @@ void run_detector(int argc, char **argv) char **names = get_labels(name_list); if(filename) if (filename[strlen(filename) - 1] == 0x0d) filename[strlen(filename) - 1] = 0; - demo(cfg, weights, thresh, cam_index, filename, names, classes, frame_skip, prefix, out_filename, + demo(cfg, weights, thresh, hier_thresh, cam_index, filename, names, classes, frame_skip, prefix, out_filename, http_stream_port, dont_show); } } diff --git a/src/image.c b/src/image.c index 84919ebc62f..d29f0c1e88f 100644 --- a/src/image.c +++ b/src/image.c @@ -93,6 +93,23 @@ image get_label(image **characters, char *string, int size) return b; } +image get_label_v3(image **characters, char *string, int size) +{ + size = size / 10; + if (size > 7) size = 7; + image label = make_empty_image(0, 0, 0); + while (*string) { + image l = 
characters[size][(int)*string]; + image n = tile_images(label, l, -size - 1 + (size + 1) / 2); + free_image(label); + label = n; + ++string; + } + image b = border_image(label, label.h*.25); + free_image(label); + return b; +} + void draw_label(image a, int r, int c, image label, const float *rgb) { int w = label.w; @@ -183,6 +200,80 @@ image **load_alphabet() return alphabets; } +void draw_detections_v3(image im, detection *dets, int num, float thresh, char **names, image **alphabet, int classes) +{ + int i, j; + + for (i = 0; i < num; ++i) { + char labelstr[4096] = { 0 }; + int class_id = -1; + for (j = 0; j < classes; ++j) { + if (dets[i].prob[j] > thresh) { + if (class_id < 0) { + strcat(labelstr, names[j]); + class_id = j; + } + else { + strcat(labelstr, ", "); + strcat(labelstr, names[j]); + } + printf("%s: %.0f%%\n", names[j], dets[i].prob[j] * 100); + } + } + if (class_id >= 0) { + int width = im.h * .006; + + /* + if(0){ + width = pow(prob, 1./2.)*10+1; + alphabet = 0; + } + */ + + //printf("%d %s: %.0f%%\n", i, names[class_id], prob*100); + int offset = class_id * 123457 % classes; + float red = get_color(2, offset, classes); + float green = get_color(1, offset, classes); + float blue = get_color(0, offset, classes); + float rgb[3]; + + //width = prob*20+2; + + rgb[0] = red; + rgb[1] = green; + rgb[2] = blue; + box b = dets[i].bbox; + //printf("%f %f %f %f\n", b.x, b.y, b.w, b.h); + + int left = (b.x - b.w / 2.)*im.w; + int right = (b.x + b.w / 2.)*im.w; + int top = (b.y - b.h / 2.)*im.h; + int bot = (b.y + b.h / 2.)*im.h; + + if (left < 0) left = 0; + if (right > im.w - 1) right = im.w - 1; + if (top < 0) top = 0; + if (bot > im.h - 1) bot = im.h - 1; + + draw_box_width(im, left, top, right, bot, width, red, green, blue); + if (alphabet) { + image label = get_label_v3(alphabet, labelstr, (im.h*.03)); + draw_label(im, top + width, left, label, rgb); + free_image(label); + } + if (dets[i].mask) { + image mask = float_to_image(14, 14, 1, dets[i].mask); + 
image resized_mask = resize_image(mask, b.w*im.w, b.h*im.h); + image tmask = threshold_image(resized_mask, .5); + embed_image(tmask, im, left, top); + free_image(mask); + free_image(resized_mask); + free_image(tmask); + } + } + } +} + void draw_detections(image im, int num, float thresh, box *boxes, float **probs, char **names, image **alphabet, int classes) { int i; @@ -245,6 +336,93 @@ void draw_detections(image im, int num, float thresh, box *boxes, float **probs, } #ifdef OPENCV + +void draw_detections_cv_v3(IplImage* show_img, detection *dets, int num, float thresh, char **names, image **alphabet, int classes) +{ + int i, j; + if (!show_img) return; + + for (i = 0; i < num; ++i) { + char labelstr[4096] = { 0 }; + int class_id = -1; + for (j = 0; j < classes; ++j) { + if (dets[i].prob[j] > thresh) { + if (class_id < 0) { + strcat(labelstr, names[j]); + class_id = j; + } + else { + strcat(labelstr, ", "); + strcat(labelstr, names[j]); + } + printf("%s: %.0f%%\n", names[j], dets[i].prob[j] * 100); + } + } + if (class_id >= 0) { + int width = show_img->height * .006; + + /* + if(0){ + width = pow(prob, 1./2.)*10+1; + alphabet = 0; + } + */ + + //printf("%d %s: %.0f%%\n", i, names[class_id], prob*100); + int offset = class_id * 123457 % classes; + float red = get_color(2, offset, classes); + float green = get_color(1, offset, classes); + float blue = get_color(0, offset, classes); + float rgb[3]; + + //width = prob*20+2; + + rgb[0] = red; + rgb[1] = green; + rgb[2] = blue; + box b = dets[i].bbox; + //printf("%f %f %f %f\n", b.x, b.y, b.w, b.h); + + int left = (b.x - b.w / 2.)*show_img->width; + int right = (b.x + b.w / 2.)*show_img->width; + int top = (b.y - b.h / 2.)*show_img->height; + int bot = (b.y + b.h / 2.)*show_img->height; + + if (left < 0) left = 0; + if (right > show_img->width - 1) right = show_img->width - 1; + if (top < 0) top = 0; + if (bot > show_img->height - 1) bot = show_img->height - 1; + + float const font_size = show_img->height / 1000.F; + 
CvPoint pt1, pt2, pt_text, pt_text_bg1, pt_text_bg2; + pt1.x = left; + pt1.y = top; + pt2.x = right; + pt2.y = bot; + pt_text.x = left; + pt_text.y = top - 12; + pt_text_bg1.x = left; + pt_text_bg1.y = top - (10 + 25 * font_size); + pt_text_bg2.x = right; + pt_text_bg2.y = top; + CvScalar color; + color.val[0] = red * 256; + color.val[1] = green * 256; + color.val[2] = blue * 256; + + cvRectangle(show_img, pt1, pt2, color, width, 8, 0); + //printf("left=%d, right=%d, top=%d, bottom=%d, obj_id=%d, obj=%s \n", left, right, top, bot, class_id, names[class_id]); + cvRectangle(show_img, pt_text_bg1, pt_text_bg2, color, width, 8, 0); + cvRectangle(show_img, pt_text_bg1, pt_text_bg2, color, CV_FILLED, 8, 0); // filled + CvScalar black_color; + black_color.val[0] = 0; + CvFont font; + cvInitFont(&font, CV_FONT_HERSHEY_SIMPLEX, font_size, font_size, 0, font_size * 3, 8); + cvPutText(show_img, names[class_id], pt_text, &font, black_color); + } + } +} + void draw_detections_cv(IplImage* show_img, int num, float thresh, box *boxes, float **probs, char **names, image **alphabet, int classes) { int i; diff --git a/src/image.h b/src/image.h index 165a62415ee..b88cb4b1e17 100644 --- a/src/image.h +++ b/src/image.h @@ -23,6 +23,7 @@ void draw_bbox(image a, box bbox, int w, float r, float g, float b); void draw_label(image a, int r, int c, image label, const float *rgb); void write_label(image a, int r, int c, image *characters, char *string, float *rgb); void draw_detections(image im, int num, float thresh, box *boxes, float **probs, char **names, image **labels, int classes); +void draw_detections_v3(image im, detection *dets, int num, float thresh, char **names, image **alphabet, int classes); image image_distance(image a, image b); void scale_image(image m, float s); image crop_image(image im, int dx, int dy, int w, int h); diff --git a/src/layer.h b/src/layer.h index 3a0e03ddca6..5291df90052 100644 --- a/src/layer.h +++ b/src/layer.h @@ -33,7 +33,9 @@ typedef enum { NETWORK, 
XNOR, REGION, + YOLO, REORG, + UPSAMPLE, REORG_OLD, BLANK } LAYER_TYPE; @@ -109,6 +111,9 @@ struct layer{ int noadjust; int reorg; int log; + int tanh; + int *mask; + int total; int adam; float B1; @@ -133,7 +138,10 @@ struct layer{ float class_scale; int bias_match; int random; + float ignore_thresh; + float truth_thresh; float thresh; + float focus; int classfix; int absolute; diff --git a/src/network.c b/src/network.c index 61f87c5df0c..8619158ab04 100644 --- a/src/network.c +++ b/src/network.c @@ -27,6 +27,7 @@ #include "dropout_layer.h" #include "route_layer.h" #include "shortcut_layer.h" +#include "yolo_layer.h" int get_current_batch(network net) { @@ -499,6 +500,107 @@ float *network_predict(network net, float *input) return out; } +int num_detections(network *net, float thresh) +{ + int i; + int s = 0; + for (i = 0; i < net->n; ++i) { + layer l = net->layers[i]; + if (l.type == YOLO) { + s += yolo_num_detections(l, thresh); + } + if (l.type == DETECTION || l.type == REGION) { + s += l.w*l.h*l.n; + } + } + return s; +} + +detection *make_network_boxes(network *net, float thresh, int *num) +{ + layer l = net->layers[net->n - 1]; + int i; + int nboxes = num_detections(net, thresh); + if (num) *num = nboxes; + detection *dets = calloc(nboxes, sizeof(detection)); + for (i = 0; i < nboxes; ++i) { + dets[i].prob = calloc(l.classes, sizeof(float)); + if (l.coords > 4) { + dets[i].mask = calloc(l.coords - 4, sizeof(float)); + } + } + return dets; +} + + +void custom_get_region_detections(layer l, int w, int h, int net_w, int net_h, float thresh, int *map, float hier, int relative, detection *dets, int letter) +{ + box *boxes = calloc(l.w*l.h*l.n, sizeof(box)); + float **probs = calloc(l.w*l.h*l.n, sizeof(float *)); + int i, j; + for (j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float *)); + get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, map); + for (j = 0; j < l.w*l.h*l.n; ++j) { + dets[j].classes = l.classes; + dets[j].bbox = boxes[j]; + 
dets[j].objectness = 1; + for (i = 0; i < l.classes; ++i) dets[j].prob[i] = probs[j][i]; + } + + free(boxes); + free_ptrs((void **)probs, l.w*l.h*l.n); +} + +void fill_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, detection *dets, int letter) +{ + int j; + for (j = 0; j < net->n; ++j) { + layer l = net->layers[j]; + if (l.type == YOLO) { + int count = get_yolo_detections(l, w, h, net->w, net->h, thresh, map, relative, dets, letter); + dets += count; + } + if (l.type == REGION) { + custom_get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets, letter); + //get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets); + dets += l.w*l.h*l.n; + } + if (l.type == DETECTION) { + get_detection_detections(l, w, h, thresh, dets); + dets += l.w*l.h*l.n; + } + } +} + +detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num, int letter) +{ + detection *dets = make_network_boxes(net, thresh, num); + fill_network_boxes(net, w, h, thresh, hier, map, relative, dets, letter); + return dets; +} + +void free_detections(detection *dets, int n) +{ + int i; + for (i = 0; i < n; ++i) { + free(dets[i].prob); + if (dets[i].mask) free(dets[i].mask); + } + free(dets); +} + +float *network_predict_image(network *net, image im) +{ + image imr = letterbox_image(im, net->w, net->h); + set_batch_network(net, 1); + float *p = network_predict(*net, imr.data); + free_image(imr); + return p; +} + +int network_width(network *net) { return net->w; } +int network_height(network *net) { return net->h; } + matrix network_predict_data_multi(network net, data test, int n) { int i,j,b,m; diff --git a/src/network.h b/src/network.h index 2d28e810615..d7f86c10d08 100644 --- a/src/network.h +++ b/src/network.h @@ -132,6 +132,7 @@ int resize_network(network *net, int w, int h); void set_batch_network(network *net, int b); int get_network_input_size(network net); 
float get_network_cost(network net);
+detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num, int letter);
int get_network_nuisance(network net);
int get_network_background(network net);
diff --git a/src/parser.c b/src/parser.c
index 9949c50416b..a37ef1c3a75 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -30,6 +30,8 @@
#include "shortcut_layer.h"
#include "softmax_layer.h"
#include "utils.h"
+#include "upsample_layer.h"
+#include "yolo_layer.h"
#include <assert.h>

typedef struct{
@@ -47,6 +49,7 @@ LAYER_TYPE string_to_layer_type(char * type)
if (strcmp(type, "[cost]")==0) return COST;
if (strcmp(type, "[detection]")==0) return DETECTION;
if (strcmp(type, "[region]")==0) return REGION;
+ if (strcmp(type, "[yolo]") == 0) return YOLO;
if (strcmp(type, "[local]")==0) return LOCAL;
if (strcmp(type, "[conv]")==0
|| strcmp(type, "[convolutional]")==0) return CONVOLUTIONAL;
@@ -71,6 +74,7 @@ LAYER_TYPE string_to_layer_type(char * type)
if (strcmp(type, "[soft]")==0
|| strcmp(type, "[softmax]")==0) return SOFTMAX;
if (strcmp(type, "[route]")==0) return ROUTE;
+ if (strcmp(type, "[upsample]") == 0) return UPSAMPLE;
return BLANK;
}
@@ -235,6 +239,65 @@ softmax_layer parse_softmax(list *options, size_params params)
return layer;
}
+int *parse_yolo_mask(char *a, int *num)
+{
+ int *mask = 0;
+ if (a) {
+ int len = strlen(a);
+ int n = 1;
+ int i;
+ for (i = 0; i < len; ++i) {
+ if (a[i] == ',') ++n;
+ }
+ mask = calloc(n, sizeof(int));
+ for (i = 0; i < n; ++i) {
+ int val = atoi(a);
+ mask[i] = val;
+ a = strchr(a, ',') + 1;
+ }
+ *num = n;
+ }
+ return mask;
+}
+
+layer parse_yolo(list *options, size_params params)
+{
+ int classes = option_find_int(options, "classes", 20);
+ int total = option_find_int(options, "num", 1);
+ int num = total;
+
+ char *a = option_find_str(options, "mask", 0);
+ int *mask = parse_yolo_mask(a, &num);
+ layer l = make_yolo_layer(params.batch, params.w, params.h, num, total, mask, classes);
+ 
assert(l.outputs == params.inputs); + + l.max_boxes = option_find_int_quiet(options, "max", 90); + l.jitter = option_find_float(options, "jitter", .2); + + l.ignore_thresh = option_find_float(options, "ignore_thresh", .5); + l.truth_thresh = option_find_float(options, "truth_thresh", 1); + l.random = option_find_int_quiet(options, "random", 0); + + char *map_file = option_find_str(options, "map", 0); + if (map_file) l.map = read_map(map_file); + + a = option_find_str(options, "anchors", 0); + if (a) { + int len = strlen(a); + int n = 1; + int i; + for (i = 0; i < len; ++i) { + if (a[i] == ',') ++n; + } + for (i = 0; i < n; ++i) { + float bias = atof(a); + l.biases[i] = bias; + a = strchr(a, ',') + 1; + } + } + return l; +} + layer parse_region(list *options, size_params params) { int coords = option_find_int(options, "coords", 4); @@ -469,6 +532,15 @@ layer parse_activation(list *options, size_params params) return l; } +layer parse_upsample(list *options, size_params params, network net) +{ + + int stride = option_find_int(options, "stride", 2); + layer l = make_upsample_layer(params.batch, params.w, params.h, params.c, stride); + l.scale = option_find_float_quiet(options, "scale", 1); + return l; +} + route_layer parse_route(list *options, size_params params, network net) { char *l = option_find(options, "layers"); @@ -665,6 +737,8 @@ network parse_network_cfg_custom(char *filename, int batch) l = parse_cost(options, params); }else if(lt == REGION){ l = parse_region(options, params); + }else if (lt == YOLO) { + l = parse_yolo(options, params); }else if(lt == DETECTION){ l = parse_detection(options, params); }else if(lt == SOFTMAX){ @@ -684,6 +758,8 @@ network parse_network_cfg_custom(char *filename, int batch) l = parse_avgpool(options, params); }else if(lt == ROUTE){ l = parse_route(options, params, net); + }else if (lt == UPSAMPLE) { + l = parse_upsample(options, params, net); }else if(lt == SHORTCUT){ l = parse_shortcut(options, params, net); }else if(lt == 
DROPOUT){ diff --git a/src/region_layer.c b/src/region_layer.c index f1799066550..5f8e4cc6311 100644 --- a/src/region_layer.c +++ b/src/region_layer.c @@ -130,12 +130,14 @@ void delta_region_class(float *output, float *delta, int index, int class_id, in } else { // Focal loss if (focal_loss) { - // Focal Loss for Dense Object Detection: http://blog.csdn.net/linmingan/article/details/77885832 + // Focal Loss float alpha = 0.5; // 0.25 or 0.5 //float gamma = 2; // hardcoded in many places of the grad-formula int ti = index + class_id; - float grad = -2 * (1 - output[ti])*logf(fmaxf(output[ti], 0.0000001))*output[ti] + (1 - output[ti])*(1 - output[ti]); + float pt = output[ti] + 0.000000000000001F; + //float grad = -(1 - pt) * (2 * pt*logf(pt) + pt - 1); // http://blog.csdn.net/linmingan/article/details/77885832 + float grad = (1 - pt) * (2 * pt*logf(pt) + pt - 1); // https://github.com/unsky/focal-loss for (n = 0; n < classes; ++n) { delta[index + n] = scale * (((n == class_id) ? 1 : 0) - output[index + n]); @@ -165,6 +167,13 @@ float tisnan(float x) return (x != x); } +static int entry_index(layer l, int batch, int location, int entry) +{ + int n = location / (l.w*l.h); + int loc = location % (l.w*l.h); + return batch*l.outputs + n*l.w*l.h*(l.coords + l.classes + 1) + entry*l.w*l.h + loc; +} + void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output); void forward_region_layer(const region_layer l, network_state state) { @@ -454,3 +463,109 @@ void backward_region_layer_gpu(region_layer l, network_state state) } #endif + +void correct_region_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative) +{ + int i; + int new_w = 0; + int new_h = 0; + if (((float)netw / w) < ((float)neth / h)) { + new_w = netw; + new_h = (h * netw) / w; + } + else { + new_h = neth; + new_w = (w * neth) / h; + } + for (i = 0; i < n; ++i) { + box b = dets[i].bbox; + b.x = (b.x - (netw - new_w) / 2. 
/ netw) / ((float)new_w / netw); + b.y = (b.y - (neth - new_h) / 2. / neth) / ((float)new_h / neth); + b.w *= (float)netw / new_w; + b.h *= (float)neth / new_h; + if (!relative) { + b.x *= w; + b.w *= w; + b.y *= h; + b.h *= h; + } + dets[i].bbox = b; + } +} + +void get_region_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, float tree_thresh, int relative, detection *dets) +{ + int i, j, n, z; + float *predictions = l.output; + if (l.batch == 2) { + float *flip = l.output + l.outputs; + for (j = 0; j < l.h; ++j) { + for (i = 0; i < l.w / 2; ++i) { + for (n = 0; n < l.n; ++n) { + for (z = 0; z < l.classes + l.coords + 1; ++z) { + int i1 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + i; + int i2 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + (l.w - i - 1); + float swap = flip[i1]; + flip[i1] = flip[i2]; + flip[i2] = swap; + if (z == 0) { + flip[i1] = -flip[i1]; + flip[i2] = -flip[i2]; + } + } + } + } + } + for (i = 0; i < l.outputs; ++i) { + l.output[i] = (l.output[i] + flip[i]) / 2.; + } + } + for (i = 0; i < l.w*l.h; ++i) { + int row = i / l.w; + int col = i % l.w; + for (n = 0; n < l.n; ++n) { + int index = n*l.w*l.h + i; + for (j = 0; j < l.classes; ++j) { + dets[index].prob[j] = 0; + } + int obj_index = entry_index(l, 0, n*l.w*l.h + i, l.coords); + int box_index = entry_index(l, 0, n*l.w*l.h + i, 0); + int mask_index = entry_index(l, 0, n*l.w*l.h + i, 4); + float scale = l.background ? 1 : predictions[obj_index]; + dets[index].bbox = get_region_box(predictions, l.biases, n, box_index, col, row, l.w, l.h, l.w*l.h); + dets[index].objectness = scale > thresh ? 
scale : 0; + if (dets[index].mask) { + for (j = 0; j < l.coords - 4; ++j) { + dets[index].mask[j] = l.output[mask_index + j*l.w*l.h]; + } + } + + int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + !l.background); + if (l.softmax_tree) { + + hierarchy_predictions(predictions + class_index, l.classes, l.softmax_tree, 0, l.w*l.h); + if (map) { + for (j = 0; j < 200; ++j) { + int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + 1 + map[j]); + float prob = scale*predictions[class_index]; + dets[index].prob[j] = (prob > thresh) ? prob : 0; + } + } + else { + int j = hierarchy_top_prediction(predictions + class_index, l.softmax_tree, tree_thresh, l.w*l.h); + dets[index].prob[j] = (scale > thresh) ? scale : 0; + } + } + else { + if (dets[index].objectness) { + for (j = 0; j < l.classes; ++j) { + int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + 1 + j); + float prob = scale*predictions[class_index]; + dets[index].prob[j] = (prob > thresh) ? prob : 0; + } + } + } + } + } + correct_region_boxes(dets, l.w*l.h*l.n, w, h, netw, neth, relative); +} \ No newline at end of file diff --git a/src/region_layer.h b/src/region_layer.h index 0c754af71f2..d0de76ad3df 100644 --- a/src/region_layer.h +++ b/src/region_layer.h @@ -11,6 +11,7 @@ void forward_region_layer(const region_layer l, network_state state); void backward_region_layer(const region_layer l, network_state state); void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map); void resize_region_layer(layer *l, int w, int h); +void get_region_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, float tree_thresh, int relative, detection *dets); #ifdef GPU void forward_region_layer_gpu(const region_layer l, network_state state); diff --git a/src/tree.c b/src/tree.c index dfa4178727d..35ac3de19db 100644 --- a/src/tree.c +++ b/src/tree.c @@ -50,6 +50,38 @@ void hierarchy_predictions(float *predictions, int n, tree 
*hier, int only_leave } } +int hierarchy_top_prediction(float *predictions, tree *hier, float thresh, int stride) +{ + float p = 1; + int group = 0; + int i; + while (1) { + float max = 0; + int max_i = 0; + + for (i = 0; i < hier->group_size[group]; ++i) { + int index = i + hier->group_offset[group]; + float val = predictions[(i + hier->group_offset[group])*stride]; + if (val > max) { + max_i = index; + max = val; + } + } + if (p*max > thresh) { + p = p*max; + group = hier->child[max_i]; + if (hier->child[max_i] < 0) return max_i; + } + else if (group == 0) { + return max_i; + } + else { + return hier->parent[hier->group_offset[group]]; + } + } + return 0; +} + tree *read_tree(char *filename) { tree t = {0}; diff --git a/src/tree.h b/src/tree.h index c3f49797dd7..6983adf5603 100644 --- a/src/tree.h +++ b/src/tree.h @@ -5,6 +5,7 @@ typedef struct{ int *leaf; int n; int *parent; + int *child; int *group; char **name; @@ -14,6 +15,7 @@ typedef struct{ } tree; tree *read_tree(char *filename); +int hierarchy_top_prediction(float *predictions, tree *hier, float thresh, int stride); void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves); void change_leaves(tree *t, char *leaf_list); float get_hierarchy_probability(float *x, tree *hier, int c); diff --git a/src/upsample_layer.c b/src/upsample_layer.c new file mode 100644 index 00000000000..1aa5a150793 --- /dev/null +++ b/src/upsample_layer.c @@ -0,0 +1,106 @@ +#include "upsample_layer.h" +#include "cuda.h" +#include "blas.h" + +#include + +layer make_upsample_layer(int batch, int w, int h, int c, int stride) +{ + layer l = {0}; + l.type = UPSAMPLE; + l.batch = batch; + l.w = w; + l.h = h; + l.c = c; + l.out_w = w*stride; + l.out_h = h*stride; + l.out_c = c; + if(stride < 0){ + stride = -stride; + l.reverse=1; + l.out_w = w/stride; + l.out_h = h/stride; + } + l.stride = stride; + l.outputs = l.out_w*l.out_h*l.out_c; + l.inputs = l.w*l.h*l.c; + l.delta = calloc(l.outputs*batch, sizeof(float)); + 
l.output = calloc(l.outputs*batch, sizeof(float));; + + l.forward = forward_upsample_layer; + l.backward = backward_upsample_layer; + #ifdef GPU + l.forward_gpu = forward_upsample_layer_gpu; + l.backward_gpu = backward_upsample_layer_gpu; + + l.delta_gpu = cuda_make_array(l.delta, l.outputs*batch); + l.output_gpu = cuda_make_array(l.output, l.outputs*batch); + #endif + if(l.reverse) fprintf(stderr, "downsample %2dx %4d x%4d x%4d -> %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c); + else fprintf(stderr, "upsample %2dx %4d x%4d x%4d -> %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c); + return l; +} + +void resize_upsample_layer(layer *l, int w, int h) +{ + l->w = w; + l->h = h; + l->out_w = w*l->stride; + l->out_h = h*l->stride; + if(l->reverse){ + l->out_w = w/l->stride; + l->out_h = h/l->stride; + } + l->outputs = l->out_w*l->out_h*l->out_c; + l->inputs = l->h*l->w*l->c; + l->delta = realloc(l->delta, l->outputs*l->batch*sizeof(float)); + l->output = realloc(l->output, l->outputs*l->batch*sizeof(float)); + +#ifdef GPU + cuda_free(l->output_gpu); + cuda_free(l->delta_gpu); + l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch); + l->delta_gpu = cuda_make_array(l->delta, l->outputs*l->batch); +#endif + +} + +void forward_upsample_layer(const layer l, network_state net) +{ + fill_cpu(l.outputs*l.batch, 0, l.output, 1); + if(l.reverse){ + upsample_cpu(l.output, l.out_w, l.out_h, l.c, l.batch, l.stride, 0, l.scale, net.input); + }else{ + upsample_cpu(net.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.scale, l.output); + } +} + +void backward_upsample_layer(const layer l, network_state state) +{ + if(l.reverse){ + upsample_cpu(l.delta, l.out_w, l.out_h, l.c, l.batch, l.stride, 1, l.scale, state.delta); + }else{ + upsample_cpu(state.delta, l.w, l.h, l.c, l.batch, l.stride, 0, l.scale, l.delta); + } +} + +#ifdef GPU +void forward_upsample_layer_gpu(const layer l, network_state state) +{ + fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 
1); + if(l.reverse){ + upsample_gpu(l.output_gpu, l.out_w, l.out_h, l.c, l.batch, l.stride, 0, l.scale, state.input); + }else{ + upsample_gpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.scale, l.output_gpu); + } +} + +void backward_upsample_layer_gpu(const layer l, network_state state) +{ + if(l.reverse){ + upsample_gpu(l.delta_gpu, l.out_w, l.out_h, l.c, l.batch, l.stride, 1, l.scale, state.delta); + }else{ + upsample_gpu(state.delta, l.w, l.h, l.c, l.batch, l.stride, 0, l.scale, l.delta_gpu); + } +} +#endif diff --git a/src/upsample_layer.h b/src/upsample_layer.h new file mode 100644 index 00000000000..4c7ac30396d --- /dev/null +++ b/src/upsample_layer.h @@ -0,0 +1,17 @@ +#ifndef UPSAMPLE_LAYER_H +#define UPSAMPLE_LAYER_H +#include "cuda.h" +#include "layer.h" +#include "network.h" + +layer make_upsample_layer(int batch, int w, int h, int c, int stride); +void forward_upsample_layer(const layer l, network net); +void backward_upsample_layer(const layer l, network net); +void resize_upsample_layer(layer *l, int w, int h); + +#ifdef GPU +void forward_upsample_layer_gpu(const layer l, network net); +void backward_upsample_layer_gpu(const layer l, network net); +#endif + +#endif diff --git a/src/utils.c b/src/utils.c index d6bdbf60387..a97d9661433 100644 --- a/src/utils.c +++ b/src/utils.c @@ -545,6 +545,15 @@ int max_index(float *a, int n) return max_i; } +int int_index(int *a, int val, int n) +{ + int i; + for (i = 0; i < n; ++i) { + if (a[i] == val) return i; + } + return -1; +} + int rand_int(int min, int max) { if (max < min){ diff --git a/src/utils.h b/src/utils.h index 9949d6d0e30..eab26229ea9 100644 --- a/src/utils.h +++ b/src/utils.h @@ -66,6 +66,7 @@ void print_statistics(float *a, int n); unsigned int random_gen(); float random_float(); float rand_uniform_strong(float min, float max); +int int_index(int *a, int val, int n); #endif diff --git a/src/yolo_layer.c b/src/yolo_layer.c new file mode 100644 index 00000000000..46846ef7324 --- /dev/null +++ 
b/src/yolo_layer.c @@ -0,0 +1,381 @@ +#include "yolo_layer.h" +#include "activations.h" +#include "blas.h" +#include "box.h" +#include "cuda.h" +#include "utils.h" + +#include +#include +#include +#include + +layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes) +{ + int i; + layer l = {0}; + l.type = YOLO; + + l.n = n; + l.total = total; + l.batch = batch; + l.h = h; + l.w = w; + l.c = n*(classes + 4 + 1); + l.out_w = l.w; + l.out_h = l.h; + l.out_c = l.c; + l.classes = classes; + l.cost = calloc(1, sizeof(float)); + l.biases = calloc(total*2, sizeof(float)); + if(mask) l.mask = mask; + else{ + l.mask = calloc(n, sizeof(int)); + for(i = 0; i < n; ++i){ + l.mask[i] = i; + } + } + l.bias_updates = calloc(n*2, sizeof(float)); + l.outputs = h*w*n*(classes + 4 + 1); + l.inputs = l.outputs; + l.truths = 90*(4 + 1); + l.delta = calloc(batch*l.outputs, sizeof(float)); + l.output = calloc(batch*l.outputs, sizeof(float)); + for(i = 0; i < total*2; ++i){ + l.biases[i] = .5; + } + + l.forward = forward_yolo_layer; + l.backward = backward_yolo_layer; +#ifdef GPU + l.forward_gpu = forward_yolo_layer_gpu; + l.backward_gpu = backward_yolo_layer_gpu; + l.output_gpu = cuda_make_array(l.output, batch*l.outputs); + l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs); +#endif + + fprintf(stderr, "detection\n"); + srand(0); + + return l; +} + +void resize_yolo_layer(layer *l, int w, int h) +{ + l->w = w; + l->h = h; + + l->outputs = h*w*l->n*(l->classes + 4 + 1); + l->inputs = l->outputs; + + l->output = realloc(l->output, l->batch*l->outputs*sizeof(float)); + l->delta = realloc(l->delta, l->batch*l->outputs*sizeof(float)); + +#ifdef GPU + cuda_free(l->delta_gpu); + cuda_free(l->output_gpu); + + l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs); + l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs); +#endif +} + +box get_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int 
stride) +{ + box b; + b.x = (i + x[index + 0*stride]) / lw; + b.y = (j + x[index + 1*stride]) / lh; + b.w = exp(x[index + 2*stride]) * biases[2*n] / w; + b.h = exp(x[index + 3*stride]) * biases[2*n+1] / h; + return b; +} + +float delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride) +{ + box pred = get_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride); + float iou = box_iou(pred, truth); + + float tx = (truth.x*lw - i); + float ty = (truth.y*lh - j); + float tw = log(truth.w*w / biases[2*n]); + float th = log(truth.h*h / biases[2*n + 1]); + + delta[index + 0*stride] = scale * (tx - x[index + 0*stride]); + delta[index + 1*stride] = scale * (ty - x[index + 1*stride]); + delta[index + 2*stride] = scale * (tw - x[index + 2*stride]); + delta[index + 3*stride] = scale * (th - x[index + 3*stride]); + return iou; +} + + +void delta_yolo_class(float *output, float *delta, int index, int class, int classes, int stride, float *avg_cat) +{ + int n; + if (delta[index]){ + delta[index + stride*class] = 1 - output[index + stride*class]; + if(avg_cat) *avg_cat += output[index + stride*class]; + return; + } + for(n = 0; n < classes; ++n){ + delta[index + stride*n] = ((n == class)?1 : 0) - output[index + stride*n]; + if(n == class && avg_cat) *avg_cat += output[index + stride*n]; + } +} + +static int entry_index(layer l, int batch, int location, int entry) +{ + int n = location / (l.w*l.h); + int loc = location % (l.w*l.h); + return batch*l.outputs + n*l.w*l.h*(4+l.classes+1) + entry*l.w*l.h + loc; +} + +void forward_yolo_layer(const layer l, network_state state) +{ + int i,j,b,t,n; + memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float)); + +#ifndef GPU + for (b = 0; b < l.batch; ++b){ + for(n = 0; n < l.n; ++n){ + int index = entry_index(l, b, n*l.w*l.h, 0); + activate_array(l.output + index, 2*l.w*l.h, LOGISTIC); + index = entry_index(l, b, n*l.w*l.h, 4); + 
activate_array(l.output + index, (1+l.classes)*l.w*l.h, LOGISTIC); + } + } +#endif + + memset(l.delta, 0, l.outputs * l.batch * sizeof(float)); + if(!state.train) return; + float avg_iou = 0; + float recall = 0; + float recall75 = 0; + float avg_cat = 0; + float avg_obj = 0; + float avg_anyobj = 0; + int count = 0; + int class_count = 0; + *(l.cost) = 0; + for (b = 0; b < l.batch; ++b) { + for (j = 0; j < l.h; ++j) { + for (i = 0; i < l.w; ++i) { + for (n = 0; n < l.n; ++n) { + int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0); + box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.w*l.h); + float best_iou = 0; + int best_t = 0; + for(t = 0; t < l.max_boxes; ++t){ + box truth = float_to_box(state.truth + t*(4 + 1) + b*l.truths, 1); + if(!truth.x) break; + float iou = box_iou(pred, truth); + if (iou > best_iou) { + best_iou = iou; + best_t = t; + } + } + int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4); + avg_anyobj += l.output[obj_index]; + l.delta[obj_index] = 0 - l.output[obj_index]; + if (best_iou > l.ignore_thresh) { + l.delta[obj_index] = 0; + } + if (best_iou > l.truth_thresh) { + l.delta[obj_index] = 1 - l.output[obj_index]; + + int class = state.truth[best_t*(4 + 1) + b*l.truths + 4]; + if (l.map) class = l.map[class]; + int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1); + delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, 0); + box truth = float_to_box(state.truth + best_t*(4 + 1) + b*l.truths, 1); + delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h); + } + } + } + } + for(t = 0; t < l.max_boxes; ++t){ + box truth = float_to_box(state.truth + t*(4 + 1) + b*l.truths, 1); + + if(!truth.x) break; + float best_iou = 0; + int best_n = 0; + i = (truth.x * l.w); + j = (truth.y * l.h); + box truth_shift = truth; + truth_shift.x = truth_shift.y = 0; + 
for(n = 0; n < l.total; ++n){ + box pred = {0}; + pred.w = l.biases[2*n]/ state.net.w; + pred.h = l.biases[2*n+1]/ state.net.h; + float iou = box_iou(pred, truth_shift); + if (iou > best_iou){ + best_iou = iou; + best_n = n; + } + } + + int mask_n = int_index(l.mask, best_n, l.n); + if(mask_n >= 0){ + int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0); + float iou = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h); + + int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4); + avg_obj += l.output[obj_index]; + l.delta[obj_index] = 1 - l.output[obj_index]; + + int class = state.truth[t*(4 + 1) + b*l.truths + 4]; + if (l.map) class = l.map[class]; + int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1); + delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, &avg_cat); + + ++count; + ++class_count; + if(iou > .5) recall += 1; + if(iou > .75) recall75 += 1; + avg_iou += iou; + } + } + } + *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2); + printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d\n", state.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count); +} + +void backward_yolo_layer(const layer l, network_state state) +{ + axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1); +} + +void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative, int letter) +{ + int i; + int new_w=0; + int new_h=0; + if (letter) { + if (((float)netw / w) < ((float)neth / h)) { + new_w = netw; + new_h = (h * netw) / w; + } + else { + new_h = neth; + new_w = (w * neth) / h; + } + } + else { + new_w = netw; + new_h = neth; + } + for (i = 0; i < n; ++i){ + box b = dets[i].bbox; + b.x = (b.x - (netw - new_w)/2./netw) / ((float)new_w/netw); + b.y = (b.y - (neth - 
new_h)/2./neth) / ((float)new_h/neth); + b.w *= (float)netw/new_w; + b.h *= (float)neth/new_h; + if(!relative){ + b.x *= w; + b.w *= w; + b.y *= h; + b.h *= h; + } + dets[i].bbox = b; + } +} + +int yolo_num_detections(layer l, float thresh) +{ + int i, n; + int count = 0; + for (i = 0; i < l.w*l.h; ++i){ + for(n = 0; n < l.n; ++n){ + int obj_index = entry_index(l, 0, n*l.w*l.h + i, 4); + if(l.output[obj_index] > thresh){ + ++count; + } + } + } + return count; +} + +void avg_flipped_yolo(layer l) +{ + int i,j,n,z; + float *flip = l.output + l.outputs; + for (j = 0; j < l.h; ++j) { + for (i = 0; i < l.w/2; ++i) { + for (n = 0; n < l.n; ++n) { + for(z = 0; z < l.classes + 4 + 1; ++z){ + int i1 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + i; + int i2 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + (l.w - i - 1); + float swap = flip[i1]; + flip[i1] = flip[i2]; + flip[i2] = swap; + if(z == 0){ + flip[i1] = -flip[i1]; + flip[i2] = -flip[i2]; + } + } + } + } + } + for(i = 0; i < l.outputs; ++i){ + l.output[i] = (l.output[i] + flip[i])/2.; + } +} + +int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets, int letter) +{ + int i,j,n; + float *predictions = l.output; + if (l.batch == 2) avg_flipped_yolo(l); + int count = 0; + for (i = 0; i < l.w*l.h; ++i){ + int row = i / l.w; + int col = i % l.w; + for(n = 0; n < l.n; ++n){ + int obj_index = entry_index(l, 0, n*l.w*l.h + i, 4); + float objectness = predictions[obj_index]; + if(objectness <= thresh) continue; + int box_index = entry_index(l, 0, n*l.w*l.h + i, 0); + dets[count].bbox = get_yolo_box(predictions, l.biases, l.mask[n], box_index, col, row, l.w, l.h, netw, neth, l.w*l.h); + dets[count].objectness = objectness; + dets[count].classes = l.classes; + for(j = 0; j < l.classes; ++j){ + int class_index = entry_index(l, 0, n*l.w*l.h + i, 4 + 1 + j); + float prob = objectness*predictions[class_index]; + dets[count].prob[j] = (prob > thresh) ? 
prob : 0; + } + ++count; + } + } + correct_yolo_boxes(dets, count, w, h, netw, neth, relative, letter); + return count; +} + +#ifdef GPU + +void forward_yolo_layer_gpu(const layer l, network_state state) +{ + copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1); + int b, n; + for (b = 0; b < l.batch; ++b){ + for(n = 0; n < l.n; ++n){ + int index = entry_index(l, b, n*l.w*l.h, 0); + activate_array_ongpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC); + index = entry_index(l, b, n*l.w*l.h, 4); + activate_array_ongpu(l.output_gpu + index, (1+l.classes)*l.w*l.h, LOGISTIC); + } + } + if(!state.train || l.onlyforward){ + cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs); + return; + } + + cuda_pull_array(l.output_gpu, state.input, l.batch*l.inputs); + forward_yolo_layer(l, state); + cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs); +} + +void backward_yolo_layer_gpu(const layer l, network_state state) +{ + axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, state.delta, 1); +} +#endif + diff --git a/src/yolo_layer.h b/src/yolo_layer.h new file mode 100644 index 00000000000..3ad5d26bd19 --- /dev/null +++ b/src/yolo_layer.h @@ -0,0 +1,20 @@ +#ifndef YOLO_LAYER_H +#define YOLO_LAYER_H + +//#include "darknet.h" +#include "layer.h" +#include "network.h" + +layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes); +void forward_yolo_layer(const layer l, network net); +void backward_yolo_layer(const layer l, network net); +void resize_yolo_layer(layer *l, int w, int h); +int yolo_num_detections(layer l, float thresh); +int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets, int letter); + +#ifdef GPU +void forward_yolo_layer_gpu(const layer l, network net); +void backward_yolo_layer_gpu(layer l, network net); +#endif + +#endif diff --git a/video_yolov3.sh b/video_yolov3.sh new file mode 100644 index 00000000000..2d0346acb88 --- /dev/null +++ b/video_yolov3.sh @@ 
-0,0 +1,6 @@ + + +./darknet detector demo ./cfg/coco.data ./cfg/yolov3.cfg ./yolov3.weights test50.mp4 -i 0 -thresh 0.25 + + +