diff --git a/Makefile b/Makefile index f664a727489..2c5fdb6697d 100644 --- a/Makefile +++ b/Makefile @@ -85,7 +85,7 @@ LDFLAGS+= -L/usr/local/cudnn/lib64 -lcudnn endif endif -OBJ=http_stream.o gemm.o utils.o cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o reorg_old_layer.o super.o voxel.o tree.o +OBJ=http_stream.o gemm.o utils.o cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o reorg_old_layer.o super.o voxel.o tree.o yolo_layer.o upsample_layer.o ifeq ($(GPU), 1) LDFLAGS+= -lstdc++ OBJ+=convolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o network_kernels.o avgpool_layer_kernels.o diff --git a/build/darknet/darknet.vcxproj b/build/darknet/darknet.vcxproj index e8896edaa82..de0293ec82c 100644 --- a/build/darknet/darknet.vcxproj +++ b/build/darknet/darknet.vcxproj @@ -227,10 +227,12 @@ + + 
@@ -279,7 +281,9 @@ + + diff --git a/build/darknet/darknet_no_gpu.vcxproj b/build/darknet/darknet_no_gpu.vcxproj index e1c4c37f65a..9ce9b36a0e3 100644 --- a/build/darknet/darknet_no_gpu.vcxproj +++ b/build/darknet/darknet_no_gpu.vcxproj @@ -224,10 +224,12 @@ + + @@ -276,7 +278,9 @@ + + diff --git a/build/darknet/x64/cfg/yolov3.cfg b/build/darknet/x64/cfg/yolov3.cfg new file mode 100644 index 00000000000..5f3ab621302 --- /dev/null +++ b/build/darknet/x64/cfg/yolov3.cfg @@ -0,0 +1,789 @@ +[net] +# Testing +batch=1 +subdivisions=1 +# Training +# batch=64 +# subdivisions=16 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 
+activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] 
+batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 
+size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +###################### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 6,7,8 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 61 + + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + 
+[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 3,4,5 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 36 + + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 0,1,2 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + diff --git a/build/darknet/x64/darknet_demo_mjpeg_stream.cmd b/build/darknet/x64/darknet_demo_mjpeg_stream.cmd index 044b5b782a2..43148da97bc 100644 --- a/build/darknet/x64/darknet_demo_mjpeg_stream.cmd +++ b/build/darknet/x64/darknet_demo_mjpeg_stream.cmd @@ -1,7 +1,7 @@ rem Run this file and then open URL in Chrome/Firefox: rem http://localhost:8090 rem Or 
open: http://ip-address:8090 -darknet.exe detector demo data/voc.data yolo-voc.cfg yolo-voc.weights test.mp4 -i 0 -http_port 8090 +darknet.exe detector demo data/voc.data yolo-voc.cfg yolo-voc.weights test.mp4 -i 0 -http_port 8090 -dont_show pause \ No newline at end of file diff --git a/build/darknet/x64/darknet_yolo_v3.cmd b/build/darknet/x64/darknet_yolo_v3.cmd new file mode 100644 index 00000000000..a2fa22005d7 --- /dev/null +++ b/build/darknet/x64/darknet_yolo_v3.cmd @@ -0,0 +1,5 @@ + +darknet.exe detector test data/coco.data yolov3.cfg yolov3.weights -i 0 -thresh 0.25 dogr.jpg + + +pause \ No newline at end of file diff --git a/build/darknet/x64/yolov3.cfg b/build/darknet/x64/yolov3.cfg new file mode 100644 index 00000000000..5f3ab621302 --- /dev/null +++ b/build/darknet/x64/yolov3.cfg @@ -0,0 +1,789 @@ +[net] +# Testing +batch=1 +subdivisions=1 +# Training +# batch=64 +# subdivisions=16 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + 
+[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 
+filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 
+activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +###################### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 6,7,8 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + +[route] +layers = -4 + +[convolutional] 
+batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 61 + + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 3,4,5 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 36 + + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 0,1,2 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + diff --git 
a/build/darknet/yolo_cpp_dll.vcxproj b/build/darknet/yolo_cpp_dll.vcxproj index b68c5b4c3c3..c33be37f244 100644 --- a/build/darknet/yolo_cpp_dll.vcxproj +++ b/build/darknet/yolo_cpp_dll.vcxproj @@ -229,10 +229,12 @@ + + @@ -283,7 +285,9 @@ + + diff --git a/build/darknet/yolo_cpp_dll_no_gpu.vcxproj b/build/darknet/yolo_cpp_dll_no_gpu.vcxproj index c1d1d30e9e1..56753c45480 100644 --- a/build/darknet/yolo_cpp_dll_no_gpu.vcxproj +++ b/build/darknet/yolo_cpp_dll_no_gpu.vcxproj @@ -213,10 +213,12 @@ + + @@ -266,7 +268,9 @@ + + diff --git a/cfg/yolov3.cfg b/cfg/yolov3.cfg new file mode 100644 index 00000000000..5f3ab621302 --- /dev/null +++ b/cfg/yolov3.cfg @@ -0,0 +1,789 @@ +[net] +# Testing +batch=1 +subdivisions=1 +# Training +# batch=64 +# subdivisions=16 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample 
+ +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 
+filters=512 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 
+activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +###################### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 6,7,8 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 61 + + + +[convolutional] 
+batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 3,4,5 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 36 + + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 0,1,2 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + diff --git a/image_yolov3.sh b/image_yolov3.sh new file mode 100644 index 00000000000..49cc5eb66db --- /dev/null +++ b/image_yolov3.sh @@ -0,0 +1,6 @@ + + +./darknet detector test 
./cfg/coco.data ./cfg/yolov3.cfg ./yolov3.weights data/dog.jpg -i 0 -thresh 0.25 + + + diff --git a/src/blas.c b/src/blas.c index 6d565e96737..4ff0b8343ab 100644 --- a/src/blas.c +++ b/src/blas.c @@ -291,3 +291,19 @@ void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, i } } +void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out) +{ + int i, j, k, b; + for (b = 0; b < batch; ++b) { + for (k = 0; k < c; ++k) { + for (j = 0; j < h*stride; ++j) { + for (i = 0; i < w*stride; ++i) { + int in_index = b*w*h*c + k*w*h + (j / stride)*w + i / stride; + int out_index = b*w*h*c*stride*stride + k*w*h*stride*stride + j*w*stride + i; + if (forward) out[out_index] = scale*in[in_index]; + else in[in_index] += scale*out[out_index]; + } + } + } + } +} \ No newline at end of file diff --git a/src/blas.h b/src/blas.h index e1bfbf01e8d..c40422ac46e 100644 --- a/src/blas.h +++ b/src/blas.h @@ -36,6 +36,7 @@ void l2_cpu(int n, float *pred, float *truth, float *delta, float *error); void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c); void softmax(float *input, int n, float temp, float *output, int stride); +void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out); #ifdef GPU #include "cuda.h" @@ -84,5 +85,7 @@ void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out); +void upsample_gpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out); + #endif #endif diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu index 97b59779298..1edbbbde950 100644 --- a/src/blas_kernels.cu +++ b/src/blas_kernels.cu @@ -784,3 +784,34 @@ extern "C" void softmax_gpu(float *input, int n, int offset, int groups, float t check_error(cudaPeekAtLastError()); } + +__global__ void upsample_kernel(size_t N, float 
*x, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
+{
+ size_t i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+ if (i >= N) return;
+ int out_index = i;
+ int out_w = i % (w*stride);
+ i = i / (w*stride);
+ int out_h = i % (h*stride);
+ i = i / (h*stride);
+ int out_c = i%c;
+ i = i / c;
+ int b = i%batch;
+
+ int in_w = out_w / stride;
+ int in_h = out_h / stride;
+ int in_c = out_c;
+
+ int in_index = b*w*h*c + in_c*w*h + in_h*w + in_w;
+
+
+ if (forward) out[out_index] += scale * x[in_index];
+ else atomicAdd(x + in_index, scale * out[out_index]);
+}
+
+extern "C" void upsample_gpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
+{
+ size_t size = w*h*c*batch*stride*stride;
+ upsample_kernel << <cuda_gridsize(size), BLOCK >> >(size, in, w, h, c, batch, stride, forward, scale, out);
+ check_error(cudaPeekAtLastError());
+}
\ No newline at end of file
diff --git a/src/box.c b/src/box.c
index 1dc12c0614f..e02685d68e5 100644
--- a/src/box.c
+++ b/src/box.c
@@ -276,6 +276,92 @@ void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh
 free(s);
 }
 
+int nms_comparator_v3(const void *pa, const void *pb)
+{
+ detection a = *(detection *)pa;
+ detection b = *(detection *)pb;
+ float diff = 0;
+ if (b.sort_class >= 0) {
+ diff = a.prob[b.sort_class] - b.prob[b.sort_class];
+ }
+ else {
+ diff = a.objectness - b.objectness;
+ }
+ if (diff < 0) return 1;
+ else if (diff > 0) return -1;
+ return 0;
+}
+
+void do_nms_obj_v3(detection *dets, int total, int classes, float thresh)
+{
+ int i, j, k;
+ k = total - 1;
+ for (i = 0; i <= k; ++i) {
+ if (dets[i].objectness == 0) {
+ detection swap = dets[i];
+ dets[i] = dets[k];
+ dets[k] = swap;
+ --k;
+ --i;
+ }
+ }
+ total = k + 1;
+
+ for (i = 0; i < total; ++i) {
+ dets[i].sort_class = -1;
+ }
+
+ qsort(dets, total, sizeof(detection), nms_comparator_v3);
+ for (i = 0; i < total; ++i) {
+ if (dets[i].objectness == 0) continue;
+ box a = 
dets[i].bbox; + for (j = i + 1; j < total; ++j) { + if (dets[j].objectness == 0) continue; + box b = dets[j].bbox; + if (box_iou(a, b) > thresh) { + dets[j].objectness = 0; + for (k = 0; k < classes; ++k) { + dets[j].prob[k] = 0; + } + } + } + } +} + +void do_nms_sort_v3(detection *dets, int total, int classes, float thresh) +{ + int i, j, k; + k = total - 1; + for (i = 0; i <= k; ++i) { + if (dets[i].objectness == 0) { + detection swap = dets[i]; + dets[i] = dets[k]; + dets[k] = swap; + --k; + --i; + } + } + total = k + 1; + + for (k = 0; k < classes; ++k) { + for (i = 0; i < total; ++i) { + dets[i].sort_class = k; + } + qsort(dets, total, sizeof(detection), nms_comparator_v3); + for (i = 0; i < total; ++i) { + //printf(" k = %d, \t i = %d \n", k, i); + if (dets[i].prob[k] == 0) continue; + box a = dets[i].bbox; + for (j = i + 1; j < total; ++j) { + box b = dets[j].bbox; + if (box_iou(a, b) > thresh) { + dets[j].prob[k] = 0; + } + } + } + } +} + void do_nms(box *boxes, float **probs, int total, int classes, float thresh) { int i, j, k; diff --git a/src/box.h b/src/box.h index a5f8cee3cb0..c023e20eb08 100644 --- a/src/box.h +++ b/src/box.h @@ -9,12 +9,23 @@ typedef struct{ float dx, dy, dw, dh; } dbox; +typedef struct detection { + box bbox; + int classes; + float *prob; + float *mask; + float objectness; + int sort_class; +} detection; + box float_to_box(float *f); float box_iou(box a, box b); float box_rmse(box a, box b); dbox diou(box a, box b); void do_nms(box *boxes, float **probs, int total, int classes, float thresh); void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh); +void do_nms_sort_v3(detection *dets, int total, int classes, float thresh); +void do_nms_obj_v3(detection *dets, int total, int classes, float thresh); box decode_box(box b, box anchor); box encode_box(box b, box anchor); diff --git a/src/demo.c b/src/demo.c index e0a6ed35269..3b4e92daeb8 100644 --- a/src/demo.c +++ b/src/demo.c @@ -50,6 +50,7 @@ static 
IplImage* ipl_images[FRAMES]; static float *avg; void draw_detections_cv(IplImage* show_img, int num, float thresh, box *boxes, float **probs, char **names, image **alphabet, int classes); +void draw_detections_cv_v3(IplImage* show_img, detection *dets, int num, float thresh, char **names, image **alphabet, int classes); void show_image_cv_ipl(IplImage *disp, const char *name); image get_image_from_stream_resize(CvCapture *cap, int w, int h, IplImage** in_img, int use_webcam); IplImage* in_img; @@ -77,7 +78,7 @@ void *fetch_in_thread(void *ptr) void *detect_in_thread(void *ptr) { - float nms = .4; + float nms = .45; // 0.4F layer l = net.layers[net.n-1]; float *X = det_s.data; @@ -88,6 +89,7 @@ void *detect_in_thread(void *ptr) l.output = avg; free_image(det_s); + /* if(l.type == DETECTION){ get_detection_boxes(l, 1, 1, demo_thresh, probs, boxes, 0); } else if (l.type == REGION){ @@ -96,6 +98,12 @@ void *detect_in_thread(void *ptr) error("Last layer must produce detections\n"); } if (nms > 0) do_nms(boxes, probs, l.w*l.h*l.n, l.classes, nms); + */ + int letter = 0; + int nboxes = 0; + detection *dets = get_network_boxes(&net, det.w, det.h, demo_thresh, demo_thresh, 0, 1, &nboxes, letter); + if (nms) do_nms_obj_v3(dets, nboxes, l.classes, nms); + printf("\033[2J"); printf("\033[1;1H"); printf("\nFPS:%.1f\n",fps); @@ -108,7 +116,9 @@ void *detect_in_thread(void *ptr) demo_index = (demo_index + 1)%FRAMES; //draw_detections(det, l.w*l.h*l.n, demo_thresh, boxes, probs, demo_names, demo_alphabet, demo_classes); - draw_detections_cv(det_img, l.w*l.h*l.n, demo_thresh, boxes, probs, demo_names, demo_alphabet, demo_classes); + draw_detections_cv_v3(det_img, dets, nboxes, demo_thresh, demo_names, demo_alphabet, demo_classes); + //draw_detections_cv(det_img, l.w*l.h*l.n, demo_thresh, boxes, probs, demo_names, demo_alphabet, demo_classes); + free(dets); return 0; } @@ -122,7 +132,7 @@ double get_wall_time() return (double)time.tv_sec + (double)time.tv_usec * .000001; } -void 
demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, +void demo(char *cfgfile, char *weightfile, float thresh, float hier_thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, char *out_filename, int http_stream_port, int dont_show) { //skip = frame_skip; @@ -303,7 +313,7 @@ void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const ch } } #else -void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, char *out_filename, int http_stream_port, int dont_show) +void demo(char *cfgfile, char *weightfile, float thresh, float hier_thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, char *out_filename, int http_stream_port, int dont_show) { fprintf(stderr, "Demo needs OpenCV for webcam images.\n"); } diff --git a/src/detection_layer.c b/src/detection_layer.c index fd5a4198311..0a1c1070af8 100644 --- a/src/detection_layer.c +++ b/src/detection_layer.c @@ -285,3 +285,31 @@ void backward_detection_layer_gpu(detection_layer l, network_state state) } #endif +void get_detection_detections(layer l, int w, int h, float thresh, detection *dets) +{ + int i, j, n; + float *predictions = l.output; + //int per_cell = 5*num+classes; + for (i = 0; i < l.side*l.side; ++i) { + int row = i / l.side; + int col = i % l.side; + for (n = 0; n < l.n; ++n) { + int index = i*l.n + n; + int p_index = l.side*l.side*l.classes + i*l.n + n; + float scale = predictions[p_index]; + int box_index = l.side*l.side*(l.classes + l.n) + (i*l.n + n) * 4; + box b; + b.x = (predictions[box_index + 0] + col) / l.side * w; + b.y = (predictions[box_index + 1] + row) / l.side * h; + b.w = pow(predictions[box_index + 2], (l.sqrt ? 2 : 1)) * w; + b.h = pow(predictions[box_index + 3], (l.sqrt ? 
2 : 1)) * h; + dets[index].bbox = b; + dets[index].objectness = scale; + for (j = 0; j < l.classes; ++j) { + int class_index = i*l.classes; + float prob = scale*predictions[class_index + j]; + dets[index].prob[j] = (prob > thresh) ? prob : 0; + } + } + } +} \ No newline at end of file diff --git a/src/detection_layer.h b/src/detection_layer.h index e847a094ccf..9d2da9282fe 100644 --- a/src/detection_layer.h +++ b/src/detection_layer.h @@ -10,6 +10,7 @@ detection_layer make_detection_layer(int batch, int inputs, int n, int size, int void forward_detection_layer(const detection_layer l, network_state state); void backward_detection_layer(const detection_layer l, network_state state); void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness); +void get_detection_detections(layer l, int w, int h, float thresh, detection *dets); #ifdef GPU void forward_detection_layer_gpu(const detection_layer l, network_state state); diff --git a/src/detector.c b/src/detector.c index 3dfbce677e0..9581e5c8c69 100644 --- a/src/detector.c +++ b/src/detector.c @@ -1000,7 +1000,7 @@ void calc_anchors(char *datacfg, int num_of_clusters, int final_width, int final } #endif // OPENCV -void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, int dont_show) +void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh, int dont_show) { list *options = read_data_cfg(datacfg); char *name_list = option_find_str(options, "names", "data/names.list"); @@ -1017,7 +1017,7 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam char buff[256]; char *input = buff; int j; - float nms=.4; + float nms=.45; // 0.4F while(1){ if(filename){ strncpy(input, filename, 256); @@ -1030,21 +1030,27 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam strtok(input, "\n"); } image im = load_image_color(input,0,0); - 
image sized = resize_image(im, net.w, net.h); - //image sized = letterbox_image(im, net.w, net.h); + int letter = 0; + //image sized = resize_image(im, net.w, net.h); + image sized = letterbox_image(im, net.w, net.h); letter = 1; layer l = net.layers[net.n-1]; - box *boxes = calloc(l.w*l.h*l.n, sizeof(box)); - float **probs = calloc(l.w*l.h*l.n, sizeof(float *)); - for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float *)); + //box *boxes = calloc(l.w*l.h*l.n, sizeof(box)); + //float **probs = calloc(l.w*l.h*l.n, sizeof(float *)); + //for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float *)); float *X = sized.data; time=clock(); network_predict(net, X); printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); - get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, 0); - if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, l.classes, nms); - draw_detections(im, l.w*l.h*l.n, thresh, boxes, probs, names, alphabet, l.classes); + //get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, 0); + // if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, l.classes, nms); + //draw_detections(im, l.w*l.h*l.n, thresh, boxes, probs, names, alphabet, l.classes); + int nboxes = 0; + detection *dets = get_network_boxes(&net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes, letter); + if (nms) do_nms_sort_v3(dets, nboxes, l.classes, nms); + draw_detections_v3(im, dets, nboxes, thresh, names, alphabet, l.classes); + free_detections(dets, nboxes); save_image(im, "predictions"); if (!dont_show) { show_image(im, "predictions"); @@ -1052,8 +1058,8 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam free_image(im); free_image(sized); - free(boxes); - free_ptrs((void **)probs, l.w*l.h*l.n); + //free(boxes); + //free_ptrs((void **)probs, l.w*l.h*l.n); #ifdef OPENCV if (!dont_show) { cvWaitKey(0); @@ -1071,7 +1077,8 @@ void run_detector(int argc, char **argv) int http_stream_port = find_int_arg(argc, argv, "-http_port", -1); char 
*out_filename = find_char_arg(argc, argv, "-out_filename", 0); char *prefix = find_char_arg(argc, argv, "-prefix", 0); - float thresh = find_float_arg(argc, argv, "-thresh", .24); + float thresh = find_float_arg(argc, argv, "-thresh", .25); // 0.24 + float hier_thresh = find_float_arg(argc, argv, "-hier", .5); int cam_index = find_int_arg(argc, argv, "-c", 0); int frame_skip = find_int_arg(argc, argv, "-s", 0); int num_of_clusters = find_int_arg(argc, argv, "-num_of_clusters", 5); @@ -1112,7 +1119,7 @@ void run_detector(int argc, char **argv) if(weights) if (weights[strlen(weights) - 1] == 0x0d) weights[strlen(weights) - 1] = 0; char *filename = (argc > 6) ? argv[6]: 0; - if(0==strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, dont_show); + if(0==strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, hier_thresh, dont_show); else if(0==strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear, dont_show); else if(0==strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights); else if(0==strcmp(argv[2], "recall")) validate_detector_recall(datacfg, cfg, weights); @@ -1125,7 +1132,7 @@ void run_detector(int argc, char **argv) char **names = get_labels(name_list); if(filename) if (filename[strlen(filename) - 1] == 0x0d) filename[strlen(filename) - 1] = 0; - demo(cfg, weights, thresh, cam_index, filename, names, classes, frame_skip, prefix, out_filename, + demo(cfg, weights, thresh, hier_thresh, cam_index, filename, names, classes, frame_skip, prefix, out_filename, http_stream_port, dont_show); } } diff --git a/src/image.c b/src/image.c index 84919ebc62f..d29f0c1e88f 100644 --- a/src/image.c +++ b/src/image.c @@ -93,6 +93,23 @@ image get_label(image **characters, char *string, int size) return b; } +image get_label_v3(image **characters, char *string, int size) +{ + size = size / 10; + if (size > 7) size = 7; + image label = make_empty_image(0, 0, 0); + while (*string) { + image l = 
characters[size][(int)*string]; + image n = tile_images(label, l, -size - 1 + (size + 1) / 2); + free_image(label); + label = n; + ++string; + } + image b = border_image(label, label.h*.25); + free_image(label); + return b; +} + void draw_label(image a, int r, int c, image label, const float *rgb) { int w = label.w; @@ -183,6 +200,80 @@ image **load_alphabet() return alphabets; } +void draw_detections_v3(image im, detection *dets, int num, float thresh, char **names, image **alphabet, int classes) +{ + int i, j; + + for (i = 0; i < num; ++i) { + char labelstr[4096] = { 0 }; + int class_id = -1; + for (j = 0; j < classes; ++j) { + if (dets[i].prob[j] > thresh) { + if (class_id < 0) { + strcat(labelstr, names[j]); + class_id = j; + } + else { + strcat(labelstr, ", "); + strcat(labelstr, names[j]); + } + printf("%s: %.0f%%\n", names[j], dets[i].prob[j] * 100); + } + } + if (class_id >= 0) { + int width = im.h * .006; + + /* + if(0){ + width = pow(prob, 1./2.)*10+1; + alphabet = 0; + } + */ + + //printf("%d %s: %.0f%%\n", i, names[class_id], prob*100); + int offset = class_id * 123457 % classes; + float red = get_color(2, offset, classes); + float green = get_color(1, offset, classes); + float blue = get_color(0, offset, classes); + float rgb[3]; + + //width = prob*20+2; + + rgb[0] = red; + rgb[1] = green; + rgb[2] = blue; + box b = dets[i].bbox; + //printf("%f %f %f %f\n", b.x, b.y, b.w, b.h); + + int left = (b.x - b.w / 2.)*im.w; + int right = (b.x + b.w / 2.)*im.w; + int top = (b.y - b.h / 2.)*im.h; + int bot = (b.y + b.h / 2.)*im.h; + + if (left < 0) left = 0; + if (right > im.w - 1) right = im.w - 1; + if (top < 0) top = 0; + if (bot > im.h - 1) bot = im.h - 1; + + draw_box_width(im, left, top, right, bot, width, red, green, blue); + if (alphabet) { + image label = get_label_v3(alphabet, labelstr, (im.h*.03)); + draw_label(im, top + width, left, label, rgb); + free_image(label); + } + if (dets[i].mask) { + image mask = float_to_image(14, 14, 1, dets[i].mask); + 
image resized_mask = resize_image(mask, b.w*im.w, b.h*im.h); + image tmask = threshold_image(resized_mask, .5); + embed_image(tmask, im, left, top); + free_image(mask); + free_image(resized_mask); + free_image(tmask); + } + } + } +} + void draw_detections(image im, int num, float thresh, box *boxes, float **probs, char **names, image **alphabet, int classes) { int i; @@ -245,6 +336,93 @@ void draw_detections(image im, int num, float thresh, box *boxes, float **probs, } #ifdef OPENCV + +void draw_detections_cv_v3(IplImage* show_img, detection *dets, int num, float thresh, char **names, image **alphabet, int classes) +{ + int i, j; + if (!show_img) return; + + for (i = 0; i < num; ++i) { + char labelstr[4096] = { 0 }; + int class_id = -1; + for (j = 0; j < classes; ++j) { + if (dets[i].prob[j] > thresh) { + if (class_id < 0) { + strcat(labelstr, names[j]); + class_id = j; + } + else { + strcat(labelstr, ", "); + strcat(labelstr, names[j]); + } + printf("%s: %.0f%%\n", names[j], dets[i].prob[j] * 100); + } + } + if (class_id >= 0) { + int width = show_img->height * .006; + + /* + if(0){ + width = pow(prob, 1./2.)*10+1; + alphabet = 0; + } + */ + + //printf("%d %s: %.0f%%\n", i, names[class_id], prob*100); + int offset = class_id * 123457 % classes; + float red = get_color(2, offset, classes); + float green = get_color(1, offset, classes); + float blue = get_color(0, offset, classes); + float rgb[3]; + + //width = prob*20+2; + + rgb[0] = red; + rgb[1] = green; + rgb[2] = blue; + box b = dets[i].bbox; + //printf("%f %f %f %f\n", b.x, b.y, b.w, b.h); + + int left = (b.x - b.w / 2.)*show_img->width; + int right = (b.x + b.w / 2.)*show_img->width; + int top = (b.y - b.h / 2.)*show_img->height; + int bot = (b.y + b.h / 2.)*show_img->height; + + if (left < 0) left = 0; + if (right > show_img->width - 1) right = show_img->width - 1; + if (top < 0) top = 0; + if (bot > show_img->height - 1) bot = show_img->height - 1; + + float const font_size = show_img->height / 1000.F; + 
CvPoint pt1, pt2, pt_text, pt_text_bg1, pt_text_bg2; + pt1.x = left; + pt1.y = top; + pt2.x = right; + pt2.y = bot; + pt_text.x = left; + pt_text.y = top - 12; + pt_text_bg1.x = left; + pt_text_bg1.y = top - (10 + 25 * font_size); + pt_text_bg2.x = right; + pt_text_bg2.y = top; + CvScalar color; + color.val[0] = red * 256; + color.val[1] = green * 256; + color.val[2] = blue * 256; + + cvRectangle(show_img, pt1, pt2, color, width, 8, 0); + //printf("left=%d, right=%d, top=%d, bottom=%d, obj_id=%d, obj=%s \n", left, right, top, bot, class_id, names[class_id]); + cvRectangle(show_img, pt_text_bg1, pt_text_bg2, color, width, 8, 0); + cvRectangle(show_img, pt_text_bg1, pt_text_bg2, color, CV_FILLED, 8, 0); // filled + CvScalar black_color; + black_color.val[0] = 0; + CvFont font; + cvInitFont(&font, CV_FONT_HERSHEY_SIMPLEX, font_size, font_size, 0, font_size * 3, 8); + cvPutText(show_img, names[class_id], pt_text, &font, black_color); + } + } +} + void draw_detections_cv(IplImage* show_img, int num, float thresh, box *boxes, float **probs, char **names, image **alphabet, int classes) { int i; diff --git a/src/image.h b/src/image.h index 165a62415ee..b88cb4b1e17 100644 --- a/src/image.h +++ b/src/image.h @@ -23,6 +23,7 @@ void draw_bbox(image a, box bbox, int w, float r, float g, float b); void draw_label(image a, int r, int c, image label, const float *rgb); void write_label(image a, int r, int c, image *characters, char *string, float *rgb); void draw_detections(image im, int num, float thresh, box *boxes, float **probs, char **names, image **labels, int classes); +void draw_detections_v3(image im, detection *dets, int num, float thresh, char **names, image **alphabet, int classes); image image_distance(image a, image b); void scale_image(image m, float s); image crop_image(image im, int dx, int dy, int w, int h); diff --git a/src/layer.h b/src/layer.h index 3a0e03ddca6..5291df90052 100644 --- a/src/layer.h +++ b/src/layer.h @@ -33,7 +33,9 @@ typedef enum { NETWORK, 
XNOR, REGION, + YOLO, REORG, + UPSAMPLE, REORG_OLD, BLANK } LAYER_TYPE; @@ -109,6 +111,9 @@ struct layer{ int noadjust; int reorg; int log; + int tanh; + int *mask; + int total; int adam; float B1; @@ -133,7 +138,10 @@ struct layer{ float class_scale; int bias_match; int random; + float ignore_thresh; + float truth_thresh; float thresh; + float focus; int classfix; int absolute; diff --git a/src/network.c b/src/network.c index 61f87c5df0c..8619158ab04 100644 --- a/src/network.c +++ b/src/network.c @@ -27,6 +27,7 @@ #include "dropout_layer.h" #include "route_layer.h" #include "shortcut_layer.h" +#include "yolo_layer.h" int get_current_batch(network net) { @@ -499,6 +500,107 @@ float *network_predict(network net, float *input) return out; } +int num_detections(network *net, float thresh) +{ + int i; + int s = 0; + for (i = 0; i < net->n; ++i) { + layer l = net->layers[i]; + if (l.type == YOLO) { + s += yolo_num_detections(l, thresh); + } + if (l.type == DETECTION || l.type == REGION) { + s += l.w*l.h*l.n; + } + } + return s; +} + +detection *make_network_boxes(network *net, float thresh, int *num) +{ + layer l = net->layers[net->n - 1]; + int i; + int nboxes = num_detections(net, thresh); + if (num) *num = nboxes; + detection *dets = calloc(nboxes, sizeof(detection)); + for (i = 0; i < nboxes; ++i) { + dets[i].prob = calloc(l.classes, sizeof(float)); + if (l.coords > 4) { + dets[i].mask = calloc(l.coords - 4, sizeof(float)); + } + } + return dets; +} + + +void custom_get_region_detections(layer l, int w, int h, int net_w, int net_h, float thresh, int *map, float hier, int relative, detection *dets, int letter) +{ + box *boxes = calloc(l.w*l.h*l.n, sizeof(box)); + float **probs = calloc(l.w*l.h*l.n, sizeof(float *)); + int i, j; + for (j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float *)); + get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, map); + for (j = 0; j < l.w*l.h*l.n; ++j) { + dets[j].classes = l.classes; + dets[j].bbox = boxes[j]; + 
dets[j].objectness = 1; + for (i = 0; i < l.classes; ++i) dets[j].prob[i] = probs[j][i]; + } + + free(boxes); + free_ptrs((void **)probs, l.w*l.h*l.n); +} + +void fill_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, detection *dets, int letter) +{ + int j; + for (j = 0; j < net->n; ++j) { + layer l = net->layers[j]; + if (l.type == YOLO) { + int count = get_yolo_detections(l, w, h, net->w, net->h, thresh, map, relative, dets, letter); + dets += count; + } + if (l.type == REGION) { + custom_get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets, letter); + //get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets); + dets += l.w*l.h*l.n; + } + if (l.type == DETECTION) { + get_detection_detections(l, w, h, thresh, dets); + dets += l.w*l.h*l.n; + } + } +} + +detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num, int letter) +{ + detection *dets = make_network_boxes(net, thresh, num); + fill_network_boxes(net, w, h, thresh, hier, map, relative, dets, letter); + return dets; +} + +void free_detections(detection *dets, int n) +{ + int i; + for (i = 0; i < n; ++i) { + free(dets[i].prob); + if (dets[i].mask) free(dets[i].mask); + } + free(dets); +} + +float *network_predict_image(network *net, image im) +{ + image imr = letterbox_image(im, net->w, net->h); + set_batch_network(net, 1); + float *p = network_predict(*net, imr.data); + free_image(imr); + return p; +} + +int network_width(network *net) { return net->w; } +int network_height(network *net) { return net->h; } + matrix network_predict_data_multi(network net, data test, int n) { int i,j,b,m; diff --git a/src/network.h b/src/network.h index 2d28e810615..d7f86c10d08 100644 --- a/src/network.h +++ b/src/network.h @@ -132,6 +132,7 @@ int resize_network(network *net, int w, int h); void set_batch_network(network *net, int b); int get_network_input_size(network net); 
float get_network_cost(network net);
+detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num, int letter);
int get_network_nuisance(network net);
int get_network_background(network net);
diff --git a/src/parser.c b/src/parser.c
index 9949c50416b..a37ef1c3a75 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -30,6 +30,8 @@
#include "shortcut_layer.h"
#include "softmax_layer.h"
#include "utils.h"
+#include "upsample_layer.h"
+#include "yolo_layer.h"
#include <assert.h>

typedef struct{
@@ -47,6 +49,7 @@ LAYER_TYPE string_to_layer_type(char * type)
if (strcmp(type, "[cost]")==0) return COST;
if (strcmp(type, "[detection]")==0) return DETECTION;
if (strcmp(type, "[region]")==0) return REGION;
+ if (strcmp(type, "[yolo]") == 0) return YOLO;
if (strcmp(type, "[local]")==0) return LOCAL;
if (strcmp(type, "[conv]")==0
|| strcmp(type, "[convolutional]")==0) return CONVOLUTIONAL;
@@ -71,6 +74,7 @@ LAYER_TYPE string_to_layer_type(char * type)
if (strcmp(type, "[soft]")==0
|| strcmp(type, "[softmax]")==0) return SOFTMAX;
if (strcmp(type, "[route]")==0) return ROUTE;
+ if (strcmp(type, "[upsample]") == 0) return UPSAMPLE;
return BLANK;
}
@@ -235,6 +239,65 @@ softmax_layer parse_softmax(list *options, size_params params)
return layer;
}
+int *parse_yolo_mask(char *a, int *num)
+{
+ int *mask = 0;
+ if (a) {
+ int len = strlen(a);
+ int n = 1;
+ int i;
+ for (i = 0; i < len; ++i) {
+ if (a[i] == ',') ++n;
+ }
+ mask = calloc(n, sizeof(int));
+ for (i = 0; i < n; ++i) {
+ int val = atoi(a);
+ mask[i] = val;
+ a = strchr(a, ',') + 1;
+ }
+ *num = n;
+ }
+ return mask;
+}
+
+layer parse_yolo(list *options, size_params params)
+{
+ int classes = option_find_int(options, "classes", 20);
+ int total = option_find_int(options, "num", 1);
+ int num = total;
+
+ char *a = option_find_str(options, "mask", 0);
+ int *mask = parse_yolo_mask(a, &num);
+ layer l = make_yolo_layer(params.batch, params.w, params.h, num, total, mask, classes);
+ 
assert(l.outputs == params.inputs); + + l.max_boxes = option_find_int_quiet(options, "max", 90); + l.jitter = option_find_float(options, "jitter", .2); + + l.ignore_thresh = option_find_float(options, "ignore_thresh", .5); + l.truth_thresh = option_find_float(options, "truth_thresh", 1); + l.random = option_find_int_quiet(options, "random", 0); + + char *map_file = option_find_str(options, "map", 0); + if (map_file) l.map = read_map(map_file); + + a = option_find_str(options, "anchors", 0); + if (a) { + int len = strlen(a); + int n = 1; + int i; + for (i = 0; i < len; ++i) { + if (a[i] == ',') ++n; + } + for (i = 0; i < n; ++i) { + float bias = atof(a); + l.biases[i] = bias; + a = strchr(a, ',') + 1; + } + } + return l; +} + layer parse_region(list *options, size_params params) { int coords = option_find_int(options, "coords", 4); @@ -469,6 +532,15 @@ layer parse_activation(list *options, size_params params) return l; } +layer parse_upsample(list *options, size_params params, network net) +{ + + int stride = option_find_int(options, "stride", 2); + layer l = make_upsample_layer(params.batch, params.w, params.h, params.c, stride); + l.scale = option_find_float_quiet(options, "scale", 1); + return l; +} + route_layer parse_route(list *options, size_params params, network net) { char *l = option_find(options, "layers"); @@ -665,6 +737,8 @@ network parse_network_cfg_custom(char *filename, int batch) l = parse_cost(options, params); }else if(lt == REGION){ l = parse_region(options, params); + }else if (lt == YOLO) { + l = parse_yolo(options, params); }else if(lt == DETECTION){ l = parse_detection(options, params); }else if(lt == SOFTMAX){ @@ -684,6 +758,8 @@ network parse_network_cfg_custom(char *filename, int batch) l = parse_avgpool(options, params); }else if(lt == ROUTE){ l = parse_route(options, params, net); + }else if (lt == UPSAMPLE) { + l = parse_upsample(options, params, net); }else if(lt == SHORTCUT){ l = parse_shortcut(options, params, net); }else if(lt == 
DROPOUT){ diff --git a/src/region_layer.c b/src/region_layer.c index f1799066550..5f8e4cc6311 100644 --- a/src/region_layer.c +++ b/src/region_layer.c @@ -130,12 +130,14 @@ void delta_region_class(float *output, float *delta, int index, int class_id, in } else { // Focal loss if (focal_loss) { - // Focal Loss for Dense Object Detection: http://blog.csdn.net/linmingan/article/details/77885832 + // Focal Loss float alpha = 0.5; // 0.25 or 0.5 //float gamma = 2; // hardcoded in many places of the grad-formula int ti = index + class_id; - float grad = -2 * (1 - output[ti])*logf(fmaxf(output[ti], 0.0000001))*output[ti] + (1 - output[ti])*(1 - output[ti]); + float pt = output[ti] + 0.000000000000001F; + //float grad = -(1 - pt) * (2 * pt*logf(pt) + pt - 1); // http://blog.csdn.net/linmingan/article/details/77885832 + float grad = (1 - pt) * (2 * pt*logf(pt) + pt - 1); // https://github.com/unsky/focal-loss for (n = 0; n < classes; ++n) { delta[index + n] = scale * (((n == class_id) ? 1 : 0) - output[index + n]); @@ -165,6 +167,13 @@ float tisnan(float x) return (x != x); } +static int entry_index(layer l, int batch, int location, int entry) +{ + int n = location / (l.w*l.h); + int loc = location % (l.w*l.h); + return batch*l.outputs + n*l.w*l.h*(l.coords + l.classes + 1) + entry*l.w*l.h + loc; +} + void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output); void forward_region_layer(const region_layer l, network_state state) { @@ -454,3 +463,109 @@ void backward_region_layer_gpu(region_layer l, network_state state) } #endif + +void correct_region_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative) +{ + int i; + int new_w = 0; + int new_h = 0; + if (((float)netw / w) < ((float)neth / h)) { + new_w = netw; + new_h = (h * netw) / w; + } + else { + new_h = neth; + new_w = (w * neth) / h; + } + for (i = 0; i < n; ++i) { + box b = dets[i].bbox; + b.x = (b.x - (netw - new_w) / 2. 
/ netw) / ((float)new_w / netw); + b.y = (b.y - (neth - new_h) / 2. / neth) / ((float)new_h / neth); + b.w *= (float)netw / new_w; + b.h *= (float)neth / new_h; + if (!relative) { + b.x *= w; + b.w *= w; + b.y *= h; + b.h *= h; + } + dets[i].bbox = b; + } +} + +void get_region_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, float tree_thresh, int relative, detection *dets) +{ + int i, j, n, z; + float *predictions = l.output; + if (l.batch == 2) { + float *flip = l.output + l.outputs; + for (j = 0; j < l.h; ++j) { + for (i = 0; i < l.w / 2; ++i) { + for (n = 0; n < l.n; ++n) { + for (z = 0; z < l.classes + l.coords + 1; ++z) { + int i1 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + i; + int i2 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + (l.w - i - 1); + float swap = flip[i1]; + flip[i1] = flip[i2]; + flip[i2] = swap; + if (z == 0) { + flip[i1] = -flip[i1]; + flip[i2] = -flip[i2]; + } + } + } + } + } + for (i = 0; i < l.outputs; ++i) { + l.output[i] = (l.output[i] + flip[i]) / 2.; + } + } + for (i = 0; i < l.w*l.h; ++i) { + int row = i / l.w; + int col = i % l.w; + for (n = 0; n < l.n; ++n) { + int index = n*l.w*l.h + i; + for (j = 0; j < l.classes; ++j) { + dets[index].prob[j] = 0; + } + int obj_index = entry_index(l, 0, n*l.w*l.h + i, l.coords); + int box_index = entry_index(l, 0, n*l.w*l.h + i, 0); + int mask_index = entry_index(l, 0, n*l.w*l.h + i, 4); + float scale = l.background ? 1 : predictions[obj_index]; + dets[index].bbox = get_region_box(predictions, l.biases, n, box_index, col, row, l.w, l.h, l.w*l.h); + dets[index].objectness = scale > thresh ? 
scale : 0; + if (dets[index].mask) { + for (j = 0; j < l.coords - 4; ++j) { + dets[index].mask[j] = l.output[mask_index + j*l.w*l.h]; + } + } + + int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + !l.background); + if (l.softmax_tree) { + + hierarchy_predictions(predictions + class_index, l.classes, l.softmax_tree, 0, l.w*l.h); + if (map) { + for (j = 0; j < 200; ++j) { + int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + 1 + map[j]); + float prob = scale*predictions[class_index]; + dets[index].prob[j] = (prob > thresh) ? prob : 0; + } + } + else { + int j = hierarchy_top_prediction(predictions + class_index, l.softmax_tree, tree_thresh, l.w*l.h); + dets[index].prob[j] = (scale > thresh) ? scale : 0; + } + } + else { + if (dets[index].objectness) { + for (j = 0; j < l.classes; ++j) { + int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + 1 + j); + float prob = scale*predictions[class_index]; + dets[index].prob[j] = (prob > thresh) ? prob : 0; + } + } + } + } + } + correct_region_boxes(dets, l.w*l.h*l.n, w, h, netw, neth, relative); +} \ No newline at end of file diff --git a/src/region_layer.h b/src/region_layer.h index 0c754af71f2..d0de76ad3df 100644 --- a/src/region_layer.h +++ b/src/region_layer.h @@ -11,6 +11,7 @@ void forward_region_layer(const region_layer l, network_state state); void backward_region_layer(const region_layer l, network_state state); void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map); void resize_region_layer(layer *l, int w, int h); +void get_region_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, float tree_thresh, int relative, detection *dets); #ifdef GPU void forward_region_layer_gpu(const region_layer l, network_state state); diff --git a/src/tree.c b/src/tree.c index dfa4178727d..35ac3de19db 100644 --- a/src/tree.c +++ b/src/tree.c @@ -50,6 +50,38 @@ void hierarchy_predictions(float *predictions, int n, tree 
*hier, int only_leave } } +int hierarchy_top_prediction(float *predictions, tree *hier, float thresh, int stride) +{ + float p = 1; + int group = 0; + int i; + while (1) { + float max = 0; + int max_i = 0; + + for (i = 0; i < hier->group_size[group]; ++i) { + int index = i + hier->group_offset[group]; + float val = predictions[(i + hier->group_offset[group])*stride]; + if (val > max) { + max_i = index; + max = val; + } + } + if (p*max > thresh) { + p = p*max; + group = hier->child[max_i]; + if (hier->child[max_i] < 0) return max_i; + } + else if (group == 0) { + return max_i; + } + else { + return hier->parent[hier->group_offset[group]]; + } + } + return 0; +} + tree *read_tree(char *filename) { tree t = {0}; diff --git a/src/tree.h b/src/tree.h index c3f49797dd7..6983adf5603 100644 --- a/src/tree.h +++ b/src/tree.h @@ -5,6 +5,7 @@ typedef struct{ int *leaf; int n; int *parent; + int *child; int *group; char **name; @@ -14,6 +15,7 @@ typedef struct{ } tree; tree *read_tree(char *filename); +int hierarchy_top_prediction(float *predictions, tree *hier, float thresh, int stride); void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves); void change_leaves(tree *t, char *leaf_list); float get_hierarchy_probability(float *x, tree *hier, int c); diff --git a/src/upsample_layer.c b/src/upsample_layer.c new file mode 100644 index 00000000000..1aa5a150793 --- /dev/null +++ b/src/upsample_layer.c @@ -0,0 +1,106 @@ +#include "upsample_layer.h" +#include "cuda.h" +#include "blas.h" + +#include + +layer make_upsample_layer(int batch, int w, int h, int c, int stride) +{ + layer l = {0}; + l.type = UPSAMPLE; + l.batch = batch; + l.w = w; + l.h = h; + l.c = c; + l.out_w = w*stride; + l.out_h = h*stride; + l.out_c = c; + if(stride < 0){ + stride = -stride; + l.reverse=1; + l.out_w = w/stride; + l.out_h = h/stride; + } + l.stride = stride; + l.outputs = l.out_w*l.out_h*l.out_c; + l.inputs = l.w*l.h*l.c; + l.delta = calloc(l.outputs*batch, sizeof(float)); + 
l.output = calloc(l.outputs*batch, sizeof(float));; + + l.forward = forward_upsample_layer; + l.backward = backward_upsample_layer; + #ifdef GPU + l.forward_gpu = forward_upsample_layer_gpu; + l.backward_gpu = backward_upsample_layer_gpu; + + l.delta_gpu = cuda_make_array(l.delta, l.outputs*batch); + l.output_gpu = cuda_make_array(l.output, l.outputs*batch); + #endif + if(l.reverse) fprintf(stderr, "downsample %2dx %4d x%4d x%4d -> %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c); + else fprintf(stderr, "upsample %2dx %4d x%4d x%4d -> %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c); + return l; +} + +void resize_upsample_layer(layer *l, int w, int h) +{ + l->w = w; + l->h = h; + l->out_w = w*l->stride; + l->out_h = h*l->stride; + if(l->reverse){ + l->out_w = w/l->stride; + l->out_h = h/l->stride; + } + l->outputs = l->out_w*l->out_h*l->out_c; + l->inputs = l->h*l->w*l->c; + l->delta = realloc(l->delta, l->outputs*l->batch*sizeof(float)); + l->output = realloc(l->output, l->outputs*l->batch*sizeof(float)); + +#ifdef GPU + cuda_free(l->output_gpu); + cuda_free(l->delta_gpu); + l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch); + l->delta_gpu = cuda_make_array(l->delta, l->outputs*l->batch); +#endif + +} + +void forward_upsample_layer(const layer l, network_state net) +{ + fill_cpu(l.outputs*l.batch, 0, l.output, 1); + if(l.reverse){ + upsample_cpu(l.output, l.out_w, l.out_h, l.c, l.batch, l.stride, 0, l.scale, net.input); + }else{ + upsample_cpu(net.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.scale, l.output); + } +} + +void backward_upsample_layer(const layer l, network_state state) +{ + if(l.reverse){ + upsample_cpu(l.delta, l.out_w, l.out_h, l.c, l.batch, l.stride, 1, l.scale, state.delta); + }else{ + upsample_cpu(state.delta, l.w, l.h, l.c, l.batch, l.stride, 0, l.scale, l.delta); + } +} + +#ifdef GPU +void forward_upsample_layer_gpu(const layer l, network_state state) +{ + fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 
1); + if(l.reverse){ + upsample_gpu(l.output_gpu, l.out_w, l.out_h, l.c, l.batch, l.stride, 0, l.scale, state.input); + }else{ + upsample_gpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.scale, l.output_gpu); + } +} + +void backward_upsample_layer_gpu(const layer l, network_state state) +{ + if(l.reverse){ + upsample_gpu(l.delta_gpu, l.out_w, l.out_h, l.c, l.batch, l.stride, 1, l.scale, state.delta); + }else{ + upsample_gpu(state.delta, l.w, l.h, l.c, l.batch, l.stride, 0, l.scale, l.delta_gpu); + } +} +#endif diff --git a/src/upsample_layer.h b/src/upsample_layer.h new file mode 100644 index 00000000000..4c7ac30396d --- /dev/null +++ b/src/upsample_layer.h @@ -0,0 +1,17 @@ +#ifndef UPSAMPLE_LAYER_H +#define UPSAMPLE_LAYER_H +#include "cuda.h" +#include "layer.h" +#include "network.h" + +layer make_upsample_layer(int batch, int w, int h, int c, int stride); +void forward_upsample_layer(const layer l, network net); +void backward_upsample_layer(const layer l, network net); +void resize_upsample_layer(layer *l, int w, int h); + +#ifdef GPU +void forward_upsample_layer_gpu(const layer l, network net); +void backward_upsample_layer_gpu(const layer l, network net); +#endif + +#endif diff --git a/src/utils.c b/src/utils.c index d6bdbf60387..a97d9661433 100644 --- a/src/utils.c +++ b/src/utils.c @@ -545,6 +545,15 @@ int max_index(float *a, int n) return max_i; } +int int_index(int *a, int val, int n) +{ + int i; + for (i = 0; i < n; ++i) { + if (a[i] == val) return i; + } + return -1; +} + int rand_int(int min, int max) { if (max < min){ diff --git a/src/utils.h b/src/utils.h index 9949d6d0e30..eab26229ea9 100644 --- a/src/utils.h +++ b/src/utils.h @@ -66,6 +66,7 @@ void print_statistics(float *a, int n); unsigned int random_gen(); float random_float(); float rand_uniform_strong(float min, float max); +int int_index(int *a, int val, int n); #endif diff --git a/src/yolo_layer.c b/src/yolo_layer.c new file mode 100644 index 00000000000..46846ef7324 --- /dev/null +++ 
b/src/yolo_layer.c @@ -0,0 +1,381 @@ +#include "yolo_layer.h" +#include "activations.h" +#include "blas.h" +#include "box.h" +#include "cuda.h" +#include "utils.h" + +#include +#include +#include +#include + +layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes) +{ + int i; + layer l = {0}; + l.type = YOLO; + + l.n = n; + l.total = total; + l.batch = batch; + l.h = h; + l.w = w; + l.c = n*(classes + 4 + 1); + l.out_w = l.w; + l.out_h = l.h; + l.out_c = l.c; + l.classes = classes; + l.cost = calloc(1, sizeof(float)); + l.biases = calloc(total*2, sizeof(float)); + if(mask) l.mask = mask; + else{ + l.mask = calloc(n, sizeof(int)); + for(i = 0; i < n; ++i){ + l.mask[i] = i; + } + } + l.bias_updates = calloc(n*2, sizeof(float)); + l.outputs = h*w*n*(classes + 4 + 1); + l.inputs = l.outputs; + l.truths = 90*(4 + 1); + l.delta = calloc(batch*l.outputs, sizeof(float)); + l.output = calloc(batch*l.outputs, sizeof(float)); + for(i = 0; i < total*2; ++i){ + l.biases[i] = .5; + } + + l.forward = forward_yolo_layer; + l.backward = backward_yolo_layer; +#ifdef GPU + l.forward_gpu = forward_yolo_layer_gpu; + l.backward_gpu = backward_yolo_layer_gpu; + l.output_gpu = cuda_make_array(l.output, batch*l.outputs); + l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs); +#endif + + fprintf(stderr, "detection\n"); + srand(0); + + return l; +} + +void resize_yolo_layer(layer *l, int w, int h) +{ + l->w = w; + l->h = h; + + l->outputs = h*w*l->n*(l->classes + 4 + 1); + l->inputs = l->outputs; + + l->output = realloc(l->output, l->batch*l->outputs*sizeof(float)); + l->delta = realloc(l->delta, l->batch*l->outputs*sizeof(float)); + +#ifdef GPU + cuda_free(l->delta_gpu); + cuda_free(l->output_gpu); + + l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs); + l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs); +#endif +} + +box get_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int 
stride) +{ + box b; + b.x = (i + x[index + 0*stride]) / lw; + b.y = (j + x[index + 1*stride]) / lh; + b.w = exp(x[index + 2*stride]) * biases[2*n] / w; + b.h = exp(x[index + 3*stride]) * biases[2*n+1] / h; + return b; +} + +float delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride) +{ + box pred = get_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride); + float iou = box_iou(pred, truth); + + float tx = (truth.x*lw - i); + float ty = (truth.y*lh - j); + float tw = log(truth.w*w / biases[2*n]); + float th = log(truth.h*h / biases[2*n + 1]); + + delta[index + 0*stride] = scale * (tx - x[index + 0*stride]); + delta[index + 1*stride] = scale * (ty - x[index + 1*stride]); + delta[index + 2*stride] = scale * (tw - x[index + 2*stride]); + delta[index + 3*stride] = scale * (th - x[index + 3*stride]); + return iou; +} + + +void delta_yolo_class(float *output, float *delta, int index, int class, int classes, int stride, float *avg_cat) +{ + int n; + if (delta[index]){ + delta[index + stride*class] = 1 - output[index + stride*class]; + if(avg_cat) *avg_cat += output[index + stride*class]; + return; + } + for(n = 0; n < classes; ++n){ + delta[index + stride*n] = ((n == class)?1 : 0) - output[index + stride*n]; + if(n == class && avg_cat) *avg_cat += output[index + stride*n]; + } +} + +static int entry_index(layer l, int batch, int location, int entry) +{ + int n = location / (l.w*l.h); + int loc = location % (l.w*l.h); + return batch*l.outputs + n*l.w*l.h*(4+l.classes+1) + entry*l.w*l.h + loc; +} + +void forward_yolo_layer(const layer l, network_state state) +{ + int i,j,b,t,n; + memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float)); + +#ifndef GPU + for (b = 0; b < l.batch; ++b){ + for(n = 0; n < l.n; ++n){ + int index = entry_index(l, b, n*l.w*l.h, 0); + activate_array(l.output + index, 2*l.w*l.h, LOGISTIC); + index = entry_index(l, b, n*l.w*l.h, 4); + 
activate_array(l.output + index, (1+l.classes)*l.w*l.h, LOGISTIC); + } + } +#endif + + memset(l.delta, 0, l.outputs * l.batch * sizeof(float)); + if(!state.train) return; + float avg_iou = 0; + float recall = 0; + float recall75 = 0; + float avg_cat = 0; + float avg_obj = 0; + float avg_anyobj = 0; + int count = 0; + int class_count = 0; + *(l.cost) = 0; + for (b = 0; b < l.batch; ++b) { + for (j = 0; j < l.h; ++j) { + for (i = 0; i < l.w; ++i) { + for (n = 0; n < l.n; ++n) { + int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0); + box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.w*l.h); + float best_iou = 0; + int best_t = 0; + for(t = 0; t < l.max_boxes; ++t){ + box truth = float_to_box(state.truth + t*(4 + 1) + b*l.truths, 1); + if(!truth.x) break; + float iou = box_iou(pred, truth); + if (iou > best_iou) { + best_iou = iou; + best_t = t; + } + } + int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4); + avg_anyobj += l.output[obj_index]; + l.delta[obj_index] = 0 - l.output[obj_index]; + if (best_iou > l.ignore_thresh) { + l.delta[obj_index] = 0; + } + if (best_iou > l.truth_thresh) { + l.delta[obj_index] = 1 - l.output[obj_index]; + + int class = state.truth[best_t*(4 + 1) + b*l.truths + 4]; + if (l.map) class = l.map[class]; + int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1); + delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, 0); + box truth = float_to_box(state.truth + best_t*(4 + 1) + b*l.truths, 1); + delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h); + } + } + } + } + for(t = 0; t < l.max_boxes; ++t){ + box truth = float_to_box(state.truth + t*(4 + 1) + b*l.truths, 1); + + if(!truth.x) break; + float best_iou = 0; + int best_n = 0; + i = (truth.x * l.w); + j = (truth.y * l.h); + box truth_shift = truth; + truth_shift.x = truth_shift.y = 0; + 
for(n = 0; n < l.total; ++n){ + box pred = {0}; + pred.w = l.biases[2*n]/ state.net.w; + pred.h = l.biases[2*n+1]/ state.net.h; + float iou = box_iou(pred, truth_shift); + if (iou > best_iou){ + best_iou = iou; + best_n = n; + } + } + + int mask_n = int_index(l.mask, best_n, l.n); + if(mask_n >= 0){ + int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0); + float iou = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h); + + int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4); + avg_obj += l.output[obj_index]; + l.delta[obj_index] = 1 - l.output[obj_index]; + + int class = state.truth[t*(4 + 1) + b*l.truths + 4]; + if (l.map) class = l.map[class]; + int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1); + delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, &avg_cat); + + ++count; + ++class_count; + if(iou > .5) recall += 1; + if(iou > .75) recall75 += 1; + avg_iou += iou; + } + } + } + *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2); + printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d\n", state.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count); +} + +void backward_yolo_layer(const layer l, network_state state) +{ + axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1); +} + +void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative, int letter) +{ + int i; + int new_w=0; + int new_h=0; + if (letter) { + if (((float)netw / w) < ((float)neth / h)) { + new_w = netw; + new_h = (h * netw) / w; + } + else { + new_h = neth; + new_w = (w * neth) / h; + } + } + else { + new_w = netw; + new_h = neth; + } + for (i = 0; i < n; ++i){ + box b = dets[i].bbox; + b.x = (b.x - (netw - new_w)/2./netw) / ((float)new_w/netw); + b.y = (b.y - (neth - 
new_h)/2./neth) / ((float)new_h/neth); + b.w *= (float)netw/new_w; + b.h *= (float)neth/new_h; + if(!relative){ + b.x *= w; + b.w *= w; + b.y *= h; + b.h *= h; + } + dets[i].bbox = b; + } +} + +int yolo_num_detections(layer l, float thresh) +{ + int i, n; + int count = 0; + for (i = 0; i < l.w*l.h; ++i){ + for(n = 0; n < l.n; ++n){ + int obj_index = entry_index(l, 0, n*l.w*l.h + i, 4); + if(l.output[obj_index] > thresh){ + ++count; + } + } + } + return count; +} + +void avg_flipped_yolo(layer l) +{ + int i,j,n,z; + float *flip = l.output + l.outputs; + for (j = 0; j < l.h; ++j) { + for (i = 0; i < l.w/2; ++i) { + for (n = 0; n < l.n; ++n) { + for(z = 0; z < l.classes + 4 + 1; ++z){ + int i1 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + i; + int i2 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + (l.w - i - 1); + float swap = flip[i1]; + flip[i1] = flip[i2]; + flip[i2] = swap; + if(z == 0){ + flip[i1] = -flip[i1]; + flip[i2] = -flip[i2]; + } + } + } + } + } + for(i = 0; i < l.outputs; ++i){ + l.output[i] = (l.output[i] + flip[i])/2.; + } +} + +int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets, int letter) +{ + int i,j,n; + float *predictions = l.output; + if (l.batch == 2) avg_flipped_yolo(l); + int count = 0; + for (i = 0; i < l.w*l.h; ++i){ + int row = i / l.w; + int col = i % l.w; + for(n = 0; n < l.n; ++n){ + int obj_index = entry_index(l, 0, n*l.w*l.h + i, 4); + float objectness = predictions[obj_index]; + if(objectness <= thresh) continue; + int box_index = entry_index(l, 0, n*l.w*l.h + i, 0); + dets[count].bbox = get_yolo_box(predictions, l.biases, l.mask[n], box_index, col, row, l.w, l.h, netw, neth, l.w*l.h); + dets[count].objectness = objectness; + dets[count].classes = l.classes; + for(j = 0; j < l.classes; ++j){ + int class_index = entry_index(l, 0, n*l.w*l.h + i, 4 + 1 + j); + float prob = objectness*predictions[class_index]; + dets[count].prob[j] = (prob > thresh) ? 
prob : 0; + } + ++count; + } + } + correct_yolo_boxes(dets, count, w, h, netw, neth, relative, letter); + return count; +} + +#ifdef GPU + +void forward_yolo_layer_gpu(const layer l, network_state state) +{ + copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1); + int b, n; + for (b = 0; b < l.batch; ++b){ + for(n = 0; n < l.n; ++n){ + int index = entry_index(l, b, n*l.w*l.h, 0); + activate_array_ongpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC); + index = entry_index(l, b, n*l.w*l.h, 4); + activate_array_ongpu(l.output_gpu + index, (1+l.classes)*l.w*l.h, LOGISTIC); + } + } + if(!state.train || l.onlyforward){ + cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs); + return; + } + + cuda_pull_array(l.output_gpu, state.input, l.batch*l.inputs); + forward_yolo_layer(l, state); + cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs); +} + +void backward_yolo_layer_gpu(const layer l, network_state state) +{ + axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, state.delta, 1); +} +#endif + diff --git a/src/yolo_layer.h b/src/yolo_layer.h new file mode 100644 index 00000000000..3ad5d26bd19 --- /dev/null +++ b/src/yolo_layer.h @@ -0,0 +1,20 @@ +#ifndef YOLO_LAYER_H +#define YOLO_LAYER_H + +//#include "darknet.h" +#include "layer.h" +#include "network.h" + +layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes); +void forward_yolo_layer(const layer l, network net); +void backward_yolo_layer(const layer l, network net); +void resize_yolo_layer(layer *l, int w, int h); +int yolo_num_detections(layer l, float thresh); +int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets, int letter); + +#ifdef GPU +void forward_yolo_layer_gpu(const layer l, network net); +void backward_yolo_layer_gpu(layer l, network net); +#endif + +#endif diff --git a/video_yolov3.sh b/video_yolov3.sh new file mode 100644 index 00000000000..2d0346acb88 --- /dev/null +++ b/video_yolov3.sh @@ 
-0,0 +1,6 @@ + + +./darknet detector demo ./cfg/coco.data ./cfg/yolov3.cfg ./yolov3.weights test50.mp4 -i 0 -thresh 0.25 + + +