Add instructions and sample code for using TriNet.

lucasb-eyer · lucasb-eyer · commit aa4b8e5b6847 · 2017-08-15T07:20:19.000+02:00
diff --git a/README.md b/README.md
@@ -1,5 +1,47 @@
 # Triplet-based Person Re-Identification
+
 Code for reproducing the results of our "In Defense of the Triplet Loss for Person Re-Identification" paper.
 
-# Publication Pending!
-The publication of all code is pending acceptance of the paper. Since we've been asked several times, this work was a submission to ICCV'17 and you can have look at the [timeline](http://iccv2017.thecvf.com/submission/timeline), so if all goes well, code will come online sometime this summer.
+Both main authors are currently doing internships.
+We will publish the full training code once our internships are over, which is at the end of September 2017.
+(By "Watching" this project on github, you will receive e-mails about updates to this repo.)
+Meanwhile, we provide the pre-trained weights for the TriNet model, as well as some rudimentary example code for using it to compute embeddings, see below.
+
+# Pretrained Models
+
+This is a first, simple release. A better, more generic script will follow in a few months, but this should be enough to get started trying out our models!
+
+As a first step, download the weights for the TriNet model [trained on MARS](https://omnomnom.vision.rwth-aachen.de/data/trinet-mars.npz) or trained on [Market1501](https://omnomnom.vision.rwth-aachen.de/data/trinet-market1501.npz).
+(Pre-trained LuNet models will follow.)
+
+Next, create a file (`files.txt`) which contains the full path to the image files you want to embed, one filename per line, like so:
+
+```
+/path/to/file1.png
+/path/to/file2.jpg
+```
+
+Finally, run the `trinet_embed.py` script, passing both the above file and the weights file you want to use, like so:
+
+```
+python trinet_embed.py files.txt /path/to/trinet-mars.npz
+```
+
+And it will output one comma-separated line for each file, containing the filename followed by the embedding, like so:
+
+```
+/path/to/file1.png,-1.234,5.678,...
+/path/to/file2.jpg,9.876,-1.234,...
+```
+
+You could for example redirect it to a file for further processing:
+
+```
+python trinet_embed.py files.txt /path/to/trinet-market1501.npz >embeddings.csv
+```
+
+You can now do meaningful work by comparing these embeddings using the Euclidean distance. For example, try some K-means clustering!
+
+A couple of notes:
+- The script depends on both [Theano](http://deeplearning.net/software/theano/install.html) and [Lasagne](http://lasagne.readthedocs.io/en/latest/user/installation.html) being correctly installed.
+- The input files should be crops of a full person standing upright, and they will be resized to `288x144` (height x width) before being passed to the network.
diff --git a/trinet_embed.py b/trinet_embed.py
@@ -0,0 +1,300 @@
+#!/usr/bin/env python
+from __future__ import print_function
+import numpy as np
+import cv2
+import pickle
+import sys
+
+
+# Require exactly two command-line arguments; otherwise print the usage and exit with status 1.
+if len(sys.argv) != 3:
+    print("Usage: {} IMAGE_LIST_FILE MODEL_WEIGHT_FILE".format(sys.argv[0]))
+    sys.exit(1)
+
+# First argument: a text file listing the images to embed, one path per line (surrounding
+# whitespace is stripped from each line). Second argument: the weight file to load.
+# NOTE(review): blank lines in the list file are kept as empty filenames, and the file handle
+# is never explicitly closed.
+image_list = list(map(str.strip, open(sys.argv[1]).readlines()))
+weight_fname = sys.argv[2]
+
+
+
+# Setup the pretrained ResNet
+
+#This is based on the Lasagne ResNet-50 example with slight modifications to allow for different input sizes.
+#The original can be found at: https://github.com/Lasagne/Recipes/blob/master/examples/resnet50/ImageNet%20Pretrained%20Network%20(ResNet-50).ipynb
+import theano
+import lasagne
+from lasagne.layers import InputLayer
+from lasagne.layers import Conv2DLayer as ConvLayer
+from lasagne.layers import BatchNormLayer
+from lasagne.layers import Pool2DLayer as PoolLayer
+from lasagne.layers import NonlinearityLayer
+from lasagne.layers import ElemwiseSumLayer
+from lasagne.layers import DenseLayer
+from lasagne.nonlinearities import rectify, softmax
+
+
+def build_simple_block(incoming_layer, names,
+                       num_filters, filter_size, stride, pad,
+                       use_bias=False, nonlin=rectify):
+    """Creates stacked Lasagne layers ConvLayer -> BN -> (ReLu)
+
+    The nonlinearity layer is only appended when `nonlin` is not None.
+
+    Parameters:
+    ----------
+    incoming_layer : instance of Lasagne layer
+        Parent layer
+
+    names : list of string
+        Names of the layers in block, in the order (conv, batch norm, nonlinearity).
+        The third name is only used when `nonlin` is not None.
+
+    num_filters : int
+        Number of filters in convolution layer
+
+    filter_size : int
+        Size of filters in convolution layer
+
+    stride : int
+        Stride of convolution layer
+
+    pad : int
+        Padding of convolution layer
+
+    use_bias : bool
+        Whether to use bias in convolution layer. If False, the layer is
+        created with `b=None`; if True, the layer's default bias is kept.
+
+    nonlin : function
+        Nonlinearity type of Nonlinearity layer, or None to omit that layer
+
+    Returns
+    -------
+    tuple: (net, last_layer_name)
+        net : dict
+            Dictionary with stacked layers, keyed by the given names
+        last_layer_name : string
+            Last layer name
+    """
+    # Collected as (name, layer) pairs so each new layer can be stacked on the previous one.
+    net = []
+    # The convolution itself is always linear; any nonlinearity is a separate layer below.
+    net.append((
+            names[0],
+            ConvLayer(incoming_layer, num_filters, filter_size, stride, pad,
+                      flip_filters=False, nonlinearity=None) if use_bias
+            else ConvLayer(incoming_layer, num_filters, filter_size, stride, pad, b=None,
+                           flip_filters=False, nonlinearity=None)
+        ))
+
+    # Batch normalization on top of the convolution output.
+    net.append((
+            names[1],
+            BatchNormLayer(net[-1][1])
+        ))
+    if nonlin is not None:
+        net.append((
+            names[2],
+            NonlinearityLayer(net[-1][1], nonlinearity=nonlin)
+        ))
+
+    # net[-1][0] is the name of whichever layer was appended last.
+    return dict(net), net[-1][0]
+
+
+def build_residual_block(incoming_layer, ratio_n_filter=1.0, ratio_size=1.0, has_left_branch=False,
+                         upscale_factor=4, ix=''):
+    """Creates two-branch residual block
+
+    The right branch stacks three simple blocks (filter sizes 1, 3, 1); the
+    left branch is either the incoming layer itself or a single simple block
+    without nonlinearity. Both branches are summed and passed through a ReLU.
+
+    Parameters:
+    ----------
+    incoming_layer : instance of Lasagne layer
+        Parent layer
+
+    ratio_n_filter : float
+        Scale factor of filter bank at the input of residual block
+
+    ratio_size : float
+        Inverse of the stride: the first convolution of the right branch (and
+        the left-branch convolution, if present) uses stride int(1.0/ratio_size)
+
+    has_left_branch : bool
+        if True, then left branch contains simple block
+
+    upscale_factor : float
+        Scale factor of filter bank at the output of residual block
+
+    ix : string
+        Id of residual block, inserted into the layer names (e.g. '2a')
+
+    Returns
+    -------
+    tuple: (net, last_layer_name)
+        net : dict
+            Dictionary with stacked layers
+        last_layer_name : string
+            Last layer name
+    """
+    # Name templates for (conv, batch norm, relu), filled with (block id, branch number, sub-block letter).
+    simple_block_name_pattern = ['res%s_branch%i%s', 'bn%s_branch%i%s', 'res%s_branch%i%s_relu']
+
+    net = {}
+
+    # right branch
+    # 'a': filter size 1; filter count is the incoming channel count (index 1 of the
+    # output shape) scaled by ratio_n_filter; stride is int(1.0/ratio_size).
+    net_tmp, last_layer_name = build_simple_block(
+        incoming_layer, list(map(lambda s: s % (ix, 2, 'a'), simple_block_name_pattern)),
+        int(lasagne.layers.get_output_shape(incoming_layer)[1]*ratio_n_filter), 1, int(1.0/ratio_size), 0)
+    net.update(net_tmp)
+
+    # 'b': filter size 3, stride 1, pad 1, same number of filters as 'a'.
+    net_tmp, last_layer_name = build_simple_block(
+        net[last_layer_name], list(map(lambda s: s % (ix, 2, 'b'), simple_block_name_pattern)),
+        lasagne.layers.get_output_shape(net[last_layer_name])[1], 3, 1, 1)
+    net.update(net_tmp)
+
+    # 'c': filter size 1, filter count multiplied by upscale_factor, no nonlinearity before the sum.
+    net_tmp, last_layer_name = build_simple_block(
+        net[last_layer_name], list(map(lambda s: s % (ix, 2, 'c'), simple_block_name_pattern)),
+        lasagne.layers.get_output_shape(net[last_layer_name])[1]*upscale_factor, 1, 1, 0,
+        nonlin=None)
+    net.update(net_tmp)
+
+    right_tail = net[last_layer_name]
+    # Without a left branch, the incoming layer is added to the right branch unchanged.
+    left_tail = incoming_layer
+
+    # left branch
+    # NOTE(review): the filter count here uses a hard-coded factor of 4 rather than upscale_factor;
+    # the two only agree for the default upscale_factor=4.
+    if has_left_branch:
+        net_tmp, last_layer_name = build_simple_block(
+            incoming_layer, list(map(lambda s: s % (ix, 1, ''), simple_block_name_pattern)),
+            int(lasagne.layers.get_output_shape(incoming_layer)[1]*4*ratio_n_filter), 1, int(1.0/ratio_size), 0,
+            nonlin=None)
+        net.update(net_tmp)
+        left_tail = net[last_layer_name]
+
+    # Sum both branches, then apply the ReLU.
+    net['res%s' % ix] = ElemwiseSumLayer([left_tail, right_tail], coeffs=1)
+    net['res%s_relu' % ix] = NonlinearityLayer(net['res%s' % ix], nonlinearity=rectify, name = 'res%s_relu' % ix)
+
+    return net, 'res%s_relu' % ix
+
+
+def build_model(input_size):
+    """Builds the ResNet-style layer stack and returns it as a dictionary
+
+    The network consists of an input layer, an initial conv/BN/ReLU block
+    ('conv1'), a max pooling layer ('pool1'), four stages of residual blocks
+    (stage 2: a-c, stage 3: a-d, stage 4: a-f, stage 5: a-c) and a final
+    average pooling layer ('pool5'). The first block of every stage has a
+    left branch; in stages 3 to 5 that first block is also built with
+    ratio_size=1/2.
+
+    Parameters:
+    ----------
+    input_size : tuple
+        Shape passed to the InputLayer
+
+    Returns
+    -------
+    net : dict
+        Dictionary mapping layer names to Lasagne layers
+    """
+    net = {}
+    net['input'] = InputLayer(input_size)
+    # Initial block: 64 filters of size 7, stride 2, pad 3, with bias.
+    sub_net, parent_layer_name = build_simple_block(
+        net['input'], ['conv1', 'bn_conv1', 'conv1_relu'],
+        64, 7, 2, 3, use_bias=True)
+    net.update(sub_net)
+    net['pool1'] = PoolLayer(net[parent_layer_name], pool_size=3, stride=2, pad=0, mode='max', ignore_border=False)
+    # Stage 2: blocks 2a, 2b, 2c. Only block 'a' has a left branch.
+    block_size = list('abc')
+    parent_layer_name = 'pool1'
+    for c in block_size:
+        if c == 'a':
+            sub_net, parent_layer_name = build_residual_block(net[parent_layer_name], 1, 1, True, 4, ix='2%s' % c)
+        else:
+            sub_net, parent_layer_name = build_residual_block(net[parent_layer_name], 1.0/4, 1, False, 4, ix='2%s' % c)
+        net.update(sub_net)
+
+    # Stage 3: blocks 3a-3d. Block 'a' uses ratio_size=1/2, i.e. stride 2.
+    block_size = list('abcd')
+    for c in block_size:
+        if c == 'a':
+            sub_net, parent_layer_name = build_residual_block(
+                net[parent_layer_name], 1.0/2, 1.0/2, True, 4, ix='3%s' % c)
+        else:
+            sub_net, parent_layer_name = build_residual_block(net[parent_layer_name], 1.0/4, 1, False, 4, ix='3%s' % c)
+        net.update(sub_net)
+
+    # Stage 4: blocks 4a-4f, same pattern as stage 3.
+    block_size = list('abcdef')
+    for c in block_size:
+        if c == 'a':
+            sub_net, parent_layer_name = build_residual_block(
+                net[parent_layer_name], 1.0/2, 1.0/2, True, 4, ix='4%s' % c)
+        else:
+            sub_net, parent_layer_name = build_residual_block(net[parent_layer_name], 1.0/4, 1, False, 4, ix='4%s' % c)
+        net.update(sub_net)
+
+    # Stage 5: blocks 5a-5c, same pattern as stage 3.
+    block_size = list('abc')
+    for c in block_size:
+        if c == 'a':
+            sub_net, parent_layer_name = build_residual_block(
+                net[parent_layer_name], 1.0/2, 1.0/2, True, 4, ix='5%s' % c)
+        else:
+            sub_net, parent_layer_name = build_residual_block(net[parent_layer_name], 1.0/4, 1, False, 4, ix='5%s' % c)
+        net.update(sub_net)
+    # Final average pooling with a 7x7 window.
+    net['pool5'] = PoolLayer(net[parent_layer_name], pool_size=7, stride=1, pad=0,
+                             mode='average_exc_pad', ignore_border=False)
+
+    return net
+
+
+#Setup the original network for batches of 3-channel inputs of height 256 and width 128.
+resnet = build_model(input_size=(None, 3, 256,128))
+
+#Now we modify the network's final pooling layer and add 2 new layers at the end to predict the 128-dimensional embedding.
+#Different input size.
+inp = resnet['input']
+
+#The model is built with a 7x7 final pooling window; overwrite it for our input size.
+#NOTE(review): (8,4) presumably equals the spatial size of the last feature map for a 256x128 input - confirm.
+network_features = resnet['pool5']
+network_features.pool_size=(8,4)
+
+#New additional final layer: a bias-free 1024-unit dense layer with ReLU, wrapped in batch normalization.
+network = lasagne.layers.batch_norm(lasagne.layers.DenseLayer(
+        network_features,
+        num_units=1024,
+        nonlinearity=lasagne.nonlinearities.rectify,
+        W=lasagne.init.GlorotUniform('relu'),
+        b=None))
+
+#Linear output layer producing the 128-dimensional embedding.
+network_out = lasagne.layers.DenseLayer(
+        network,
+        num_units=128,
+        nonlinearity=None,
+        W=lasagne.init.Orthogonal())
+
+
+
+#Setup the function to predict the embeddings.
+#deterministic=True requests the test-time (deterministic) forward pass.
+predict_features = theano.function(
+            inputs=[inp.input_var],
+            outputs=lasagne.layers.get_output(network_out, deterministic=True))
+
+
+#Set the parameters
+#The weight file is read as arrays named 'arr_0', 'arr_1', ... and these are assigned, in index order,
+#to all parameters of the network ending in network_out.
+with np.load(weight_fname) as f:
+    param_values = [f['arr_%d' % i] for i in range(len(f.files))]
+    lasagne.layers.set_all_param_values(network_out, param_values)
+
+
+
+#We subtract the per-channel mean of the "mean image" as loaded from the original ResNet-50 weight dump.
+#For simplicity, we just hardcode it here.
+#NOTE(review): the channel order is presumably BGR, matching images loaded with cv2.imread - confirm.
+im_mean = np.asarray([103.0626238, 115.90288257, 123.15163084], dtype=np.float32)
+
+
+
+# a little helper function to create a test-time augmentation batch.
+def get_augmentation_batch(image, im_mean):
+    """Creates a batch of 10 augmented, mean-subtracted crops of one image
+
+    The image is resized to height 288 and width 144, five 256x128 crops are
+    taken (center and the four corners), and each crop is additionally
+    mirrored along its last (width) axis.
+
+    Parameters:
+    ----------
+    image : numpy array
+        Image with the channels on the last axis.
+        NOTE(review): presumably a 3-channel image as returned by cv2.imread - confirm.
+
+    im_mean : numpy array
+        Per-channel mean (3 values) that is subtracted from every crop
+
+    Returns
+    -------
+    batch : numpy array
+        Array of shape (10, 3, 256, 128): entries 0-4 are the crops,
+        entries 5-9 their mirrored versions, all with the mean subtracted
+    """
+    #Resize it correctly, as needed by the test time augmentation.
+    #cv2.resize takes the target size as (width, height), i.e. 144 wide and 288 high.
+    image = cv2.resize(image, (128+16, 256+32))
+
+    #Change into CHW format
+    image = np.rollaxis(image,2)
+
+    #Setup storage for the batch
+    batch = np.zeros((10,3,256,128), dtype=np.float32)
+
+    #Four corner crops and the center crop
+    #The resized image is 32 rows and 16 columns larger than a crop.
+    batch[0] = image[:,16:-16, 8:-8]    #Center crop
+    batch[1] = image[:,   :-32,   :-16] #Top left
+    batch[2] = image[:,   :-32, 16:]    #Top right
+    batch[3] = image[:, 32:,      :-16] #Bottom left
+    batch[4] = image[:, 32:,    16:]    #Bottom right
+
+    #Flipping
+    #Reverse the last (width) axis of the five crops to get their mirror images.
+    batch[5:] = batch[:5,:,:,::-1]
+
+    #Subtract the mean
+    #im_mean is broadcast along the channel axis.
+    batch = batch-im_mean[None,:,None,None]
+
+    return batch
+
+
+
+#Embed every listed image and print one comma-separated line per image: the filename, then the embedding values.
+for image_filename in image_list:
+    #Print the filename first and flush, so it is visible before the image is processed.
+    print(image_filename, end=",")
+    sys.stdout.flush()
+
+    #cv2.imread signals failure by returning None; turn that into an explicit error.
+    image = cv2.imread(image_filename)
+    if image is None:
+        raise ValueError("Couldn't load image {}".format(image_filename))
+
+    #Setup a batch of images and use the function to predict the embedding.
+    #The final embedding is the mean over the outputs for all augmented crops in the batch.
+    batch = get_augmentation_batch(image, im_mean)
+    embedding = np.mean(predict_features(batch), axis=0)
+    print(','.join(map(str, embedding)))