diff --git a/README.md b/README.md
index b0064510..74c6552c 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-# Hourglass and CPN model in TensorFlow for 2018-FashionAI Key Points Detection of Apparel at TianChi
+# Hourglass, DHN and CPN model in TensorFlow for 2018-FashionAI Key Points Detection of Apparel at TianChi
 
-This repository contains codes of the re-implementent of [Stacked Hourglass Networks for Human Pose Estimation](https://arxiv.org/abs/1603.06937) and [Cascaded Pyramid Network for Multi-Person Pose Estimation](https://arxiv.org/abs/1711.07319) in TensorFlow for [FashionAI Global Challenge 2018 - Key Points Detection of Apparel](https://tianchi.aliyun.com/competition/introduction.htm?spm=5176.11409106.5678.1.95b62e48Im9JVH&raceId=231648). The CPN(Cascaded Pyramid Network) here has several different backbones: ResNet50, SE-ResNet50, SE-ResNeXt50, [DetNet](https://arxiv.org/abs/1804.06215) or DetResNeXt50. I have also tried [Averaging Weights Leads to Wider Optima and Better Generalization](https://arxiv.org/abs/1803.05407) to ensemble models on the fly, although limited improvement was achieved.
+This repository contains code for the re-implementation of [Stacked Hourglass Networks for Human Pose Estimation](https://arxiv.org/abs/1603.06937), [Simple Baselines for Human Pose Estimation and Tracking (Deconvolution Head Network)](https://arxiv.org/abs/1804.06208) and [Cascaded Pyramid Network for Multi-Person Pose Estimation](https://arxiv.org/abs/1711.07319) in TensorFlow for [FashionAI Global Challenge 2018 - Key Points Detection of Apparel](https://tianchi.aliyun.com/competition/introduction.htm?spm=5176.11409106.5678.1.95b62e48Im9JVH&raceId=231648). Both the CPN (Cascaded Pyramid Network) and the DHN (Deconvolution Head Network) here have several different backbones: ResNet50, SE-ResNet50, SE-ResNeXt50, [DetNet](https://arxiv.org/abs/1804.06215) or DetResNeXt50. I have also tried [Averaging Weights Leads to Wider Optima and Better Generalization](https://arxiv.org/abs/1803.05407) to ensemble models on the fly, although limited improvement was achieved.
 
 The pre-trained models of backbone networks can be found here:
 
@@ -19,6 +19,7 @@ Almost all the codes was writen by myself and tested under TensorFlow 1.6, Pytho
 About the model:
 
 - DetNet is better, perform almost the same as SEResNeXt, while SEResNet showed little improvement than ResNet
+- DHN performs at least as well as CPN, but it has not been thoroughly tested due to limited time
 - Enforce the loss of invisible keypoints to zero gave better performance
 - OHKM is useful
 - It's bad to do gaussian blur on the predicted heatmap, but it's better to do gaussian blur on the target heatmaps for lower-level prediction
@@ -66,9 +67,9 @@ If you find it's useful to your research or competitions, any contribution or st
 	- train_2 -> fashionAI_key_points_test_a_20180227.tar
 	- train_3 -> fashionAI_key_points_test_b_20180418.tgz
 	- test_0  -> round2_fashionAI_key_points_test_a_20180426.tar
-	- test_1  -> round2_fashionAI_key_points_test_b_20180601.tar
+	- test_1  -> round2_fashionAI_key_points_test_b_20180530.zip.zip
 
-- set your local dataset path in [config.py](https://github.com/HiKapok/tf.fashionAI/blob/e90c5b0072338fa638c56ae788f7146d3f36cb1f/config.py#L20)
+- set your local dataset path in [config.py](https://github.com/HiKapok/tf.fashionAI/blob/e90c5b0072338fa638c56ae788f7146d3f36cb1f/config.py#L20), and then run convert_tfrecords.py to generate *.tfrecords
 - create one file foler named 'model' under the root path of your codes, download all the pre-trained weights of the backbone networks and put them into different sub-folders named 'resnet50', 'seresnet50' and 'seresnext50'. Then start training(set RECORDS_DATA_DIR and TEST_RECORDS_DATA_DIR according to your [config.py](https://github.com/HiKapok/tf.fashionAI/blob/e90c5b0072338fa638c56ae788f7146d3f36cb1f/config.py#L20)):
     ```sh
 	python train_detxt_cpn_onebyone.py --run_on_cloud=False --data_dir=RECORDS_DATA_DIR
diff --git a/config.py b/config.py
index 52ba098a..2a27258c 100644
--- a/config.py
+++ b/config.py
@@ -20,6 +20,7 @@
 DATA_DIR = '../Datasets'
 RECORDS_DATA_DIR = '../Datasets/tfrecords'
 TEST_RECORDS_DATA_DIR = '../Datasets/tfrecords_test'
+TEST_RECORDS_STAGE2 = '../Datasets/tfrecords_test_stage2'
 
 CATEGORIES = ['blouse', 'dress', 'outwear', 'skirt', 'trousers']
 SPLITS = ['test_0', 'train_1', 'train_2', 'train_3']#'train_0',
@@ -300,30 +301,31 @@
 # {'trousers': 10251, 'skirt': 11649, 'blouse': 11109, 'dress': 9002, 'outwear': 9586} 51597
 # warm-up {'trousers': 2795, 'skirt': 2292, 'blouse': 2997, 'dress': 2312, 'outwear': 2138} 12534
 # test_a {'trousers': 2631, 'skirt': 2683, 'blouse': 2586, 'dress': 2693, 'outwear': 2508} 13101
+# test_b {'outwear': 10906, 'trousers': 10618, 'dress': 11096, 'skirt': 11154, 'blouse': 10670} 54444
 split_size = {
             '*': {'train': 51597+12534,
                 'val': 0,
-                'test': 13101,
-                'test_a': 9970},
+                'test': 54444,
+                'test_a': 13101},
             'blouse': {'train': 11109+2997,
                 'val': 0,
-                'test': 2586,
-                'test_a': 1974},
+                'test': 10670,
+                'test_a': 2586},
             'dress': {'train': 9002+2312,
                 'val': 0,
-                'test': 2693,
-                'test_a': 2052},
+                'test': 11096,
+                'test_a': 2693},
             'outwear': {'train': 9586+2138,
                 'val': 0,
-                'test': 2508,
-                'test_a': 1947},
+                'test': 10906,
+                'test_a': 2508},
             'skirt': {'train': 11649+2292,
                 'val': 0,
-                'test': 2683,
-                'test_a': 2051},
+                'test': 11154,
+                'test_a': 2683},
             'trousers': {'train': 10251+2795,
                 'val': 0,
-                'test': 2631,
-                'test_a': 1946},
+                'test': 10618,
+                'test_a': 2631},
             }
 
diff --git a/convert_tfrecords.py b/convert_tfrecords.py
index d9aae33a..f54d0bd0 100644
--- a/convert_tfrecords.py
+++ b/convert_tfrecords.py
@@ -313,17 +313,24 @@ def count_split_examples(split_path, file_pattern=''):
 
 if __name__ == '__main__':
     np.random.seed(RANDOM_SEED)
-    #convert_test('../Datasets/tfrecords_test_stage1_b', splits=['test_stage1_b'])
-    os.mkdir(config.RECORDS_DATA_DIR)
-    convert_train(config.RECORDS_DATA_DIR, val_per=0.)
-    convert_train(config.RECORDS_DATA_DIR, val_per=0., all_splits=config.WARM_UP_SPLITS, file_idx_start=1000)
-    os.mkdir(config.TEST_RECORDS_DATA_DIR)
-    convert_test(config.TEST_RECORDS_DATA_DIR)
-    print('blouse', count_split_examples(config.RECORDS_DATA_DIR, file_pattern='blouse_0000_val')
-    , 'outwear', count_split_examples(config.RECORDS_DATA_DIR, file_pattern='outwear_0000_val')
-    , 'dress', count_split_examples(config.RECORDS_DATA_DIR, file_pattern='dress_0000_val')
-    , 'skirt', count_split_examples(config.RECORDS_DATA_DIR, file_pattern='skirt_0000_val')
-    , 'trousers', count_split_examples(config.RECORDS_DATA_DIR, file_pattern='trousers_0000_val')
-    , 'all', count_split_examples(config.RECORDS_DATA_DIR, file_pattern='val'))
+    convert_test(config.TEST_RECORDS_STAGE2, splits=['test_1'])
+    print('blouse', count_split_examples(config.TEST_RECORDS_STAGE2, file_pattern='blouse')
+    , 'outwear', count_split_examples(config.TEST_RECORDS_STAGE2, file_pattern='outwear')
+    , 'dress', count_split_examples(config.TEST_RECORDS_STAGE2, file_pattern='dress')
+    , 'skirt', count_split_examples(config.TEST_RECORDS_STAGE2, file_pattern='skirt')
+    , 'trousers', count_split_examples(config.TEST_RECORDS_STAGE2, file_pattern='trousers')
+    , 'all', count_split_examples(config.TEST_RECORDS_STAGE2, file_pattern='_'))
+
+    # os.mkdir(config.RECORDS_DATA_DIR)
+    # convert_train(config.RECORDS_DATA_DIR, val_per=0.)
+    # convert_train(config.RECORDS_DATA_DIR, val_per=0., all_splits=config.WARM_UP_SPLITS, file_idx_start=1000)
+    # os.mkdir(config.TEST_RECORDS_DATA_DIR)
+    # convert_test(config.TEST_RECORDS_DATA_DIR)
+    # print('blouse', count_split_examples(config.RECORDS_DATA_DIR, file_pattern='blouse_0000_val')
+    # , 'outwear', count_split_examples(config.RECORDS_DATA_DIR, file_pattern='outwear_0000_val')
+    # , 'dress', count_split_examples(config.RECORDS_DATA_DIR, file_pattern='dress_0000_val')
+    # , 'skirt', count_split_examples(config.RECORDS_DATA_DIR, file_pattern='skirt_0000_val')
+    # , 'trousers', count_split_examples(config.RECORDS_DATA_DIR, file_pattern='trousers_0000_val')
+    # , 'all', count_split_examples(config.RECORDS_DATA_DIR, file_pattern='val'))
     # test_dataset()
 
diff --git a/depth_conv2d.py b/depth_conv2d.py
new file mode 100644
index 00000000..f1e1fdcd
--- /dev/null
+++ b/depth_conv2d.py
@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# modified from tensorflow/contrib/layers/python/layers/layers.py
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.contrib.layers.python.layers import initializers
+from tensorflow.contrib.layers.python.layers import utils
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import variable_scope
+
+DATA_FORMAT_NCHW = 'NCHW'
+DATA_FORMAT_NHWC = 'NHWC'
+DATA_FORMAT_NCDHW = 'NCDHW'
+DATA_FORMAT_NDHWC = 'NDHWC'
+
+def _model_variable_getter(getter,
+                           name,
+                           shape=None,
+                           dtype=None,
+                           initializer=None,
+                           regularizer=None,
+                           trainable=True,
+                           collections=None,
+                           caching_device=None,
+                           partitioner=None,
+                           rename=None,
+                           use_resource=None,
+                           **_):  # any other keyword arguments are accepted and ignored
+  """Create the variable via model_variable, first renaming its last name component per `rename`."""
+  short_name = name.split('/')[-1]  # final '/'-separated component of the requested name
+  if rename and short_name in rename:  # `rename` is an optional {old_short_name: new_short_name} mapping
+    name_components = name.split('/')
+    name_components[-1] = rename[short_name]  # swap only the last component; the scope prefix is kept
+    name = '/'.join(name_components)
+  return variables.model_variable(
+      name,
+      shape=shape,
+      dtype=dtype,
+      initializer=initializer,
+      regularizer=regularizer,
+      collections=collections,
+      trainable=trainable,
+      caching_device=caching_device,
+      partitioner=partitioner,
+      custom_getter=getter,  # hand the incoming getter on to model_variable
+      use_resource=use_resource)
+
+
+def _build_variable_getter(rename=None):
+  """Return a custom getter that calls _model_variable_getter with the given `rename` mapping."""
+
+  # VariableScope will nest the getters
+  def layer_variable_getter(getter, *args, **kwargs):
+    kwargs['rename'] = rename  # inject the mapping captured from the enclosing call
+    return _model_variable_getter(getter, *args, **kwargs)
+
+  return layer_variable_getter
+
+def depth_conv2d(
+    inputs,
+    kernel_size,
+    stride=1,
+    channel_multiplier=1,
+    padding='SAME',
+    data_format=DATA_FORMAT_NHWC,
+    rate=1,
+    activation_fn=nn.relu,
+    normalizer_fn=None,
+    normalizer_params=None,
+    weights_initializer=initializers.xavier_initializer(),
+    weights_regularizer=None,
+    biases_initializer=init_ops.zeros_initializer(),
+    biases_regularizer=None,
+    reuse=None,
+    variables_collections=None,
+    outputs_collections=None,
+    trainable=True,
+    scope=None):
+    """Depthwise 2-D convolution followed by normalization (or biases) and activation.
+
+    No pointwise stage is applied: output channels = in_channels * channel_multiplier.
+
+    `kernel_size`, `stride` and `rate` each go through `utils.two_element_tuple`.
+    If `normalizer_fn` is given it is applied (with `normalizer_params`) and no
+    biases are created; otherwise biases are added unless `biases_initializer` is None.
+
+    Raises:
+      ValueError: if `data_format` is neither 'NCHW' nor 'NHWC'.
+    """
+    if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
+        raise ValueError('data_format has to be either NCHW or NHWC.')
+    layer_variable_getter = _build_variable_getter(
+        {'bias': 'biases', 'depthwise_kernel': 'depthwise_weights'})
+
+    with variable_scope.variable_scope(
+            scope,
+            'SeparableConv2d', [inputs],
+            reuse=reuse,
+            custom_getter=layer_variable_getter) as sc:
+        inputs = ops.convert_to_tensor(inputs)
+        df = 'channels_first' if data_format and data_format.startswith('NC') else 'channels_last'
+        # Actually apply depthwise conv instead of separable conv.
+        dtype = inputs.dtype.base_dtype
+        kernel_h, kernel_w = utils.two_element_tuple(kernel_size)
+        stride_h, stride_w = utils.two_element_tuple(stride)
+        num_filters_in = utils.channel_dimension(inputs.get_shape(), df, min_rank=4)
+        weights_collections = utils.get_variable_collections(variables_collections, 'weights')
+
+        depthwise_shape = [kernel_h, kernel_w, num_filters_in, channel_multiplier]
+        depthwise_weights = variables.model_variable(
+            'depthwise_weights',
+            shape=depthwise_shape,
+            dtype=dtype,
+            initializer=weights_initializer,
+            regularizer=weights_regularizer,
+            trainable=trainable,
+            collections=weights_collections)
+        strides = [1, 1, stride_h, stride_w] if data_format.startswith('NC') else [1, stride_h, stride_w, 1]
+
+        outputs = nn.depthwise_conv2d(
+            inputs,
+            depthwise_weights,
+            strides,
+            padding,
+            rate=utils.two_element_tuple(rate),
+            data_format=data_format)
+        # The bias must match the conv output width, which scales with channel_multiplier.
+        num_outputs = num_filters_in * channel_multiplier
+
+        if normalizer_fn is not None:
+            normalizer_params = normalizer_params or {}
+            outputs = normalizer_fn(outputs, **normalizer_params)
+        elif biases_initializer is not None:
+            biases_collections = utils.get_variable_collections(variables_collections, 'biases')
+            biases = variables.model_variable(
+                'biases',
+                shape=[num_outputs],
+                dtype=dtype,
+                initializer=biases_initializer,
+                regularizer=biases_regularizer,
+                trainable=trainable,
+                collections=biases_collections)
+            outputs = nn.bias_add(outputs, biases, data_format=data_format)
+
+        if activation_fn is not None:
+            outputs = activation_fn(outputs)
+        return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
diff --git a/eval_all_cpn_onepass.py b/eval_all_cpn_onepass.py
index 8ed922fe..12345c0f 100644
--- a/eval_all_cpn_onepass.py
+++ b/eval_all_cpn_onepass.py
@@ -17,6 +17,7 @@
 from __future__ import print_function
 
 import os
+import time
 import sys
 import numpy as np
 import pandas as pd
@@ -27,6 +28,7 @@
 from net import detxt_cpn
 from net import seresnet_cpn
 from net import cpn
+from net import simple_xt
 
 from utility import train_helper
 
@@ -48,7 +50,7 @@
     'gpu_memory_fraction', 1., 'GPU memory fraction to use.')
 # scaffold related configuration
 tf.app.flags.DEFINE_string(
-    'data_dir', '../Datasets/tfrecords_test',#tfrecords_test tfrecords_test_stage1_b
+    'data_dir', '../Datasets/tfrecords_test_stage2',#tfrecords_test tfrecords_test_stage1_b tfrecords_test_stage2
     'The directory where the dataset input data is stored.')
 tf.app.flags.DEFINE_string(
     'dataset_name', '{}_*.tfrecord', 'The pattern of the dataset name to load.')
@@ -97,7 +99,7 @@
     'model_scope', 'blouse',
     'Model scope name used to replace the name_scope in checkpoint.')
 tf.app.flags.DEFINE_boolean(
-    'run_on_cloud', True,
+    'run_on_cloud', False,
     'Wether we will train on cloud.')
 tf.app.flags.DEFINE_string(
     'model_to_eval', 'blouse, dress, outwear, skirt, trousers', #'all, blouse, dress, outwear, skirt, trousers', 'skirt, dress, outwear, trousers',
@@ -106,6 +108,7 @@
 #--model_scope=blouse --checkpoint_path=./logs/blouse
 FLAGS = tf.app.flags.FLAGS
 
+#print(FLAGS.data_dir)
 all_models = {
   'resnet50_cpn': {'backbone': cpn.cascaded_pyramid_net, 'logs_sub_dir': 'logs_cpn'},
   'detnet50_cpn': {'backbone': detnet_cpn.cascaded_pyramid_net, 'logs_sub_dir': 'logs_detnet_cpn'},
@@ -116,6 +119,8 @@
                         'logs_sub_dir': 'logs_large_sext_cpn'},
   'large_detnext_cpn': {'backbone': lambda inputs, output_channals, heatmap_size, istraining, data_format : detxt_cpn.cascaded_pyramid_net(inputs, output_channals, heatmap_size, istraining, data_format, net_depth=101),
                         'logs_sub_dir': 'logs_large_detxt_cpn'},
+  'simple_net': {'backbone': lambda inputs, output_channals, heatmap_size, istraining, data_format : simple_xt.simple_net(inputs, output_channals, heatmap_size, istraining, data_format, net_depth=101),
+                        'logs_sub_dir': 'logs_simple_net'},
   'head_seresnext50_cpn': {'backbone': seresnet_cpn.head_xt_cascaded_pyramid_net, 'logs_sub_dir': 'logs_head_sext_cpn'},
 }
 
@@ -443,10 +448,12 @@ def main(_):
     for m in model_to_eval[1:]:
         if m == '': continue
         df_list.append(pd.read_csv('./{}_{}.csv'.format(FLAGS.backbone.strip(), m), encoding='utf-8'))
-    pd.concat(df_list, ignore_index=True).to_csv('./{}_sub.csv'.format(FLAGS.backbone.strip()), encoding='utf-8', index=False)
+
+    time_stamps = int(time.time())
+    pd.concat(df_list, ignore_index=True).to_csv('./{}_sub_{}.csv'.format(FLAGS.backbone.strip(), time_stamps), encoding='utf-8', index=False)
 
     if FLAGS.run_on_cloud:
-        tf.gfile.Copy('./{}_sub.csv'.format(FLAGS.backbone.strip()), os.path.join(full_model_dir, '{}_sub.csv'.format(FLAGS.backbone.strip())), overwrite=True)
+        tf.gfile.Copy('./{}_sub_{}.csv'.format(FLAGS.backbone.strip(), time_stamps), os.path.join(full_model_dir, '{}_sub_{}.csv'.format(FLAGS.backbone.strip(), time_stamps)), overwrite=True)
 
 if __name__ == '__main__':
   tf.logging.set_verbosity(tf.logging.INFO)
diff --git a/eval_cpn.py b/eval_all_cpn_simple.py
similarity index 61%
rename from eval_cpn.py
rename to eval_all_cpn_simple.py
index d2948bd5..30059426 100644
--- a/eval_cpn.py
+++ b/eval_all_cpn_simple.py
@@ -23,7 +23,12 @@
 #from scipy.misc import imread, imsave, imshow, imresize
 import tensorflow as tf
 
-from net import cpn as cpn
+from net import detnet_cpn
+from net import detxt_cpn
+from net import seresnet_cpn
+from net import cpn
+from net import simple_xt
+
 from utility import train_helper
 
 from preprocessing import preprocessing
@@ -44,13 +49,16 @@
     'gpu_memory_fraction', 1., 'GPU memory fraction to use.')
 # scaffold related configuration
 tf.app.flags.DEFINE_string(
-    'data_dir', '../Datasets/tfrecords_test',
+    'data_dir', '../Datasets/tfrecords_test_stage2',#tfrecords_test tfrecords_test_stage1_b tfrecords_test_stage2
     'The directory where the dataset input data is stored.')
 tf.app.flags.DEFINE_string(
     'dataset_name', '{}_*.tfrecord', 'The pattern of the dataset name to load.')
 tf.app.flags.DEFINE_string(
-    'model_dir', './logs_cpn/',
+    'model_dir', '.',
     'The parent directory where the model will be stored.')
+tf.app.flags.DEFINE_string(
+    'backbone', 'detnet50_cpn',
+    'The backbone network to use for feature extraction.')
 tf.app.flags.DEFINE_integer(
     'log_every_n_steps', 10,
     'The frequency with which logs are print.')
@@ -82,18 +90,15 @@
 tf.app.flags.DEFINE_string(
     'checkpoint_path', None,
     'The path to a checkpoint from which to fine-tune.')
-tf.app.flags.DEFINE_string(
-    'coarse_pred_path', None,
-    'The path to a pred csv file from which to crop the input image for finer prediction.')
 tf.app.flags.DEFINE_boolean(
-    'flip_on_test', False,
+    'flip_on_test', True,
     'Wether we will average predictions of left-right fliped image.')
 tf.app.flags.DEFINE_string(
     #'blouse', 'dress', 'outwear', 'skirt', 'trousers', 'all'
     'model_scope', 'blouse',
     'Model scope name used to replace the name_scope in checkpoint.')
 tf.app.flags.DEFINE_boolean(
-    'run_on_cloud', True,
+    'run_on_cloud', False,
     'Wether we will train on cloud.')
 tf.app.flags.DEFINE_string(
     'model_to_eval', 'blouse, dress, outwear, skirt, trousers', #'all, blouse, dress, outwear, skirt, trousers', 'skirt, dress, outwear, trousers',
@@ -102,53 +107,25 @@
 #--model_scope=blouse --checkpoint_path=./logs/blouse
 FLAGS = tf.app.flags.FLAGS
 
-def preprocessing_fn(org_image, file_name, shape):
-  pd_df = None
-  if FLAGS.coarse_pred_path is not None:
-    if tf.gfile.Exists(FLAGS.coarse_pred_path):
-      tf.logging.info('Finetuning Prediction From {}.'.format(FLAGS.coarse_pred_path))
-      tf.gfile.Copy(FLAGS.coarse_pred_path, './__coarse_pred.csv', overwrite=True)
-      pd_df = pd.read_csv('./__coarse_pred.csv', encoding='utf-8')
-
-      all_filenames = []
-      all_xmin = []
-      all_ymin = []
-      all_xmax = []
-      all_ymax = []
-
-      all_values = pd_df.values.tolist()
-      for records in all_values:
-        all_filenames.append(records[0].encode('utf8'))
-        xmin = 2000
-        ymin = 2000
-        xmax = -1
-        ymax = -1
-        for kp in records[2:]:
-          keypoint_info = kp.strip().split('_')
-          if int(keypoint_info[2]) == -1:
-            continue
-          xmin = min(xmin, int(keypoint_info[0]))
-          ymin = min(ymin, int(keypoint_info[1]))
-          xmax = max(xmax, int(keypoint_info[0]))
-          ymax = max(ymax, int(keypoint_info[1]))
-        all_xmin.append(xmin)
-        all_ymin.append(ymin)
-        all_xmax.append(xmax)
-        all_ymax.append(ymax)
-      #print(all_filenames, all_xmin, all_ymin, all_xmax, all_ymax)
-      xmin_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(all_filenames, dtype=tf.string), tf.constant(all_xmin, dtype=tf.int64)), -1)
-      ymin_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(all_filenames, dtype=tf.string), tf.constant(all_ymin, dtype=tf.int64)), -1)
-      xmax_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(all_filenames, dtype=tf.string), tf.constant(all_xmax, dtype=tf.int64)), -1)
-      ymax_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(all_filenames, dtype=tf.string), tf.constant(all_ymax, dtype=tf.int64)), -1)
-      pd_df = [xmin_table, ymin_table, xmax_table, ymax_table]
-  #pred_item['file_name'].encode('utf8')
-
-  #lnorm_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(config.global_norm_key, dtype=tf.int64), tf.constant(config.global_norm_lvalues, dtype=tf.int64)), 0)
-  return preprocessing.preprocess_for_test(org_image, file_name, shape, FLAGS.train_image_size, FLAGS.train_image_size, data_format=('NCHW' if FLAGS.data_format=='channels_first' else 'NHWC'), bbox_border=FLAGS.bbox_border, heatmap_sigma=FLAGS.heatmap_sigma, heatmap_size=FLAGS.heatmap_size, pred_df=pd_df)
+all_models = {
+  'resnet50_cpn': {'backbone': cpn.cascaded_pyramid_net, 'logs_sub_dir': 'logs_cpn'},
+  'detnet50_cpn': {'backbone': detnet_cpn.cascaded_pyramid_net, 'logs_sub_dir': 'logs_detnet_cpn'},
+  'seresnet50_cpn': {'backbone': seresnet_cpn.cascaded_pyramid_net, 'logs_sub_dir': 'logs_se_cpn'},
+  'seresnext50_cpn': {'backbone': seresnet_cpn.xt_cascaded_pyramid_net, 'logs_sub_dir': 'logs_sext_cpn'},
+  'detnext50_cpn': {'backbone': detxt_cpn.cascaded_pyramid_net, 'logs_sub_dir': 'logs_detxt_cpn'},
+  'large_seresnext_cpn': {'backbone': lambda inputs, output_channals, heatmap_size, istraining, data_format : seresnet_cpn.xt_cascaded_pyramid_net(inputs, output_channals, heatmap_size, istraining, data_format, net_depth=101),
+                        'logs_sub_dir': 'logs_large_sext_cpn'},
+  'large_detnext_cpn': {'backbone': lambda inputs, output_channals, heatmap_size, istraining, data_format : detxt_cpn.cascaded_pyramid_net(inputs, output_channals, heatmap_size, istraining, data_format, net_depth=101),
+                        'logs_sub_dir': 'logs_large_detxt_cpn'},
+  'simple_net': {'backbone': lambda inputs, output_channals, heatmap_size, istraining, data_format : simple_xt.simple_net(inputs, output_channals, heatmap_size, istraining, data_format, net_depth=101),
+                        'logs_sub_dir': 'logs_simple_net'},
+  'head_seresnext50_cpn': {'backbone': seresnet_cpn.head_xt_cascaded_pyramid_net, 'logs_sub_dir': 'logs_head_sext_cpn'},
+}
+
 def input_pipeline(model_scope=FLAGS.model_scope):
-    #preprocessing_fn = lambda org_image, shape: preprocessing.preprocess_for_test(org_image, shape, FLAGS.train_image_size, FLAGS.train_image_size, data_format=('NCHW' if FLAGS.data_format=='channels_first' else 'NHWC'), bbox_border=FLAGS.bbox_border, heatmap_sigma=FLAGS.heatmap_sigma, heatmap_size=FLAGS.heatmap_size)
+    preprocessing_fn = lambda org_image, file_name, shape: preprocessing.preprocess_for_test_raw_output(org_image, file_name, shape, FLAGS.train_image_size, FLAGS.train_image_size, data_format=('NCHW' if FLAGS.data_format=='channels_first' else 'NHWC'), bbox_border=FLAGS.bbox_border, heatmap_sigma=FLAGS.heatmap_sigma, heatmap_size=FLAGS.heatmap_size)
 
-    images, shape, file_name, classid, offsets = dataset.slim_test_get_split(FLAGS.data_dir, preprocessing_fn, FLAGS.num_readers, FLAGS.num_preprocessing_threads, file_pattern=FLAGS.dataset_name, category=(model_scope if 'all' not in model_scope else '*'), reader=None)
+    images, shape, file_name, classid, offsets = dataset.slim_test_get_split(FLAGS.data_dir, None, FLAGS.num_readers, FLAGS.num_preprocessing_threads, file_pattern=FLAGS.dataset_name, category=(model_scope if 'all' not in model_scope else '*'), reader=None, dynamic_pad=True)
 
     return {'images': images, 'shape': shape, 'classid': classid, 'file_name': file_name, 'pred_offsets': offsets}
 
@@ -190,33 +167,15 @@ def save_image_with_heatmap(image, height, width, heatmap_size, heatmap, predict
         imsave(os.path.join(config.EVAL_DEBUG_DIR, file_name), img.astype(np.uint8))
       return save_image_with_heatmap.counter
 
-def gaussian_blur(inputs, inputs_filters, sigma, data_format, name=None):
-    with tf.name_scope(name, "gaussian_blur", [inputs]):
-        data_format_ = 'NHWC' if data_format=='channels_last' else 'NCHW'
-        if data_format_ == 'NHWC':
-            inputs = tf.transpose(inputs, [0, 2, 3, 1])
-        ksize = int(6 * sigma + 1.)
-        x = tf.expand_dims(tf.range(ksize, delta=1, dtype=tf.float32), axis=1)
-        y = tf.transpose(x, [1, 0])
-        kernel_matrix = tf.exp(- ((x - ksize/2.) ** 2 + (y - ksize/2.) ** 2) / (2 * sigma ** 2))
-        #print(kernel_matrix)
-        kernel_filter = tf.reshape(kernel_matrix, [ksize, ksize, 1, 1])
-        kernel_filter = tf.tile(kernel_filter, [1, 1, inputs_filters, 1])
-        #kernel_filter = tf.transpose(kernel_filter, [1, 0, 2, 3])
-        outputs = tf.nn.depthwise_conv2d(inputs, kernel_filter, strides=[1, 1, 1, 1], padding='SAME', data_format=data_format_, name='blur')
-        if data_format_ == 'NHWC':
-            outputs = tf.transpose(outputs, [0, 3, 1, 2])
-        return outputs
-
-def get_keypoint(image, predictions, heatmap_size, height, width, category, clip_at_zero=True, data_format='channels_last', name=None):
+def get_keypoint(image, predictions, heatmap_size, height, width, category, clip_at_zero=False, data_format='channels_last', name=None):
     # expand_border = 10
-
     # pad_pred = tf.pad(predictions, tf.constant([[0, 0], [0, 0], [expand_border, expand_border], [expand_border, expand_border]]),
     #               mode='CONSTANT', name='pred_padding', constant_values=0)
 
     # blur_pred = gaussian_blur(pad_pred, config.class_num_joints[category], 3.5, 'channels_first', 'pred_blur')
 
     # predictions = tf.slice(blur_pred, [0, 0, expand_border, expand_border], [1, config.class_num_joints[category], heatmap_size, heatmap_size])
+
     predictions = tf.reshape(predictions, [1, -1, heatmap_size*heatmap_size])
 
     pred_max = tf.reduce_max(predictions, axis=-1)
@@ -267,91 +226,65 @@ def get_keypoint(image, predictions, heatmap_size, height, width, category, clip
         pred_x, pred_y = pred_x * 1., pred_y * 1.
     return pred_x, pred_y
 
-def get_keypoint_v0(image, predictions, heatmap_size, height, width, category, clip_at_zero=True, data_format='channels_last', name=None):
-    predictions = tf.reshape(predictions, [1, -1, heatmap_size*heatmap_size])
-
-    pred_max = tf.reduce_max(predictions, axis=-1)
-    pred_indices = tf.argmax(predictions, axis=-1)
-    pred_x, pred_y = tf.cast(tf.floormod(pred_indices, heatmap_size), tf.float32), tf.cast(tf.floordiv(pred_indices, heatmap_size), tf.float32)
-
-    width, height = tf.cast(width, tf.float32), tf.cast(height, tf.float32)
-    pred_x, pred_y = pred_x * width / tf.cast(heatmap_size, tf.float32), pred_y * height / tf.cast(heatmap_size, tf.float32)
-
-    if clip_at_zero:
-      pred_x, pred_y =  pred_x * tf.cast(pred_max>0, tf.float32), pred_y * tf.cast(pred_max>0, tf.float32)
-      pred_x = pred_x * tf.cast(pred_max>0, tf.float32) + tf.cast(pred_max<=0, tf.float32) * (width / 2.)
-      pred_y = pred_y * tf.cast(pred_max>0, tf.float32) + tf.cast(pred_max<=0, tf.float32) * (height / 2.)
-
-    if config.PRED_DEBUG:
-      pred_indices_ = tf.squeeze(pred_indices)
-      image_ = tf.squeeze(image) * 255.
-      pred_heatmap = tf.one_hot(pred_indices_, heatmap_size*heatmap_size, on_value=255, off_value=0, axis=-1, dtype=tf.int32)
-
-      pred_heatmap = tf.reshape(pred_heatmap, [-1, heatmap_size, heatmap_size])
-      if data_format == 'channels_first':
-        image_ = tf.transpose(image_, perm=(1, 2, 0))
-      save_image_op = tf.py_func(save_image_with_heatmap,
-                                  [image_, height, width,
-                                  heatmap_size,
-                                  pred_heatmap,
-                                  tf.reshape(predictions, [-1, heatmap_size, heatmap_size]),
-                                  config.left_right_group_map[category][0],
-                                  config.left_right_group_map[category][1],
-                                  config.left_right_group_map[category][2]],
-                                  tf.int64, stateful=True)
-      with tf.control_dependencies([save_image_op]):
-        pred_x, pred_y = pred_x * 1., pred_y * 1.
-    return pred_x, pred_y
+backbone_ = all_models[FLAGS.backbone.strip()]['backbone']
 
 def keypoint_model_fn(features, labels, mode, params):
     #print(features)
     shape = features['shape']
     classid = features['classid']
-    pred_offsets = tf.to_float(features['pred_offsets'])
     file_name = features['file_name']
     features = features['images']
 
     file_name = tf.identity(file_name, name='current_file')
 
+    image = preprocessing.preprocess_for_test_raw_output(features, params['train_image_size'], params['train_image_size'], data_format=('NCHW' if FLAGS.data_format=='channels_first' else 'NHWC'), scope='first_stage')
+
     if not params['flip_on_test']:
-        with tf.variable_scope(params['model_scope'], default_name=None, values=[features], reuse=tf.AUTO_REUSE):
-            pred_outputs = cpn.cascaded_pyramid_net(features, config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')], params['heatmap_size'], (mode == tf.estimator.ModeKeys.TRAIN), params['data_format'])
+        with tf.variable_scope(params['model_scope'], default_name=None, values=[image], reuse=tf.AUTO_REUSE):
+            pred_outputs = backbone_(image, config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')], params['heatmap_size'], (mode == tf.estimator.ModeKeys.TRAIN), params['data_format'])
         if params['data_format'] == 'channels_last':
             pred_outputs = [tf.transpose(pred_outputs[ind], [0, 3, 1, 2], name='outputs_trans_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
+
+        pred_x, pred_y = get_keypoint(image, pred_outputs[-1], params['heatmap_size'], shape[0][0], shape[0][1], (params['model_scope'] if 'all' not in params['model_scope'] else '*'), clip_at_zero=False, data_format=params['data_format'])
     else:
         # test augumentation on the fly
         if params['data_format'] == 'channels_last':
-            double_features = tf.reshape(tf.stack([features, tf.map_fn(tf.image.flip_left_right, features, back_prop=False)], axis = 1), [-1, params['train_image_size'], params['train_image_size'], 3])
+            double_features = tf.reshape(tf.stack([image, tf.map_fn(tf.image.flip_left_right, image, back_prop=False)], axis = 1), [-1, params['train_image_size'], params['train_image_size'], 3])
         else:
-            double_features = tf.reshape(tf.stack([features, tf.transpose(tf.map_fn(tf.image.flip_left_right, tf.transpose(features, [0, 2, 3, 1], name='nchw2nhwc'), back_prop=False), [0, 3, 1, 2], name='nhwc2nchw')], axis = 1), [-1, 3, params['train_image_size'], params['train_image_size']])
+            double_features = tf.reshape(tf.stack([image, tf.transpose(tf.map_fn(tf.image.flip_left_right, tf.transpose(image, [0, 2, 3, 1], name='nchw2nhwc'), back_prop=False), [0, 3, 1, 2], name='nhwc2nchw')], axis = 1), [-1, 3, params['train_image_size'], params['train_image_size']])
 
         num_joints = config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')]
         with tf.variable_scope(params['model_scope'], default_name=None, values=[double_features], reuse=tf.AUTO_REUSE):
-            pred_outputs = cpn.cascaded_pyramid_net(double_features, config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')], params['heatmap_size'], (mode == tf.estimator.ModeKeys.TRAIN), params['data_format'])
+            pred_outputs = backbone_(double_features, config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')], params['heatmap_size'], (mode == tf.estimator.ModeKeys.TRAIN), params['data_format'])
 
         if params['data_format'] == 'channels_last':
             pred_outputs = [tf.transpose(pred_outputs[ind], [0, 3, 1, 2], name='outputs_trans_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
-        # [[0, 0, 0, ..], [1, 1, 1, ...], ...]
-        row_indices = tf.tile(tf.reshape(tf.range(tf.shape(double_features)[0]), [-1, 1]), [1, num_joints])
-        # [[0, 1, 2, ...], [1, 0, 2, ...], [0, 1, 2], [1, 0, 2], ...]
-        col_indices = tf.reshape(tf.tile(tf.reshape(tf.stack([tf.range(num_joints), tf.constant(config.left_right_remap[(params['model_scope'] if 'all' not in params['model_scope'] else '*')])], axis=0), [-1]), [tf.shape(features)[0]]), [-1, num_joints])
-        # [[[0, 0], [0, 1], [0, 2], ...], [[1, 1], [1, 0], [1, 2], ...], [[2, 0], [2, 1], [2, 2], ...], ...]
+        row_indices = tf.tile(tf.reshape(tf.stack([tf.range(0, tf.shape(double_features)[0], delta=2), tf.range(1, tf.shape(double_features)[0], delta=2)], axis=0), [-1, 1]), [1, num_joints])
+        col_indices = tf.reshape(tf.tile(tf.reshape(tf.stack([tf.range(num_joints), tf.constant(config.left_right_remap[(params['model_scope'] if 'all' not in params['model_scope'] else '*')])], axis=0), [2, -1]), [1, tf.shape(features)[0]]), [-1, num_joints])
         flip_indices=tf.stack([row_indices, col_indices], axis=-1)
 
         #flip_indices = tf.Print(flip_indices, [flip_indices], summarize=500)
         pred_outputs = [tf.gather_nd(pred_outputs[ind], flip_indices, name='gather_nd_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
 
         def cond_flip(heatmap_ind):
-            return tf.cond(heatmap_ind[1] < 1, lambda : heatmap_ind[0], lambda : tf.transpose(tf.image.flip_left_right(tf.transpose(heatmap_ind[0], [1, 2, 0], name='pred_nchw2nhwc')), [2, 0, 1], name='pred_nhwc2nchw'))
+            return tf.cond(heatmap_ind[1] < tf.shape(features)[0], lambda : heatmap_ind[0], lambda : tf.transpose(tf.image.flip_left_right(tf.transpose(heatmap_ind[0], [1, 2, 0], name='pred_nchw2nhwc')), [2, 0, 1], name='pred_nhwc2nchw'))
         # all the heatmap of the fliped image should also be fliped back
-        pred_outputs = [tf.map_fn(cond_flip, [pred_outputs[ind], tf.tile(tf.reshape(tf.range(2), [-1]), [tf.shape(features)[0]])], dtype=tf.float32, parallel_iterations=10, back_prop=True, swap_memory=False, infer_shape=True, name='map_fn_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
-        # average predictions of left_reight_fliped image
-        segment_indices = tf.reshape(tf.tile(tf.reshape(tf.range(tf.shape(features)[0]), [-1, 1]), [1, 2]), [-1])
-        pred_outputs = [tf.segment_mean(pred_outputs[ind], segment_indices, name='segment_mean_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
+        pred_outputs = [tf.map_fn(cond_flip, [pred_outputs[ind], tf.range(tf.shape(double_features)[0])], dtype=tf.float32, parallel_iterations=10, back_prop=True, swap_memory=False, infer_shape=True, name='map_fn_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
+        pred_outputs = [tf.split(_, 2) for _ in pred_outputs]
+        pred_outputs_1 = [_[0] for _ in pred_outputs]
+        pred_outputs_2 = [_[1] for _ in pred_outputs]
+        pred_x_first_stage1, pred_y_first_stage1 = get_keypoint(image, pred_outputs_1[-1], params['heatmap_size'], shape[0][0], shape[0][1], (params['model_scope'] if 'all' not in params['model_scope'] else '*'), clip_at_zero=False, data_format=params['data_format'])
+        pred_x_first_stage2, pred_y_first_stage2 = get_keypoint(image, pred_outputs_2[-1], params['heatmap_size'], shape[0][0], shape[0][1], (params['model_scope'] if 'all' not in params['model_scope'] else '*'), clip_at_zero=False, data_format=params['data_format'])
+
+        dist = tf.pow(tf.pow(pred_x_first_stage1 - pred_x_first_stage2, 2.) + tf.pow(pred_y_first_stage1 - pred_y_first_stage2, 2.), .5)
 
-    pred_x, pred_y = get_keypoint(features, pred_outputs[-1], params['heatmap_size'], shape[0][0], shape[0][1], (params['model_scope'] if 'all' not in params['model_scope'] else '*'), clip_at_zero=True, data_format=params['data_format'])
+        pred_x = tf.where(dist < 1e-3, pred_x_first_stage1, pred_x_first_stage1 + (pred_x_first_stage2 - pred_x_first_stage1) * 0.25 / dist)
+        pred_y = tf.where(dist < 1e-3, pred_y_first_stage1, pred_y_first_stage1 + (pred_y_first_stage2 - pred_y_first_stage1) * 0.25 / dist)
 
-    predictions = {'pred_x': pred_x + pred_offsets[:, 0], 'pred_y': pred_y + pred_offsets[:, 1], 'file_name': file_name}
+    # for var in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):#TRAINABLE_VARIABLES):
+    #   print(var.op.name)
+
+    predictions = {'pred_x': pred_x, 'pred_y': pred_y, 'file_name': file_name}
 
     if mode == tf.estimator.ModeKeys.PREDICT:
         return tf.estimator.EstimatorSpec(
@@ -404,9 +337,11 @@ def main(_):
                                         session_config=sess_config)
 
     model_to_eval = [s.strip() for s in FLAGS.model_to_eval.split(',')]
+
+    full_model_dir = os.path.join(FLAGS.model_dir, all_models[FLAGS.backbone.strip()]['logs_sub_dir'])
     for m in model_to_eval:
         if m == '': continue
-        pred_results = eval_each(keypoint_model_fn, os.path.join(FLAGS.model_dir, m), m, run_config)
+        pred_results = eval_each(keypoint_model_fn, os.path.join(full_model_dir, m), m, run_config)
         #print(pred_results)
         # collect result
         df = pd.DataFrame(columns=['image_id', 'image_category'] + config.all_keys)
@@ -427,17 +362,17 @@ def main(_):
             #Images/blouse/ab669925e96490ec698af976586f0b2f.jpg
             df.loc[cur_record] = [filename, m] + temp_list
             cur_record = cur_record + 1
-        df.to_csv('./{}.csv'.format(m), encoding='utf-8', index=False)
+        df.to_csv('./{}_{}.csv'.format(FLAGS.backbone.strip(), m), encoding='utf-8', index=False)
 
     # merge dataframe
-    df_list = [pd.read_csv('./{}.csv'.format(model_to_eval[0]), encoding='utf-8')]
+    df_list = [pd.read_csv('./{}_{}.csv'.format(FLAGS.backbone.strip(), model_to_eval[0]), encoding='utf-8')]
     for m in model_to_eval[1:]:
         if m == '': continue
-        df_list.append(pd.read_csv('./{}.csv'.format(m), encoding='utf-8'))
-    pd.concat(df_list, ignore_index=True).to_csv('./sub.csv', encoding='utf-8', index=False)
+        df_list.append(pd.read_csv('./{}_{}.csv'.format(FLAGS.backbone.strip(), m), encoding='utf-8'))
+    pd.concat(df_list, ignore_index=True).to_csv('./{}_sub.csv'.format(FLAGS.backbone.strip()), encoding='utf-8', index=False)
 
     if FLAGS.run_on_cloud:
-        tf.gfile.Copy('./sub.csv', os.path.join(FLAGS.model_dir, 'sub.csv'), overwrite=True)
+        tf.gfile.Copy('./{}_sub.csv'.format(FLAGS.backbone.strip()), os.path.join(full_model_dir, '{}_sub.csv'.format(FLAGS.backbone.strip())), overwrite=True)
 
 if __name__ == '__main__':
   tf.logging.set_verbosity(tf.logging.INFO)
diff --git a/eval_detnet_cpn.py b/eval_detnet_cpn.py
deleted file mode 100644
index 9f42e688..00000000
--- a/eval_detnet_cpn.py
+++ /dev/null
@@ -1,445 +0,0 @@
-# Copyright 2018 Changan Wang
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-#     http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import sys
-import numpy as np
-import pandas as pd
-#from scipy.misc import imread, imsave, imshow, imresize
-import tensorflow as tf
-
-from net import detnet_cpn as cpn
-from utility import train_helper
-
-from preprocessing import preprocessing
-from preprocessing import dataset
-import config
-#--num_readers=2 --num_preprocessing_threads=2 --data_dir=/media/disk/keypoint/tfrecords --model_to_train=all, blouse
-# hardware related configuration
-tf.app.flags.DEFINE_integer(
-    'num_readers', 16,
-    'The number of parallel readers that read data from the dataset.')
-tf.app.flags.DEFINE_integer(
-    'num_preprocessing_threads', 48,
-    'The number of threads used to create the batches.')
-tf.app.flags.DEFINE_integer(
-    'num_cpu_threads', 0,
-    'The number of cpu cores used to train.')
-tf.app.flags.DEFINE_float(
-    'gpu_memory_fraction', 1., 'GPU memory fraction to use.')
-# scaffold related configuration
-tf.app.flags.DEFINE_string(
-    'data_dir', '../Datasets/tfrecords_test',#tfrecords_test tfrecords_test_stage1_b
-    'The directory where the dataset input data is stored.')
-tf.app.flags.DEFINE_string(
-    'dataset_name', '{}_*.tfrecord', 'The pattern of the dataset name to load.')
-tf.app.flags.DEFINE_string(
-    'model_dir', './logs_detnet_cpn/',
-    'The parent directory where the model will be stored.')
-tf.app.flags.DEFINE_integer(
-    'log_every_n_steps', 10,
-    'The frequency with which logs are print.')
-tf.app.flags.DEFINE_integer(
-    'save_summary_steps', 100,
-    'The frequency with which summaries are saved, in seconds.')
-# model related configuration
-tf.app.flags.DEFINE_integer(
-    'train_image_size', 384,
-    'The size of the input image for the model to use.')
-tf.app.flags.DEFINE_integer(
-    'heatmap_size', 96,
-    'The size of the output heatmap of the model.')
-tf.app.flags.DEFINE_float(
-    'heatmap_sigma', 1.,
-    'The sigma of Gaussian which generate the target heatmap.')
-tf.app.flags.DEFINE_float(
-    'bbox_border', 25.,
-    'The nearest distance of the crop border to al keypoints.')
-tf.app.flags.DEFINE_string(
-    'data_format', 'channels_last', # 'channels_first' or 'channels_last'
-    'A flag to override the data format used in the model. channels_first '
-    'provides a performance boost on GPU but is not always compatible '
-    'with CPU. If left unspecified, the data format will be chosen '
-    'automatically based on whether TensorFlow was built for CPU or GPU.')
-tf.app.flags.DEFINE_integer(
-    'tf_random_seed', 20180417, 'Random seed for TensorFlow initializers.')
-# checkpoint related configuration
-tf.app.flags.DEFINE_string(
-    'checkpoint_path', None,
-    'The path to a checkpoint from which to fine-tune.')
-tf.app.flags.DEFINE_string(
-    'coarse_pred_path', None,
-    'The path to a pred csv file from which to crop the input image for finer prediction.')
-tf.app.flags.DEFINE_boolean(
-    'flip_on_test', False,
-    'Wether we will average predictions of left-right fliped image.')
-tf.app.flags.DEFINE_string(
-    #'blouse', 'dress', 'outwear', 'skirt', 'trousers', 'all'
-    'model_scope', 'blouse',
-    'Model scope name used to replace the name_scope in checkpoint.')
-tf.app.flags.DEFINE_boolean(
-    'run_on_cloud', True,
-    'Wether we will train on cloud.')
-tf.app.flags.DEFINE_string(
-    'model_to_eval', 'blouse, dress, outwear, skirt, trousers', #'all, blouse, dress, outwear, skirt, trousers', 'skirt, dress, outwear, trousers',
-    'The sub-model to eval (comma-separated list).')
-
-#--model_scope=blouse --checkpoint_path=./logs/blouse
-FLAGS = tf.app.flags.FLAGS
-
-def preprocessing_fn(org_image, file_name, shape):
-  pd_df = None
-  if FLAGS.coarse_pred_path is not None:
-    if tf.gfile.Exists(FLAGS.coarse_pred_path):
-      tf.logging.info('Finetuning Prediction From {}.'.format(FLAGS.coarse_pred_path))
-      tf.gfile.Copy(FLAGS.coarse_pred_path, './__coarse_pred.csv', overwrite=True)
-      pd_df = pd.read_csv('./__coarse_pred.csv', encoding='utf-8')
-
-      all_filenames = []
-      all_xmin = []
-      all_ymin = []
-      all_xmax = []
-      all_ymax = []
-
-      all_values = pd_df.values.tolist()
-      for records in all_values:
-        all_filenames.append(records[0].encode('utf8'))
-        xmin = 2000
-        ymin = 2000
-        xmax = -1
-        ymax = -1
-        for kp in records[2:]:
-          keypoint_info = kp.strip().split('_')
-          if int(keypoint_info[2]) == -1:
-            continue
-          xmin = min(xmin, int(keypoint_info[0]))
-          ymin = min(ymin, int(keypoint_info[1]))
-          xmax = max(xmax, int(keypoint_info[0]))
-          ymax = max(ymax, int(keypoint_info[1]))
-        all_xmin.append(xmin)
-        all_ymin.append(ymin)
-        all_xmax.append(xmax)
-        all_ymax.append(ymax)
-      #print(all_filenames, all_xmin, all_ymin, all_xmax, all_ymax)
-      xmin_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(all_filenames, dtype=tf.string), tf.constant(all_xmin, dtype=tf.int64)), -1)
-      ymin_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(all_filenames, dtype=tf.string), tf.constant(all_ymin, dtype=tf.int64)), -1)
-      xmax_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(all_filenames, dtype=tf.string), tf.constant(all_xmax, dtype=tf.int64)), -1)
-      ymax_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(all_filenames, dtype=tf.string), tf.constant(all_ymax, dtype=tf.int64)), -1)
-      pd_df = [xmin_table, ymin_table, xmax_table, ymax_table]
-  #pred_item['file_name'].encode('utf8')
-
-  #lnorm_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(config.global_norm_key, dtype=tf.int64), tf.constant(config.global_norm_lvalues, dtype=tf.int64)), 0)
-  return preprocessing.preprocess_for_test(org_image, file_name, shape, FLAGS.train_image_size, FLAGS.train_image_size, data_format=('NCHW' if FLAGS.data_format=='channels_first' else 'NHWC'), bbox_border=FLAGS.bbox_border, heatmap_sigma=FLAGS.heatmap_sigma, heatmap_size=FLAGS.heatmap_size, pred_df=pd_df)
-
-def input_pipeline(model_scope=FLAGS.model_scope):
-    # preprocessing_fn = lambda org_image, shape: preprocessing.preprocess_for_test(org_image, shape, FLAGS.train_image_size, FLAGS.train_image_size, data_format=('NCHW' if FLAGS.data_format=='channels_first' else 'NHWC'), bbox_border=FLAGS.bbox_border, heatmap_sigma=FLAGS.heatmap_sigma, heatmap_size=FLAGS.heatmap_size)
-
-    images, shape, file_name, classid, offsets = dataset.slim_test_get_split(FLAGS.data_dir, preprocessing_fn, FLAGS.num_readers, FLAGS.num_preprocessing_threads, file_pattern=FLAGS.dataset_name, category=(model_scope if 'all' not in model_scope else '*'), reader=None)
-
-    return {'images': images, 'shape': shape, 'classid': classid, 'file_name': file_name, 'pred_offsets': offsets}
-
-if config.PRED_DEBUG:
-  from scipy.misc import imread, imsave, imshow, imresize
-  def save_image_with_heatmap(image, height, width, heatmap_size, heatmap, predictions, indR, indG, indB):
-      if not hasattr(save_image_with_heatmap, "counter"):
-          save_image_with_heatmap.counter = 0  # it doesn't exist yet, so initialize it
-      save_image_with_heatmap.counter += 1
-
-      img_to_save = np.array(image.tolist()) + 120
-      #print(img_to_save)
-
-      img_to_save = img_to_save.astype(np.uint8)
-
-      heatmap0 = np.sum(heatmap[indR, ...], axis=0).astype(np.uint8)
-      heatmap1 = np.sum(heatmap[indG, ...], axis=0).astype(np.uint8)
-      heatmap2 = np.sum(heatmap[indB, ...], axis=0).astype(np.uint8) if len(indB) > 0 else np.zeros((heatmap_size, heatmap_size), dtype=np.float32)
-
-      img_to_save = imresize(img_to_save, (height, width), interp='lanczos')
-      heatmap0 = imresize(heatmap0, (height, width), interp='lanczos')
-      heatmap1 = imresize(heatmap1, (height, width), interp='lanczos')
-      heatmap2 = imresize(heatmap2, (height, width), interp='lanczos')
-
-      img_to_save = img_to_save/2
-      img_to_save[:,:,0] = np.clip((img_to_save[:,:,0] + heatmap0 + heatmap2), 0, 255)
-      img_to_save[:,:,1] = np.clip((img_to_save[:,:,1] + heatmap1 + heatmap2), 0, 255)
-      #img_to_save[:,:,2] = np.clip((img_to_save[:,:,2]/4. + heatmap2), 0, 255)
-      file_name = 'with_heatmap_{}.jpg'.format(save_image_with_heatmap.counter)
-      imsave(os.path.join(config.EVAL_DEBUG_DIR, file_name), img_to_save.astype(np.uint8))
-
-      predictions = np.array(predictions.tolist())
-      #print(predictions.shape)
-      for ind in range(predictions.shape[0]):
-        img = predictions[ind]
-        img = img - img.min()
-        img *= 255.0/img.max()
-        file_name = 'heatmap_{}_{}.jpg'.format(save_image_with_heatmap.counter, ind)
-        imsave(os.path.join(config.EVAL_DEBUG_DIR, file_name), img.astype(np.uint8))
-      return save_image_with_heatmap.counter
-
-def gaussian_blur(inputs, inputs_filters, sigma, data_format, name=None):
-    with tf.name_scope(name, "gaussian_blur", [inputs]):
-        data_format_ = 'NHWC' if data_format=='channels_last' else 'NCHW'
-        if data_format_ == 'NHWC':
-            inputs = tf.transpose(inputs, [0, 2, 3, 1])
-        ksize = int(6 * sigma + 1.)
-        x = tf.expand_dims(tf.range(ksize, delta=1, dtype=tf.float32), axis=1)
-        y = tf.transpose(x, [1, 0])
-        kernel_matrix = tf.exp(- ((x - ksize/2.) ** 2 + (y - ksize/2.) ** 2) / (2 * sigma ** 2))
-        #print(kernel_matrix)
-        kernel_filter = tf.reshape(kernel_matrix, [ksize, ksize, 1, 1])
-        kernel_filter = tf.tile(kernel_filter, [1, 1, inputs_filters, 1])
-        #kernel_filter = tf.transpose(kernel_filter, [1, 0, 2, 3])
-        outputs = tf.nn.depthwise_conv2d(inputs, kernel_filter, strides=[1, 1, 1, 1], padding='SAME', data_format=data_format_, name='blur')
-        if data_format_ == 'NHWC':
-            outputs = tf.transpose(outputs, [0, 3, 1, 2])
-        return outputs
-
-def get_keypoint(image, predictions, heatmap_size, height, width, category, clip_at_zero=True, data_format='channels_last', name=None):
-    # expand_border = 10
-    # pad_pred = tf.pad(predictions, tf.constant([[0, 0], [0, 0], [expand_border, expand_border], [expand_border, expand_border]]),
-    #               mode='CONSTANT', name='pred_padding', constant_values=0)
-
-    # blur_pred = gaussian_blur(pad_pred, config.class_num_joints[category], 3.5, 'channels_first', 'pred_blur')
-
-    # predictions = tf.slice(blur_pred, [0, 0, expand_border, expand_border], [1, config.class_num_joints[category], heatmap_size, heatmap_size])
-
-    predictions = tf.reshape(predictions, [1, -1, heatmap_size*heatmap_size])
-
-    pred_max = tf.reduce_max(predictions, axis=-1)
-    pred_max_indices = tf.argmax(predictions, axis=-1)
-    pred_max_x, pred_max_y = tf.cast(tf.floormod(pred_max_indices, heatmap_size), tf.float32), tf.cast(tf.floordiv(pred_max_indices, heatmap_size), tf.float32)
-    # mask the max elements to zero
-    mask_predictions = predictions * tf.one_hot(pred_max_indices, heatmap_size*heatmap_size, on_value=0., off_value=1., dtype=tf.float32)
-    # get the second max prediction
-    pred_next_max = tf.reduce_max(mask_predictions, axis=-1)
-    pred_next_max_indices = tf.argmax(mask_predictions, axis=-1)
-    pred_next_max_x, pred_next_max_y = tf.cast(tf.floormod(pred_next_max_indices, heatmap_size), tf.float32), tf.cast(tf.floordiv(pred_next_max_indices, heatmap_size), tf.float32)
-
-    dist = tf.pow(tf.pow(pred_next_max_x - pred_max_x, 2.) + tf.pow(pred_next_max_y - pred_max_y, 2.), .5)
-
-    pred_x = tf.where(dist < 1e-3, pred_max_x, pred_max_x + (pred_next_max_x - pred_max_x) * 0.25 / dist)
-    pred_y = tf.where(dist < 1e-3, pred_max_y, pred_max_y + (pred_next_max_y - pred_max_y) * 0.25 / dist)
-
-    pred_indices_ = tf.squeeze(tf.cast(pred_x, tf.int64) + tf.cast(pred_y, tf.int64) * heatmap_size)
-
-    width, height = tf.cast(width, tf.float32), tf.cast(height, tf.float32)
-    width_ratio, height_ratio = width / tf.cast(heatmap_size, tf.float32), height / tf.cast(heatmap_size, tf.float32)
-
-    pred_x, pred_y = pred_x * width_ratio, pred_y * height_ratio
-    #pred_x, pred_y = pred_x * width_ratio + width_ratio/2., pred_y * height_ratio + height_ratio/2.
-
-    if clip_at_zero:
-      pred_x, pred_y =  pred_x * tf.cast(pred_max>0, tf.float32), pred_y * tf.cast(pred_max>0, tf.float32)
-      pred_x = pred_x * tf.cast(pred_max>0, tf.float32) + tf.cast(pred_max<=0, tf.float32) * (width / 2.)
-      pred_y = pred_y * tf.cast(pred_max>0, tf.float32) + tf.cast(pred_max<=0, tf.float32) * (height / 2.)
-
-    if config.PRED_DEBUG:
-      image_ = tf.squeeze(image) * 255.
-      pred_heatmap = tf.one_hot(pred_indices_, heatmap_size*heatmap_size, on_value=255, off_value=0, axis=-1, dtype=tf.int32)
-
-      pred_heatmap = tf.reshape(pred_heatmap, [-1, heatmap_size, heatmap_size])
-      if data_format == 'channels_first':
-        image_ = tf.transpose(image_, perm=(1, 2, 0))
-      save_image_op = tf.py_func(save_image_with_heatmap,
-                                  [image_, height, width,
-                                  heatmap_size,
-                                  pred_heatmap,
-                                  tf.reshape(predictions, [-1, heatmap_size, heatmap_size]),
-                                  config.left_right_group_map[category][0],
-                                  config.left_right_group_map[category][1],
-                                  config.left_right_group_map[category][2]],
-                                  tf.int64, stateful=True)
-      with tf.control_dependencies([save_image_op]):
-        pred_x, pred_y = pred_x * 1., pred_y * 1.
-    return pred_x, pred_y
-
-def get_keypoint_v0(image, predictions, heatmap_size, height, width, category, clip_at_zero=True, data_format='channels_last', name=None):
-    predictions = tf.reshape(predictions, [1, -1, heatmap_size*heatmap_size])
-
-    pred_max = tf.reduce_max(predictions, axis=-1)
-    pred_indices = tf.argmax(predictions, axis=-1)
-    pred_x, pred_y = tf.cast(tf.floormod(pred_indices, heatmap_size), tf.float32), tf.cast(tf.floordiv(pred_indices, heatmap_size), tf.float32)
-
-    width, height = tf.cast(width, tf.float32), tf.cast(height, tf.float32)
-    pred_x, pred_y = pred_x * width / tf.cast(heatmap_size, tf.float32), pred_y * height / tf.cast(heatmap_size, tf.float32)
-
-    if clip_at_zero:
-      pred_x, pred_y =  pred_x * tf.cast(pred_max>0, tf.float32), pred_y * tf.cast(pred_max>0, tf.float32)
-      pred_x = pred_x * tf.cast(pred_max>0, tf.float32) + tf.cast(pred_max<=0, tf.float32) * (width / 2.)
-      pred_y = pred_y * tf.cast(pred_max>0, tf.float32) + tf.cast(pred_max<=0, tf.float32) * (height / 2.)
-
-    if config.PRED_DEBUG:
-      pred_indices_ = tf.squeeze(pred_indices)
-      image_ = tf.squeeze(image) * 255.
-      pred_heatmap = tf.one_hot(pred_indices_, heatmap_size*heatmap_size, on_value=255, off_value=0, axis=-1, dtype=tf.int32)
-
-      pred_heatmap = tf.reshape(pred_heatmap, [-1, heatmap_size, heatmap_size])
-      if data_format == 'channels_first':
-        image_ = tf.transpose(image_, perm=(1, 2, 0))
-      save_image_op = tf.py_func(save_image_with_heatmap,
-                                  [image_, height, width,
-                                  heatmap_size,
-                                  pred_heatmap,
-                                  tf.reshape(predictions, [-1, heatmap_size, heatmap_size]),
-                                  config.left_right_group_map[category][0],
-                                  config.left_right_group_map[category][1],
-                                  config.left_right_group_map[category][2]],
-                                  tf.int64, stateful=True)
-      with tf.control_dependencies([save_image_op]):
-        pred_x, pred_y = pred_x * 1., pred_y * 1.
-    return pred_x, pred_y
-
-def keypoint_model_fn(features, labels, mode, params):
-    #print(features)
-    shape = features['shape']
-    classid = features['classid']
-    pred_offsets = tf.to_float(features['pred_offsets'])
-    file_name = features['file_name']
-    features = features['images']
-
-    file_name = tf.identity(file_name, name='current_file')
-
-    if not params['flip_on_test']:
-        with tf.variable_scope(params['model_scope'], default_name=None, values=[features], reuse=tf.AUTO_REUSE):
-            pred_outputs = cpn.cascaded_pyramid_net(features, config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')], params['heatmap_size'], (mode == tf.estimator.ModeKeys.TRAIN), params['data_format'])
-        if params['data_format'] == 'channels_last':
-            pred_outputs = [tf.transpose(pred_outputs[ind], [0, 3, 1, 2], name='outputs_trans_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
-    else:
-        # test augumentation on the fly
-        if params['data_format'] == 'channels_last':
-            double_features = tf.reshape(tf.stack([features, tf.map_fn(tf.image.flip_left_right, features, back_prop=False)], axis = 1), [-1, params['train_image_size'], params['train_image_size'], 3])
-        else:
-            double_features = tf.reshape(tf.stack([features, tf.transpose(tf.map_fn(tf.image.flip_left_right, tf.transpose(features, [0, 2, 3, 1], name='nchw2nhwc'), back_prop=False), [0, 3, 1, 2], name='nhwc2nchw')], axis = 1), [-1, 3, params['train_image_size'], params['train_image_size']])
-
-        num_joints = config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')]
-        with tf.variable_scope(params['model_scope'], default_name=None, values=[double_features], reuse=tf.AUTO_REUSE):
-            pred_outputs = cpn.cascaded_pyramid_net(double_features, config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')], params['heatmap_size'], (mode == tf.estimator.ModeKeys.TRAIN), params['data_format'])
-
-        if params['data_format'] == 'channels_last':
-            pred_outputs = [tf.transpose(pred_outputs[ind], [0, 3, 1, 2], name='outputs_trans_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
-        # [[0, 0, 0, ..], [1, 1, 1, ...], ...]
-        row_indices = tf.tile(tf.reshape(tf.range(tf.shape(double_features)[0]), [-1, 1]), [1, num_joints])
-        # [[0, 1, 2, ...], [1, 0, 2, ...], [0, 1, 2], [1, 0, 2], ...]
-        col_indices = tf.reshape(tf.tile(tf.reshape(tf.stack([tf.range(num_joints), tf.constant(config.left_right_remap[(params['model_scope'] if 'all' not in params['model_scope'] else '*')])], axis=0), [-1]), [tf.shape(features)[0]]), [-1, num_joints])
-        # [[[0, 0], [0, 1], [0, 2], ...], [[1, 1], [1, 0], [1, 2], ...], [[2, 0], [2, 1], [2, 2], ...], ...]
-        flip_indices=tf.stack([row_indices, col_indices], axis=-1)
-
-        #flip_indices = tf.Print(flip_indices, [flip_indices], summarize=500)
-        pred_outputs = [tf.gather_nd(pred_outputs[ind], flip_indices, name='gather_nd_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
-
-        def cond_flip(heatmap_ind):
-            return tf.cond(heatmap_ind[1] < 1, lambda : heatmap_ind[0], lambda : tf.transpose(tf.image.flip_left_right(tf.transpose(heatmap_ind[0], [1, 2, 0], name='pred_nchw2nhwc')), [2, 0, 1], name='pred_nhwc2nchw'))
-        # all the heatmap of the fliped image should also be fliped back
-        pred_outputs = [tf.map_fn(cond_flip, [pred_outputs[ind], tf.tile(tf.reshape(tf.range(2), [-1]), [tf.shape(features)[0]])], dtype=tf.float32, parallel_iterations=10, back_prop=True, swap_memory=False, infer_shape=True, name='map_fn_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
-        # average predictions of left_reight_fliped image
-        segment_indices = tf.reshape(tf.tile(tf.reshape(tf.range(tf.shape(features)[0]), [-1, 1]), [1, 2]), [-1])
-        pred_outputs = [tf.segment_mean(pred_outputs[ind], segment_indices, name='segment_mean_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
-
-    pred_x, pred_y = get_keypoint(features, pred_outputs[-1], params['heatmap_size'], shape[0][0], shape[0][1], (params['model_scope'] if 'all' not in params['model_scope'] else '*'), clip_at_zero=True, data_format=params['data_format'])
-
-    predictions = {'pred_x': pred_x + pred_offsets[:, 0], 'pred_y': pred_y + pred_offsets[:, 1], 'file_name': file_name}
-
-    if mode == tf.estimator.ModeKeys.PREDICT:
-        return tf.estimator.EstimatorSpec(
-                              mode=mode,
-                              predictions=predictions,
-                              loss=None, train_op=None)
-    else:
-        raise ValueError('Only "PREDICT" mode is supported.')
-
-def parse_comma_list(args):
-    return [float(s.strip()) for s in args.split(',')]
-
-def eval_each(model_fn, model_dir, model_scope, run_config):
-    fashionAI = tf.estimator.Estimator(
-        model_fn=model_fn, model_dir=model_dir, config=run_config,
-        params={
-            'train_image_size': FLAGS.train_image_size,
-            'heatmap_size': FLAGS.heatmap_size,
-            'data_format': FLAGS.data_format,
-            'model_scope': model_scope,
-            'flip_on_test': FLAGS.flip_on_test,
-        })
-    #tf.logging.info('params recv: %s', FLAGS.flag_values_dict())
-
-    tensors_to_log = {
-        'cur_file': 'current_file'
-    }
-
-    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=FLAGS.log_every_n_steps, formatter=lambda dicts: ', '.join(['%s=%s' % (k, v) for k, v in dicts.items()]))
-    tf.logging.info('Starting to predict model {}.'.format(model_scope))
-    pred_results = fashionAI.predict(input_fn=lambda : input_pipeline(model_scope), hooks=[logging_hook], checkpoint_path=train_helper.get_latest_checkpoint_for_evaluate_(model_dir, model_dir))
-    #tf.logging.info()
-    return list(pred_results)
-
-def main(_):
-    # Using the Winograd non-fused algorithms provides a small performance boost.
-    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
-
-    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = FLAGS.gpu_memory_fraction)
-    sess_config = tf.ConfigProto(allow_soft_placement = True, log_device_placement = False, intra_op_parallelism_threads = FLAGS.num_cpu_threads, inter_op_parallelism_threads = FLAGS.num_cpu_threads, gpu_options = gpu_options)
-
-    # Set up a RunConfig to only save checkpoints once per training cycle.
-    run_config = tf.estimator.RunConfig().replace(
-                                        save_checkpoints_secs=None).replace(
-                                        save_checkpoints_steps=None).replace(
-                                        save_summary_steps=FLAGS.save_summary_steps).replace(
-                                        keep_checkpoint_max=5).replace(
-                                        tf_random_seed=FLAGS.tf_random_seed).replace(
-                                        log_step_count_steps=FLAGS.log_every_n_steps).replace(
-                                        session_config=sess_config)
-
-    model_to_eval = [s.strip() for s in FLAGS.model_to_eval.split(',')]
-    for m in model_to_eval:
-        if m == '': continue
-        pred_results = eval_each(keypoint_model_fn, os.path.join(FLAGS.model_dir, m), m, run_config)
-        #print(pred_results)
-        # collect result
-        df = pd.DataFrame(columns=['image_id', 'image_category'] + config.all_keys)
-        cur_record = 0
-        gloabl2local_ind = dict(zip(config.class2global_ind_map[m], list(range(len(config.class2global_ind_map[m]))) ))
-        #print(gloabl2local_ind)
-        for pred_item in pred_results:
-            temp_list = []
-            index = 0
-            x = pred_item['pred_x'].tolist()
-            y = pred_item['pred_y'].tolist()
-            filename = pred_item['file_name'].decode('utf8')
-            for ind in list(range(config.class_num_joints['*'])):
-                if ind in gloabl2local_ind:
-                    temp_list.append('{}_{}_1'.format(round(x[gloabl2local_ind[ind]]), round(y[gloabl2local_ind[ind]])))
-                else:
-                    temp_list.append('-1_-1_-1')
-            #Images/blouse/ab669925e96490ec698af976586f0b2f.jpg
-            df.loc[cur_record] = [filename, m] + temp_list
-            cur_record = cur_record + 1
-        df.to_csv('./{}.csv'.format(m), encoding='utf-8', index=False)
-
-    # merge dataframe
-    df_list = [pd.read_csv('./{}.csv'.format(model_to_eval[0]), encoding='utf-8')]
-    for m in model_to_eval[1:]:
-        if m == '': continue
-        df_list.append(pd.read_csv('./{}.csv'.format(m), encoding='utf-8'))
-    pd.concat(df_list, ignore_index=True).to_csv('./sub.csv', encoding='utf-8', index=False)
-
-    if FLAGS.run_on_cloud:
-        tf.gfile.Copy('./sub.csv', os.path.join(FLAGS.model_dir, 'sub.csv'), overwrite=True)
-
-if __name__ == '__main__':
-  tf.logging.set_verbosity(tf.logging.INFO)
-  tf.app.run()
diff --git a/eval_detxt_cpn.py b/eval_detxt_cpn.py
deleted file mode 100644
index 5b548c66..00000000
--- a/eval_detxt_cpn.py
+++ /dev/null
@@ -1,445 +0,0 @@
-# Copyright 2018 Changan Wang
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-#     http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import sys
-import numpy as np
-import pandas as pd
-#from scipy.misc import imread, imsave, imshow, imresize
-import tensorflow as tf
-
-from net import detxt_cpn as cpn
-from utility import train_helper
-
-from preprocessing import preprocessing
-from preprocessing import dataset
-import config
-#--num_readers=2 --num_preprocessing_threads=2 --data_dir=/media/disk/keypoint/tfrecords --model_to_train=all, blouse
-# hardware related configuration
-tf.app.flags.DEFINE_integer(
-    'num_readers', 16,
-    'The number of parallel readers that read data from the dataset.')
-tf.app.flags.DEFINE_integer(
-    'num_preprocessing_threads', 48,
-    'The number of threads used to create the batches.')
-tf.app.flags.DEFINE_integer(
-    'num_cpu_threads', 0,
-    'The number of cpu cores used to train.')
-tf.app.flags.DEFINE_float(
-    'gpu_memory_fraction', 1., 'GPU memory fraction to use.')
-# scaffold related configuration
-tf.app.flags.DEFINE_string(
-    'data_dir', '../Datasets/tfrecords_test',#tfrecords_test tfrecords_test_stage1_b
-    'The directory where the dataset input data is stored.')
-tf.app.flags.DEFINE_string(
-    'dataset_name', '{}_*.tfrecord', 'The pattern of the dataset name to load.')
-tf.app.flags.DEFINE_string(
-    'model_dir', './logs_detxt_cpn/',
-    'The parent directory where the model will be stored.')
-tf.app.flags.DEFINE_integer(
-    'log_every_n_steps', 10,
-    'The frequency with which logs are print.')
-tf.app.flags.DEFINE_integer(
-    'save_summary_steps', 100,
-    'The frequency with which summaries are saved, in seconds.')
-# model related configuration
-tf.app.flags.DEFINE_integer(
-    'train_image_size', 384,
-    'The size of the input image for the model to use.')
-tf.app.flags.DEFINE_integer(
-    'heatmap_size', 96,
-    'The size of the output heatmap of the model.')
-tf.app.flags.DEFINE_float(
-    'heatmap_sigma', 1.,
-    'The sigma of Gaussian which generate the target heatmap.')
-tf.app.flags.DEFINE_float(
-    'bbox_border', 25.,
-    'The nearest distance of the crop border to al keypoints.')
-tf.app.flags.DEFINE_string(
-    'data_format', 'channels_last', # 'channels_first' or 'channels_last'
-    'A flag to override the data format used in the model. channels_first '
-    'provides a performance boost on GPU but is not always compatible '
-    'with CPU. If left unspecified, the data format will be chosen '
-    'automatically based on whether TensorFlow was built for CPU or GPU.')
-tf.app.flags.DEFINE_integer(
-    'tf_random_seed', 20180417, 'Random seed for TensorFlow initializers.')
-# checkpoint related configuration
-tf.app.flags.DEFINE_string(
-    'checkpoint_path', None,
-    'The path to a checkpoint from which to fine-tune.')
-tf.app.flags.DEFINE_string(
-    'coarse_pred_path', None,
-    'The path to a pred csv file from which to crop the input image for finer prediction.')
-tf.app.flags.DEFINE_boolean(
-    'flip_on_test', False,
-    'Wether we will average predictions of left-right fliped image.')
-tf.app.flags.DEFINE_string(
-    #'blouse', 'dress', 'outwear', 'skirt', 'trousers', 'all'
-    'model_scope', 'blouse',
-    'Model scope name used to replace the name_scope in checkpoint.')
-tf.app.flags.DEFINE_boolean(
-    'run_on_cloud', True,
-    'Wether we will train on cloud.')
-tf.app.flags.DEFINE_string(
-    'model_to_eval', 'blouse, dress, outwear, skirt, trousers', #'all, blouse, dress, outwear, skirt, trousers', 'skirt, dress, outwear, trousers',
-    'The sub-model to eval (comma-separated list).')
-
-#--model_scope=blouse --checkpoint_path=./logs/blouse
-FLAGS = tf.app.flags.FLAGS
-
-def preprocessing_fn(org_image, file_name, shape):
-  pd_df = None
-  if FLAGS.coarse_pred_path is not None:
-    if tf.gfile.Exists(FLAGS.coarse_pred_path):
-      tf.logging.info('Finetuning Prediction From {}.'.format(FLAGS.coarse_pred_path))
-      tf.gfile.Copy(FLAGS.coarse_pred_path, './__coarse_pred.csv', overwrite=True)
-      pd_df = pd.read_csv('./__coarse_pred.csv', encoding='utf-8')
-
-      all_filenames = []
-      all_xmin = []
-      all_ymin = []
-      all_xmax = []
-      all_ymax = []
-
-      all_values = pd_df.values.tolist()
-      for records in all_values:
-        all_filenames.append(records[0].encode('utf8'))
-        xmin = 2000
-        ymin = 2000
-        xmax = -1
-        ymax = -1
-        for kp in records[2:]:
-          keypoint_info = kp.strip().split('_')
-          if int(keypoint_info[2]) == -1:
-            continue
-          xmin = min(xmin, int(keypoint_info[0]))
-          ymin = min(ymin, int(keypoint_info[1]))
-          xmax = max(xmax, int(keypoint_info[0]))
-          ymax = max(ymax, int(keypoint_info[1]))
-        all_xmin.append(xmin)
-        all_ymin.append(ymin)
-        all_xmax.append(xmax)
-        all_ymax.append(ymax)
-      #print(all_filenames, all_xmin, all_ymin, all_xmax, all_ymax)
-      xmin_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(all_filenames, dtype=tf.string), tf.constant(all_xmin, dtype=tf.int64)), -1)
-      ymin_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(all_filenames, dtype=tf.string), tf.constant(all_ymin, dtype=tf.int64)), -1)
-      xmax_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(all_filenames, dtype=tf.string), tf.constant(all_xmax, dtype=tf.int64)), -1)
-      ymax_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(all_filenames, dtype=tf.string), tf.constant(all_ymax, dtype=tf.int64)), -1)
-      pd_df = [xmin_table, ymin_table, xmax_table, ymax_table]
-  #pred_item['file_name'].encode('utf8')
-
-  #lnorm_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(config.global_norm_key, dtype=tf.int64), tf.constant(config.global_norm_lvalues, dtype=tf.int64)), 0)
-  return preprocessing.preprocess_for_test(org_image, file_name, shape, FLAGS.train_image_size, FLAGS.train_image_size, data_format=('NCHW' if FLAGS.data_format=='channels_first' else 'NHWC'), bbox_border=FLAGS.bbox_border, heatmap_sigma=FLAGS.heatmap_sigma, heatmap_size=FLAGS.heatmap_size, pred_df=pd_df)
-
-def input_pipeline(model_scope=FLAGS.model_scope):
-    # preprocessing_fn = lambda org_image, shape: preprocessing.preprocess_for_test(org_image, shape, FLAGS.train_image_size, FLAGS.train_image_size, data_format=('NCHW' if FLAGS.data_format=='channels_first' else 'NHWC'), bbox_border=FLAGS.bbox_border, heatmap_sigma=FLAGS.heatmap_sigma, heatmap_size=FLAGS.heatmap_size)
-
-    images, shape, file_name, classid, offsets = dataset.slim_test_get_split(FLAGS.data_dir, preprocessing_fn, FLAGS.num_readers, FLAGS.num_preprocessing_threads, file_pattern=FLAGS.dataset_name, category=(model_scope if 'all' not in model_scope else '*'), reader=None)
-
-    return {'images': images, 'shape': shape, 'classid': classid, 'file_name': file_name, 'pred_offsets': offsets}
-
-if config.PRED_DEBUG:
-  from scipy.misc import imread, imsave, imshow, imresize
-  def save_image_with_heatmap(image, height, width, heatmap_size, heatmap, predictions, indR, indG, indB):
-      if not hasattr(save_image_with_heatmap, "counter"):
-          save_image_with_heatmap.counter = 0  # it doesn't exist yet, so initialize it
-      save_image_with_heatmap.counter += 1
-
-      img_to_save = np.array(image.tolist()) + 120
-      #print(img_to_save)
-
-      img_to_save = img_to_save.astype(np.uint8)
-
-      heatmap0 = np.sum(heatmap[indR, ...], axis=0).astype(np.uint8)
-      heatmap1 = np.sum(heatmap[indG, ...], axis=0).astype(np.uint8)
-      heatmap2 = np.sum(heatmap[indB, ...], axis=0).astype(np.uint8) if len(indB) > 0 else np.zeros((heatmap_size, heatmap_size), dtype=np.float32)
-
-      img_to_save = imresize(img_to_save, (height, width), interp='lanczos')
-      heatmap0 = imresize(heatmap0, (height, width), interp='lanczos')
-      heatmap1 = imresize(heatmap1, (height, width), interp='lanczos')
-      heatmap2 = imresize(heatmap2, (height, width), interp='lanczos')
-
-      img_to_save = img_to_save/2
-      img_to_save[:,:,0] = np.clip((img_to_save[:,:,0] + heatmap0 + heatmap2), 0, 255)
-      img_to_save[:,:,1] = np.clip((img_to_save[:,:,1] + heatmap1 + heatmap2), 0, 255)
-      #img_to_save[:,:,2] = np.clip((img_to_save[:,:,2]/4. + heatmap2), 0, 255)
-      file_name = 'with_heatmap_{}.jpg'.format(save_image_with_heatmap.counter)
-      imsave(os.path.join(config.EVAL_DEBUG_DIR, file_name), img_to_save.astype(np.uint8))
-
-      predictions = np.array(predictions.tolist())
-      #print(predictions.shape)
-      for ind in range(predictions.shape[0]):
-        img = predictions[ind]
-        img = img - img.min()
-        img *= 255.0/img.max()
-        file_name = 'heatmap_{}_{}.jpg'.format(save_image_with_heatmap.counter, ind)
-        imsave(os.path.join(config.EVAL_DEBUG_DIR, file_name), img.astype(np.uint8))
-      return save_image_with_heatmap.counter
-
-def gaussian_blur(inputs, inputs_filters, sigma, data_format, name=None):
-    with tf.name_scope(name, "gaussian_blur", [inputs]):
-        data_format_ = 'NHWC' if data_format=='channels_last' else 'NCHW'
-        if data_format_ == 'NHWC':
-            inputs = tf.transpose(inputs, [0, 2, 3, 1])
-        ksize = int(6 * sigma + 1.)
-        x = tf.expand_dims(tf.range(ksize, delta=1, dtype=tf.float32), axis=1)
-        y = tf.transpose(x, [1, 0])
-        kernel_matrix = tf.exp(- ((x - ksize/2.) ** 2 + (y - ksize/2.) ** 2) / (2 * sigma ** 2))
-        #print(kernel_matrix)
-        kernel_filter = tf.reshape(kernel_matrix, [ksize, ksize, 1, 1])
-        kernel_filter = tf.tile(kernel_filter, [1, 1, inputs_filters, 1])
-        #kernel_filter = tf.transpose(kernel_filter, [1, 0, 2, 3])
-        outputs = tf.nn.depthwise_conv2d(inputs, kernel_filter, strides=[1, 1, 1, 1], padding='SAME', data_format=data_format_, name='blur')
-        if data_format_ == 'NHWC':
-            outputs = tf.transpose(outputs, [0, 3, 1, 2])
-        return outputs
-
-def get_keypoint(image, predictions, heatmap_size, height, width, category, clip_at_zero=True, data_format='channels_last', name=None):
-    # expand_border = 10
-    # pad_pred = tf.pad(predictions, tf.constant([[0, 0], [0, 0], [expand_border, expand_border], [expand_border, expand_border]]),
-    #               mode='CONSTANT', name='pred_padding', constant_values=0)
-
-    # blur_pred = gaussian_blur(pad_pred, config.class_num_joints[category], 3.5, 'channels_first', 'pred_blur')
-
-    # predictions = tf.slice(blur_pred, [0, 0, expand_border, expand_border], [1, config.class_num_joints[category], heatmap_size, heatmap_size])
-
-    predictions = tf.reshape(predictions, [1, -1, heatmap_size*heatmap_size])
-
-    pred_max = tf.reduce_max(predictions, axis=-1)
-    pred_max_indices = tf.argmax(predictions, axis=-1)
-    pred_max_x, pred_max_y = tf.cast(tf.floormod(pred_max_indices, heatmap_size), tf.float32), tf.cast(tf.floordiv(pred_max_indices, heatmap_size), tf.float32)
-    # mask the max elements to zero
-    mask_predictions = predictions * tf.one_hot(pred_max_indices, heatmap_size*heatmap_size, on_value=0., off_value=1., dtype=tf.float32)
-    # get the second max prediction
-    pred_next_max = tf.reduce_max(mask_predictions, axis=-1)
-    pred_next_max_indices = tf.argmax(mask_predictions, axis=-1)
-    pred_next_max_x, pred_next_max_y = tf.cast(tf.floormod(pred_next_max_indices, heatmap_size), tf.float32), tf.cast(tf.floordiv(pred_next_max_indices, heatmap_size), tf.float32)
-
-    dist = tf.pow(tf.pow(pred_next_max_x - pred_max_x, 2.) + tf.pow(pred_next_max_y - pred_max_y, 2.), .5)
-
-    pred_x = tf.where(dist < 1e-3, pred_max_x, pred_max_x + (pred_next_max_x - pred_max_x) * 0.25 / dist)
-    pred_y = tf.where(dist < 1e-3, pred_max_y, pred_max_y + (pred_next_max_y - pred_max_y) * 0.25 / dist)
-
-    pred_indices_ = tf.squeeze(tf.cast(pred_x, tf.int64) + tf.cast(pred_y, tf.int64) * heatmap_size)
-
-    width, height = tf.cast(width, tf.float32), tf.cast(height, tf.float32)
-    width_ratio, height_ratio = width / tf.cast(heatmap_size, tf.float32), height / tf.cast(heatmap_size, tf.float32)
-
-    pred_x, pred_y = pred_x * width_ratio, pred_y * height_ratio
-    #pred_x, pred_y = pred_x * width_ratio + width_ratio/2., pred_y * height_ratio + height_ratio/2.
-
-    if clip_at_zero:
-      pred_x, pred_y =  pred_x * tf.cast(pred_max>0, tf.float32), pred_y * tf.cast(pred_max>0, tf.float32)
-      pred_x = pred_x * tf.cast(pred_max>0, tf.float32) + tf.cast(pred_max<=0, tf.float32) * (width / 2.)
-      pred_y = pred_y * tf.cast(pred_max>0, tf.float32) + tf.cast(pred_max<=0, tf.float32) * (height / 2.)
-
-    if config.PRED_DEBUG:
-      image_ = tf.squeeze(image) * 255.
-      pred_heatmap = tf.one_hot(pred_indices_, heatmap_size*heatmap_size, on_value=255, off_value=0, axis=-1, dtype=tf.int32)
-
-      pred_heatmap = tf.reshape(pred_heatmap, [-1, heatmap_size, heatmap_size])
-      if data_format == 'channels_first':
-        image_ = tf.transpose(image_, perm=(1, 2, 0))
-      save_image_op = tf.py_func(save_image_with_heatmap,
-                                  [image_, height, width,
-                                  heatmap_size,
-                                  pred_heatmap,
-                                  tf.reshape(predictions, [-1, heatmap_size, heatmap_size]),
-                                  config.left_right_group_map[category][0],
-                                  config.left_right_group_map[category][1],
-                                  config.left_right_group_map[category][2]],
-                                  tf.int64, stateful=True)
-      with tf.control_dependencies([save_image_op]):
-        pred_x, pred_y = pred_x * 1., pred_y * 1.
-    return pred_x, pred_y
-
-def get_keypoint_v0(image, predictions, heatmap_size, height, width, category, clip_at_zero=True, data_format='channels_last', name=None):
-    predictions = tf.reshape(predictions, [1, -1, heatmap_size*heatmap_size])
-
-    pred_max = tf.reduce_max(predictions, axis=-1)
-    pred_indices = tf.argmax(predictions, axis=-1)
-    pred_x, pred_y = tf.cast(tf.floormod(pred_indices, heatmap_size), tf.float32), tf.cast(tf.floordiv(pred_indices, heatmap_size), tf.float32)
-
-    width, height = tf.cast(width, tf.float32), tf.cast(height, tf.float32)
-    pred_x, pred_y = pred_x * width / tf.cast(heatmap_size, tf.float32), pred_y * height / tf.cast(heatmap_size, tf.float32)
-
-    if clip_at_zero:
-      pred_x, pred_y =  pred_x * tf.cast(pred_max>0, tf.float32), pred_y * tf.cast(pred_max>0, tf.float32)
-      pred_x = pred_x * tf.cast(pred_max>0, tf.float32) + tf.cast(pred_max<=0, tf.float32) * (width / 2.)
-      pred_y = pred_y * tf.cast(pred_max>0, tf.float32) + tf.cast(pred_max<=0, tf.float32) * (height / 2.)
-
-    if config.PRED_DEBUG:
-      pred_indices_ = tf.squeeze(pred_indices)
-      image_ = tf.squeeze(image) * 255.
-      pred_heatmap = tf.one_hot(pred_indices_, heatmap_size*heatmap_size, on_value=255, off_value=0, axis=-1, dtype=tf.int32)
-
-      pred_heatmap = tf.reshape(pred_heatmap, [-1, heatmap_size, heatmap_size])
-      if data_format == 'channels_first':
-        image_ = tf.transpose(image_, perm=(1, 2, 0))
-      save_image_op = tf.py_func(save_image_with_heatmap,
-                                  [image_, height, width,
-                                  heatmap_size,
-                                  pred_heatmap,
-                                  tf.reshape(predictions, [-1, heatmap_size, heatmap_size]),
-                                  config.left_right_group_map[category][0],
-                                  config.left_right_group_map[category][1],
-                                  config.left_right_group_map[category][2]],
-                                  tf.int64, stateful=True)
-      with tf.control_dependencies([save_image_op]):
-        pred_x, pred_y = pred_x * 1., pred_y * 1.
-    return pred_x, pred_y
-
-def keypoint_model_fn(features, labels, mode, params):
-    #print(features)
-    shape = features['shape']
-    classid = features['classid']
-    pred_offsets = tf.to_float(features['pred_offsets'])
-    file_name = features['file_name']
-    features = features['images']
-
-    file_name = tf.identity(file_name, name='current_file')
-
-    if not params['flip_on_test']:
-        with tf.variable_scope(params['model_scope'], default_name=None, values=[features], reuse=tf.AUTO_REUSE):
-            pred_outputs = cpn.cascaded_pyramid_net(features, config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')], params['heatmap_size'], (mode == tf.estimator.ModeKeys.TRAIN), params['data_format'])
-        if params['data_format'] == 'channels_last':
-            pred_outputs = [tf.transpose(pred_outputs[ind], [0, 3, 1, 2], name='outputs_trans_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
-    else:
-        # test augumentation on the fly
-        if params['data_format'] == 'channels_last':
-            double_features = tf.reshape(tf.stack([features, tf.map_fn(tf.image.flip_left_right, features, back_prop=False)], axis = 1), [-1, params['train_image_size'], params['train_image_size'], 3])
-        else:
-            double_features = tf.reshape(tf.stack([features, tf.transpose(tf.map_fn(tf.image.flip_left_right, tf.transpose(features, [0, 2, 3, 1], name='nchw2nhwc'), back_prop=False), [0, 3, 1, 2], name='nhwc2nchw')], axis = 1), [-1, 3, params['train_image_size'], params['train_image_size']])
-
-        num_joints = config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')]
-        with tf.variable_scope(params['model_scope'], default_name=None, values=[double_features], reuse=tf.AUTO_REUSE):
-            pred_outputs = cpn.cascaded_pyramid_net(double_features, config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')], params['heatmap_size'], (mode == tf.estimator.ModeKeys.TRAIN), params['data_format'])
-
-        if params['data_format'] == 'channels_last':
-            pred_outputs = [tf.transpose(pred_outputs[ind], [0, 3, 1, 2], name='outputs_trans_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
-        # [[0, 0, 0, ..], [1, 1, 1, ...], ...]
-        row_indices = tf.tile(tf.reshape(tf.range(tf.shape(double_features)[0]), [-1, 1]), [1, num_joints])
-        # [[0, 1, 2, ...], [1, 0, 2, ...], [0, 1, 2], [1, 0, 2], ...]
-        col_indices = tf.reshape(tf.tile(tf.reshape(tf.stack([tf.range(num_joints), tf.constant(config.left_right_remap[(params['model_scope'] if 'all' not in params['model_scope'] else '*')])], axis=0), [-1]), [tf.shape(features)[0]]), [-1, num_joints])
-        # [[[0, 0], [0, 1], [0, 2], ...], [[1, 1], [1, 0], [1, 2], ...], [[2, 0], [2, 1], [2, 2], ...], ...]
-        flip_indices=tf.stack([row_indices, col_indices], axis=-1)
-
-        #flip_indices = tf.Print(flip_indices, [flip_indices], summarize=500)
-        pred_outputs = [tf.gather_nd(pred_outputs[ind], flip_indices, name='gather_nd_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
-
-        def cond_flip(heatmap_ind):
-            return tf.cond(heatmap_ind[1] < 1, lambda : heatmap_ind[0], lambda : tf.transpose(tf.image.flip_left_right(tf.transpose(heatmap_ind[0], [1, 2, 0], name='pred_nchw2nhwc')), [2, 0, 1], name='pred_nhwc2nchw'))
-        # all the heatmap of the fliped image should also be fliped back
-        pred_outputs = [tf.map_fn(cond_flip, [pred_outputs[ind], tf.tile(tf.reshape(tf.range(2), [-1]), [tf.shape(features)[0]])], dtype=tf.float32, parallel_iterations=10, back_prop=True, swap_memory=False, infer_shape=True, name='map_fn_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
-        # average predictions of left_reight_fliped image
-        segment_indices = tf.reshape(tf.tile(tf.reshape(tf.range(tf.shape(features)[0]), [-1, 1]), [1, 2]), [-1])
-        pred_outputs = [tf.segment_mean(pred_outputs[ind], segment_indices, name='segment_mean_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
-
-    pred_x, pred_y = get_keypoint(features, pred_outputs[-1], params['heatmap_size'], shape[0][0], shape[0][1], (params['model_scope'] if 'all' not in params['model_scope'] else '*'), clip_at_zero=True, data_format=params['data_format'])
-
-    predictions = {'pred_x': pred_x + pred_offsets[:, 0], 'pred_y': pred_y + pred_offsets[:, 1], 'file_name': file_name}
-
-    if mode == tf.estimator.ModeKeys.PREDICT:
-        return tf.estimator.EstimatorSpec(
-                              mode=mode,
-                              predictions=predictions,
-                              loss=None, train_op=None)
-    else:
-        raise ValueError('Only "PREDICT" mode is supported.')
-
-def parse_comma_list(args):
-    return [float(s.strip()) for s in args.split(',')]
-
-def eval_each(model_fn, model_dir, model_scope, run_config):
-    fashionAI = tf.estimator.Estimator(
-        model_fn=model_fn, model_dir=model_dir, config=run_config,
-        params={
-            'train_image_size': FLAGS.train_image_size,
-            'heatmap_size': FLAGS.heatmap_size,
-            'data_format': FLAGS.data_format,
-            'model_scope': model_scope,
-            'flip_on_test': FLAGS.flip_on_test,
-        })
-    #tf.logging.info('params recv: %s', FLAGS.flag_values_dict())
-
-    tensors_to_log = {
-        'cur_file': 'current_file'
-    }
-
-    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=FLAGS.log_every_n_steps, formatter=lambda dicts: ', '.join(['%s=%s' % (k, v) for k, v in dicts.items()]))
-    tf.logging.info('Starting to predict model {}.'.format(model_scope))
-    pred_results = fashionAI.predict(input_fn=lambda : input_pipeline(model_scope), hooks=[logging_hook], checkpoint_path=train_helper.get_latest_checkpoint_for_evaluate_(model_dir, model_dir))
-    #tf.logging.info()
-    return list(pred_results)
-
-def main(_):
-    # Using the Winograd non-fused algorithms provides a small performance boost.
-    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
-
-    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = FLAGS.gpu_memory_fraction)
-    sess_config = tf.ConfigProto(allow_soft_placement = True, log_device_placement = False, intra_op_parallelism_threads = FLAGS.num_cpu_threads, inter_op_parallelism_threads = FLAGS.num_cpu_threads, gpu_options = gpu_options)
-
-    # Set up a RunConfig to only save checkpoints once per training cycle.
-    run_config = tf.estimator.RunConfig().replace(
-                                        save_checkpoints_secs=None).replace(
-                                        save_checkpoints_steps=None).replace(
-                                        save_summary_steps=FLAGS.save_summary_steps).replace(
-                                        keep_checkpoint_max=5).replace(
-                                        tf_random_seed=FLAGS.tf_random_seed).replace(
-                                        log_step_count_steps=FLAGS.log_every_n_steps).replace(
-                                        session_config=sess_config)
-
-    model_to_eval = [s.strip() for s in FLAGS.model_to_eval.split(',')]
-    for m in model_to_eval:
-        if m == '': continue
-        pred_results = eval_each(keypoint_model_fn, os.path.join(FLAGS.model_dir, m), m, run_config)
-        #print(pred_results)
-        # collect result
-        df = pd.DataFrame(columns=['image_id', 'image_category'] + config.all_keys)
-        cur_record = 0
-        gloabl2local_ind = dict(zip(config.class2global_ind_map[m], list(range(len(config.class2global_ind_map[m]))) ))
-        #print(gloabl2local_ind)
-        for pred_item in pred_results:
-            temp_list = []
-            index = 0
-            x = pred_item['pred_x'].tolist()
-            y = pred_item['pred_y'].tolist()
-            filename = pred_item['file_name'].decode('utf8')
-            for ind in list(range(config.class_num_joints['*'])):
-                if ind in gloabl2local_ind:
-                    temp_list.append('{}_{}_1'.format(round(x[gloabl2local_ind[ind]]), round(y[gloabl2local_ind[ind]])))
-                else:
-                    temp_list.append('-1_-1_-1')
-            #Images/blouse/ab669925e96490ec698af976586f0b2f.jpg
-            df.loc[cur_record] = [filename, m] + temp_list
-            cur_record = cur_record + 1
-        df.to_csv('./{}.csv'.format(m), encoding='utf-8', index=False)
-
-    # merge dataframe
-    df_list = [pd.read_csv('./{}.csv'.format(model_to_eval[0]), encoding='utf-8')]
-    for m in model_to_eval[1:]:
-        if m == '': continue
-        df_list.append(pd.read_csv('./{}.csv'.format(m), encoding='utf-8'))
-    pd.concat(df_list, ignore_index=True).to_csv('./sub.csv', encoding='utf-8', index=False)
-
-    if FLAGS.run_on_cloud:
-        tf.gfile.Copy('./sub.csv', os.path.join(FLAGS.model_dir, 'sub.csv'), overwrite=True)
-
-if __name__ == '__main__':
-  tf.logging.set_verbosity(tf.logging.INFO)
-  tf.app.run()
diff --git a/eval_senet_cpn.py b/eval_senet_cpn.py
deleted file mode 100644
index bf812a40..00000000
--- a/eval_senet_cpn.py
+++ /dev/null
@@ -1,451 +0,0 @@
-# Copyright 2018 Changan Wang
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-#     http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import sys
-import numpy as np
-import pandas as pd
-#from scipy.misc import imread, imsave, imshow, imresize
-import tensorflow as tf
-
-from net import seresnet_cpn as cpn
-from utility import train_helper
-
-from preprocessing import preprocessing
-from preprocessing import dataset
-import config
-#--num_readers=2 --num_preprocessing_threads=2 --data_dir=/media/disk/keypoint/tfrecords --model_to_train=all, blouse
-# hardware related configuration
-tf.app.flags.DEFINE_integer(
-    'num_readers', 16,
-    'The number of parallel readers that read data from the dataset.')
-tf.app.flags.DEFINE_integer(
-    'num_preprocessing_threads', 48,
-    'The number of threads used to create the batches.')
-tf.app.flags.DEFINE_integer(
-    'num_cpu_threads', 0,
-    'The number of cpu cores used to train.')
-tf.app.flags.DEFINE_float(
-    'gpu_memory_fraction', 1., 'GPU memory fraction to use.')
-# scaffold related configuration
-tf.app.flags.DEFINE_string(
-    'data_dir', '../Datasets/tfrecords_test',#tfrecords_test_stage1_b tfrecords_test
-    'The directory where the dataset input data is stored.')
-tf.app.flags.DEFINE_string(
-    'dataset_name', '{}_*.tfrecord', 'The pattern of the dataset name to load.')
-tf.app.flags.DEFINE_string(
-    'model_dir', './logs_sext_cpn/',
-    'The parent directory where the model will be stored.')
-tf.app.flags.DEFINE_integer(
-    'log_every_n_steps', 10,
-    'The frequency with which logs are print.')
-tf.app.flags.DEFINE_integer(
-    'save_summary_steps', 100,
-    'The frequency with which summaries are saved, in seconds.')
-# model related configuration
-tf.app.flags.DEFINE_integer(
-    'train_image_size', 384,
-    'The size of the input image for the model to use.')
-tf.app.flags.DEFINE_integer(
-    'heatmap_size', 96,
-    'The size of the output heatmap of the model.')
-tf.app.flags.DEFINE_string(
-    'backbone', 'seresnext50',#or seresnext50 seresnet50
-    'The backbone network to use for feature pyramid.')
-tf.app.flags.DEFINE_float(
-    'heatmap_sigma', 1.,
-    'The sigma of Gaussian which generate the target heatmap.')
-tf.app.flags.DEFINE_float(
-    'bbox_border', 25.,
-    'The nearest distance of the crop border to al keypoints.')
-tf.app.flags.DEFINE_string(
-    'data_format', 'channels_last', # 'channels_first' or 'channels_last'
-    'A flag to override the data format used in the model. channels_first '
-    'provides a performance boost on GPU but is not always compatible '
-    'with CPU. If left unspecified, the data format will be chosen '
-    'automatically based on whether TensorFlow was built for CPU or GPU.')
-tf.app.flags.DEFINE_integer(
-    'tf_random_seed', 20180417, 'Random seed for TensorFlow initializers.')
-# checkpoint related configuration
-tf.app.flags.DEFINE_string(
-    'checkpoint_path', None,
-    'The path to a checkpoint from which to fine-tune.')
-tf.app.flags.DEFINE_string(
-    'coarse_pred_path', None,
-    'The path to a pred csv file from which to crop the input image for finer prediction.')
-tf.app.flags.DEFINE_boolean(
-    'flip_on_test', False,
-    'Wether we will average predictions of left-right fliped image.')
-tf.app.flags.DEFINE_string(
-    #'blouse', 'dress', 'outwear', 'skirt', 'trousers', 'all'
-    'model_scope', 'blouse',
-    'Model scope name used to replace the name_scope in checkpoint.')
-tf.app.flags.DEFINE_boolean(
-    'run_on_cloud', True,
-    'Wether we will train on cloud.')
-tf.app.flags.DEFINE_string(
-    'model_to_eval', 'blouse, dress, outwear, skirt, trousers', #'all, blouse, dress, outwear, skirt, trousers', 'skirt, dress, outwear, trousers',
-    'The sub-model to eval (comma-separated list).')
-
-#--model_scope=blouse --checkpoint_path=./logs/blouse
-FLAGS = tf.app.flags.FLAGS
-
-def preprocessing_fn(org_image, file_name, shape):
-  pd_df = None
-  if FLAGS.coarse_pred_path is not None:
-    tf.logging.info('Finetuning Prediction From {}.'.format(FLAGS.coarse_pred_path))
-    if tf.gfile.Exists(FLAGS.coarse_pred_path):
-      tf.gfile.Copy(FLAGS.coarse_pred_path, './__coarse_pred.csv', overwrite=True)
-      pd_df = pd.read_csv('./__coarse_pred.csv', encoding='utf-8')
-
-      all_filenames = []
-      all_xmin = []
-      all_ymin = []
-      all_xmax = []
-      all_ymax = []
-
-      all_values = pd_df.values.tolist()
-      for records in all_values:
-        all_filenames.append(records[0].encode('utf8'))
-        xmin = 2000
-        ymin = 2000
-        xmax = -1
-        ymax = -1
-        for kp in records[2:]:
-          keypoint_info = kp.strip().split('_')
-          if int(keypoint_info[2]) == -1:
-            continue
-          xmin = min(xmin, int(keypoint_info[0]))
-          ymin = min(ymin, int(keypoint_info[1]))
-          xmax = max(xmax, int(keypoint_info[0]))
-          ymax = max(ymax, int(keypoint_info[1]))
-        all_xmin.append(xmin)
-        all_ymin.append(ymin)
-        all_xmax.append(xmax)
-        all_ymax.append(ymax)
-      #print(all_filenames, all_xmin, all_ymin, all_xmax, all_ymax)
-      xmin_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(all_filenames, dtype=tf.string), tf.constant(all_xmin, dtype=tf.int64)), -1)
-      ymin_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(all_filenames, dtype=tf.string), tf.constant(all_ymin, dtype=tf.int64)), -1)
-      xmax_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(all_filenames, dtype=tf.string), tf.constant(all_xmax, dtype=tf.int64)), -1)
-      ymax_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(all_filenames, dtype=tf.string), tf.constant(all_ymax, dtype=tf.int64)), -1)
-      pd_df = [xmin_table, ymin_table, xmax_table, ymax_table]
-  #pred_item['file_name'].encode('utf8')
-
-  #lnorm_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(config.global_norm_key, dtype=tf.int64), tf.constant(config.global_norm_lvalues, dtype=tf.int64)), 0)
-  return preprocessing.preprocess_for_test(org_image, file_name, shape, FLAGS.train_image_size, FLAGS.train_image_size, data_format=('NCHW' if FLAGS.data_format=='channels_first' else 'NHWC'), bbox_border=FLAGS.bbox_border, heatmap_sigma=FLAGS.heatmap_sigma, heatmap_size=FLAGS.heatmap_size, pred_df=pd_df)
-def input_pipeline(model_scope=FLAGS.model_scope):
-    #preprocessing_fn = lambda org_image, shape: preprocessing.preprocess_for_test(org_image, shape, FLAGS.train_image_size, FLAGS.train_image_size, data_format=('NCHW' if FLAGS.data_format=='channels_first' else 'NHWC'), bbox_border=FLAGS.bbox_border, heatmap_sigma=FLAGS.heatmap_sigma, heatmap_size=FLAGS.heatmap_size)
-
-    images, shape, file_name, classid, offsets = dataset.slim_test_get_split(FLAGS.data_dir, preprocessing_fn, FLAGS.num_readers, FLAGS.num_preprocessing_threads, file_pattern=FLAGS.dataset_name, category=(model_scope if 'all' not in model_scope else '*'), reader=None)
-
-    return {'images': images, 'shape': shape, 'classid': classid, 'file_name': file_name, 'pred_offsets': offsets}
-
-if config.PRED_DEBUG:
-  from scipy.misc import imread, imsave, imshow, imresize
-  def save_image_with_heatmap(image, height, width, heatmap_size, heatmap, predictions, indR, indG, indB):
-      if not hasattr(save_image_with_heatmap, "counter"):
-          save_image_with_heatmap.counter = 0  # it doesn't exist yet, so initialize it
-      save_image_with_heatmap.counter += 1
-
-      img_to_save = np.array(image.tolist()) + 120
-      #print(img_to_save)
-
-      img_to_save = img_to_save.astype(np.uint8)
-
-      heatmap0 = np.sum(heatmap[indR, ...], axis=0).astype(np.uint8)
-      heatmap1 = np.sum(heatmap[indG, ...], axis=0).astype(np.uint8)
-      heatmap2 = np.sum(heatmap[indB, ...], axis=0).astype(np.uint8) if len(indB) > 0 else np.zeros((heatmap_size, heatmap_size), dtype=np.float32)
-
-      img_to_save = imresize(img_to_save, (height, width), interp='lanczos')
-      heatmap0 = imresize(heatmap0, (height, width), interp='lanczos')
-      heatmap1 = imresize(heatmap1, (height, width), interp='lanczos')
-      heatmap2 = imresize(heatmap2, (height, width), interp='lanczos')
-
-      img_to_save = img_to_save/2
-      img_to_save[:,:,0] = np.clip((img_to_save[:,:,0] + heatmap0 + heatmap2), 0, 255)
-      img_to_save[:,:,1] = np.clip((img_to_save[:,:,1] + heatmap1 + heatmap2), 0, 255)
-      #img_to_save[:,:,2] = np.clip((img_to_save[:,:,2]/4. + heatmap2), 0, 255)
-      file_name = 'with_heatmap_{}.jpg'.format(save_image_with_heatmap.counter)
-      imsave(os.path.join(config.EVAL_DEBUG_DIR, file_name), img_to_save.astype(np.uint8))
-
-      predictions = np.array(predictions.tolist())
-      #print(predictions.shape)
-      for ind in range(predictions.shape[0]):
-        img = predictions[ind]
-        img = img - img.min()
-        img *= 255.0/img.max()
-        file_name = 'heatmap_{}_{}.jpg'.format(save_image_with_heatmap.counter, ind)
-        imsave(os.path.join(config.EVAL_DEBUG_DIR, file_name), img.astype(np.uint8))
-      return save_image_with_heatmap.counter
-
-def gaussian_blur(inputs, inputs_filters, sigma, data_format, name=None):
-    with tf.name_scope(name, "gaussian_blur", [inputs]):
-        data_format_ = 'NHWC' if data_format=='channels_last' else 'NCHW'
-        if data_format_ == 'NHWC':
-            inputs = tf.transpose(inputs, [0, 2, 3, 1])
-        ksize = int(6 * sigma + 1.)
-        x = tf.expand_dims(tf.range(ksize, delta=1, dtype=tf.float32), axis=1)
-        y = tf.transpose(x, [1, 0])
-        kernel_matrix = tf.exp(- ((x - ksize/2.) ** 2 + (y - ksize/2.) ** 2) / (2 * sigma ** 2))
-        #print(kernel_matrix)
-        kernel_filter = tf.reshape(kernel_matrix, [ksize, ksize, 1, 1])
-        kernel_filter = tf.tile(kernel_filter, [1, 1, inputs_filters, 1])
-        #kernel_filter = tf.transpose(kernel_filter, [1, 0, 2, 3])
-        outputs = tf.nn.depthwise_conv2d(inputs, kernel_filter, strides=[1, 1, 1, 1], padding='SAME', data_format=data_format_, name='blur')
-        if data_format_ == 'NHWC':
-            outputs = tf.transpose(outputs, [0, 3, 1, 2])
-        return outputs
-
-def get_keypoint(image, predictions, heatmap_size, height, width, category, clip_at_zero=True, data_format='channels_last', name=None):
-    # expand_border = 10
-
-    # pad_pred = tf.pad(predictions, tf.constant([[0, 0], [0, 0], [expand_border, expand_border], [expand_border, expand_border]]),
-    #               mode='CONSTANT', name='pred_padding', constant_values=0)
-
-    # blur_pred = gaussian_blur(pad_pred, config.class_num_joints[category], 3.5, 'channels_first', 'pred_blur')
-
-    # predictions = tf.slice(blur_pred, [0, 0, expand_border, expand_border], [1, config.class_num_joints[category], heatmap_size, heatmap_size])
-    predictions = tf.reshape(predictions, [1, -1, heatmap_size*heatmap_size])
-
-    pred_max = tf.reduce_max(predictions, axis=-1)
-    pred_max_indices = tf.argmax(predictions, axis=-1)
-    pred_max_x, pred_max_y = tf.cast(tf.floormod(pred_max_indices, heatmap_size), tf.float32), tf.cast(tf.floordiv(pred_max_indices, heatmap_size), tf.float32)
-    # mask the max elements to zero
-    mask_predictions = predictions * tf.one_hot(pred_max_indices, heatmap_size*heatmap_size, on_value=0., off_value=1., dtype=tf.float32)
-    # get the second max prediction
-    pred_next_max = tf.reduce_max(mask_predictions, axis=-1)
-    pred_next_max_indices = tf.argmax(mask_predictions, axis=-1)
-    pred_next_max_x, pred_next_max_y = tf.cast(tf.floormod(pred_next_max_indices, heatmap_size), tf.float32), tf.cast(tf.floordiv(pred_next_max_indices, heatmap_size), tf.float32)
-
-    dist = tf.pow(tf.pow(pred_next_max_x - pred_max_x, 2.) + tf.pow(pred_next_max_y - pred_max_y, 2.), .5)
-
-    pred_x = tf.where(dist < 1e-3, pred_max_x, pred_max_x + (pred_next_max_x - pred_max_x) * 0.25 / dist)
-    pred_y = tf.where(dist < 1e-3, pred_max_y, pred_max_y + (pred_next_max_y - pred_max_y) * 0.25 / dist)
-
-    pred_indices_ = tf.squeeze(tf.cast(pred_x, tf.int64) + tf.cast(pred_y, tf.int64) * heatmap_size)
-
-    width, height = tf.cast(width, tf.float32), tf.cast(height, tf.float32)
-    width_ratio, height_ratio = width / tf.cast(heatmap_size, tf.float32), height / tf.cast(heatmap_size, tf.float32)
-
-    pred_x, pred_y = pred_x * width_ratio, pred_y * height_ratio
-    #pred_x, pred_y = pred_x * width_ratio + width_ratio/2., pred_y * height_ratio + height_ratio/2.
-
-    if clip_at_zero:
-      pred_x, pred_y =  pred_x * tf.cast(pred_max>0, tf.float32), pred_y * tf.cast(pred_max>0, tf.float32)
-      pred_x = pred_x * tf.cast(pred_max>0, tf.float32) + tf.cast(pred_max<=0, tf.float32) * (width / 2.)
-      pred_y = pred_y * tf.cast(pred_max>0, tf.float32) + tf.cast(pred_max<=0, tf.float32) * (height / 2.)
-
-    if config.PRED_DEBUG:
-      image_ = tf.squeeze(image) * 255.
-      pred_heatmap = tf.one_hot(pred_indices_, heatmap_size*heatmap_size, on_value=255, off_value=0, axis=-1, dtype=tf.int32)
-
-      pred_heatmap = tf.reshape(pred_heatmap, [-1, heatmap_size, heatmap_size])
-      if data_format == 'channels_first':
-        image_ = tf.transpose(image_, perm=(1, 2, 0))
-      save_image_op = tf.py_func(save_image_with_heatmap,
-                                  [image_, height, width,
-                                  heatmap_size,
-                                  pred_heatmap,
-                                  tf.reshape(predictions, [-1, heatmap_size, heatmap_size]),
-                                  config.left_right_group_map[category][0],
-                                  config.left_right_group_map[category][1],
-                                  config.left_right_group_map[category][2]],
-                                  tf.int64, stateful=True)
-      with tf.control_dependencies([save_image_op]):
-        pred_x, pred_y = pred_x * 1., pred_y * 1.
-    return pred_x, pred_y
-
-def get_keypoint_v0(image, predictions, heatmap_size, height, width, category, clip_at_zero=True, data_format='channels_last', name=None):
-    predictions = tf.reshape(predictions, [1, -1, heatmap_size*heatmap_size])
-
-    pred_max = tf.reduce_max(predictions, axis=-1)
-    pred_indices = tf.argmax(predictions, axis=-1)
-    pred_x, pred_y = tf.cast(tf.floormod(pred_indices, heatmap_size), tf.float32), tf.cast(tf.floordiv(pred_indices, heatmap_size), tf.float32)
-
-    width, height = tf.cast(width, tf.float32), tf.cast(height, tf.float32)
-    pred_x, pred_y = pred_x * width / tf.cast(heatmap_size, tf.float32), pred_y * height / tf.cast(heatmap_size, tf.float32)
-
-    if clip_at_zero:
-      pred_x, pred_y =  pred_x * tf.cast(pred_max>0, tf.float32), pred_y * tf.cast(pred_max>0, tf.float32)
-      pred_x = pred_x * tf.cast(pred_max>0, tf.float32) + tf.cast(pred_max<=0, tf.float32) * (width / 2.)
-      pred_y = pred_y * tf.cast(pred_max>0, tf.float32) + tf.cast(pred_max<=0, tf.float32) * (height / 2.)
-
-    if config.PRED_DEBUG:
-      pred_indices_ = tf.squeeze(pred_indices)
-      image_ = tf.squeeze(image) * 255.
-      pred_heatmap = tf.one_hot(pred_indices_, heatmap_size*heatmap_size, on_value=255, off_value=0, axis=-1, dtype=tf.int32)
-
-      pred_heatmap = tf.reshape(pred_heatmap, [-1, heatmap_size, heatmap_size])
-      if data_format == 'channels_first':
-        image_ = tf.transpose(image_, perm=(1, 2, 0))
-      save_image_op = tf.py_func(save_image_with_heatmap,
-                                  [image_, height, width,
-                                  heatmap_size,
-                                  pred_heatmap,
-                                  tf.reshape(predictions, [-1, heatmap_size, heatmap_size]),
-                                  config.left_right_group_map[category][0],
-                                  config.left_right_group_map[category][1],
-                                  config.left_right_group_map[category][2]],
-                                  tf.int64, stateful=True)
-      with tf.control_dependencies([save_image_op]):
-        pred_x, pred_y = pred_x * 1., pred_y * 1.
-    return pred_x, pred_y
-
-cpn_backbone = cpn.cascaded_pyramid_net
-if 'seresnext50' in FLAGS.backbone:
-    cpn_backbone = cpn.xt_cascaded_pyramid_net
-
-def keypoint_model_fn(features, labels, mode, params):
-    #print(features)
-    shape = features['shape']
-    classid = features['classid']
-    pred_offsets = tf.to_float(features['pred_offsets'])
-    file_name = features['file_name']
-    features = features['images']
-
-    file_name = tf.identity(file_name, name='current_file')
-
-    if not params['flip_on_test']:
-        with tf.variable_scope(params['model_scope'], default_name=None, values=[features], reuse=tf.AUTO_REUSE):
-            pred_outputs = cpn_backbone(features, config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')], params['heatmap_size'], (mode == tf.estimator.ModeKeys.TRAIN), params['data_format'])
-        if params['data_format'] == 'channels_last':
-            pred_outputs = [tf.transpose(pred_outputs[ind], [0, 3, 1, 2], name='outputs_trans_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
-    else:
-        # test augumentation on the fly
-        if params['data_format'] == 'channels_last':
-            double_features = tf.reshape(tf.stack([features, tf.map_fn(tf.image.flip_left_right, features, back_prop=False)], axis = 1), [-1, params['train_image_size'], params['train_image_size'], 3])
-        else:
-            double_features = tf.reshape(tf.stack([features, tf.transpose(tf.map_fn(tf.image.flip_left_right, tf.transpose(features, [0, 2, 3, 1], name='nchw2nhwc'), back_prop=False), [0, 3, 1, 2], name='nhwc2nchw')], axis = 1), [-1, 3, params['train_image_size'], params['train_image_size']])
-
-        num_joints = config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')]
-        with tf.variable_scope(params['model_scope'], default_name=None, values=[double_features], reuse=tf.AUTO_REUSE):
-            pred_outputs = cpn_backbone(double_features, config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')], params['heatmap_size'], (mode == tf.estimator.ModeKeys.TRAIN), params['data_format'])
-
-        if params['data_format'] == 'channels_last':
-            pred_outputs = [tf.transpose(pred_outputs[ind], [0, 3, 1, 2], name='outputs_trans_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
-        # [[0, 0, 0, ..], [1, 1, 1, ...], ...]
-        row_indices = tf.tile(tf.reshape(tf.range(tf.shape(double_features)[0]), [-1, 1]), [1, num_joints])
-        # [[0, 1, 2, ...], [1, 0, 2, ...], [0, 1, 2], [1, 0, 2], ...]
-        col_indices = tf.reshape(tf.tile(tf.reshape(tf.stack([tf.range(num_joints), tf.constant(config.left_right_remap[(params['model_scope'] if 'all' not in params['model_scope'] else '*')])], axis=0), [-1]), [tf.shape(features)[0]]), [-1, num_joints])
-        # [[[0, 0], [0, 1], [0, 2], ...], [[1, 1], [1, 0], [1, 2], ...], [[2, 0], [2, 1], [2, 2], ...], ...]
-        flip_indices=tf.stack([row_indices, col_indices], axis=-1)
-
-        #flip_indices = tf.Print(flip_indices, [flip_indices], summarize=500)
-        pred_outputs = [tf.gather_nd(pred_outputs[ind], flip_indices, name='gather_nd_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
-
-        def cond_flip(heatmap_ind):
-            return tf.cond(heatmap_ind[1] < 1, lambda : heatmap_ind[0], lambda : tf.transpose(tf.image.flip_left_right(tf.transpose(heatmap_ind[0], [1, 2, 0], name='pred_nchw2nhwc')), [2, 0, 1], name='pred_nhwc2nchw'))
-        # all the heatmap of the fliped image should also be fliped back
-        pred_outputs = [tf.map_fn(cond_flip, [pred_outputs[ind], tf.tile(tf.reshape(tf.range(2), [-1]), [tf.shape(features)[0]])], dtype=tf.float32, parallel_iterations=10, back_prop=True, swap_memory=False, infer_shape=True, name='map_fn_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
-        # average predictions of left_reight_fliped image
-        segment_indices = tf.reshape(tf.tile(tf.reshape(tf.range(tf.shape(features)[0]), [-1, 1]), [1, 2]), [-1])
-        pred_outputs = [tf.segment_mean(pred_outputs[ind], segment_indices, name='segment_mean_{}'.format(ind)) for ind in list(range(len(pred_outputs)))]
-
-    pred_x, pred_y = get_keypoint(features, pred_outputs[-1], params['heatmap_size'], shape[0][0], shape[0][1], (params['model_scope'] if 'all' not in params['model_scope'] else '*'), clip_at_zero=True, data_format=params['data_format'])
-
-    predictions = {'pred_x': pred_x + pred_offsets[:, 0], 'pred_y': pred_y + pred_offsets[:, 1], 'file_name': file_name}
-
-    if mode == tf.estimator.ModeKeys.PREDICT:
-        return tf.estimator.EstimatorSpec(
-                              mode=mode,
-                              predictions=predictions,
-                              loss=None, train_op=None)
-    else:
-        raise ValueError('Only "PREDICT" mode is supported.')
-
-def parse_comma_list(args):
-    return [float(s.strip()) for s in args.split(',')]
-
-def eval_each(model_fn, model_dir, model_scope, run_config):
-    fashionAI = tf.estimator.Estimator(
-        model_fn=model_fn, model_dir=model_dir, config=run_config,
-        params={
-            'train_image_size': FLAGS.train_image_size,
-            'heatmap_size': FLAGS.heatmap_size,
-            'data_format': FLAGS.data_format,
-            'model_scope': model_scope,
-            'flip_on_test': FLAGS.flip_on_test,
-        })
-    #tf.logging.info('params recv: %s', FLAGS.flag_values_dict())
-
-    tensors_to_log = {
-        'cur_file': 'current_file'
-    }
-
-    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=FLAGS.log_every_n_steps, formatter=lambda dicts: ', '.join(['%s=%s' % (k, v) for k, v in dicts.items()]))
-    tf.logging.info('Starting to predict model {}.'.format(model_scope))
-    pred_results = fashionAI.predict(input_fn=lambda : input_pipeline(model_scope), hooks=[logging_hook], checkpoint_path=train_helper.get_latest_checkpoint_for_evaluate_(model_dir, model_dir))
-    #tf.logging.info()
-    return list(pred_results)
-
-def main(_):
-    # Using the Winograd non-fused algorithms provides a small performance boost.
-    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
-
-    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = FLAGS.gpu_memory_fraction)
-    sess_config = tf.ConfigProto(allow_soft_placement = True, log_device_placement = False, intra_op_parallelism_threads = FLAGS.num_cpu_threads, inter_op_parallelism_threads = FLAGS.num_cpu_threads, gpu_options = gpu_options)
-
-    # Set up a RunConfig to only save checkpoints once per training cycle.
-    run_config = tf.estimator.RunConfig().replace(
-                                        save_checkpoints_secs=None).replace(
-                                        save_checkpoints_steps=None).replace(
-                                        save_summary_steps=FLAGS.save_summary_steps).replace(
-                                        keep_checkpoint_max=5).replace(
-                                        tf_random_seed=FLAGS.tf_random_seed).replace(
-                                        log_step_count_steps=FLAGS.log_every_n_steps).replace(
-                                        session_config=sess_config)
-
-    model_to_eval = [s.strip() for s in FLAGS.model_to_eval.split(',')]
-    for m in model_to_eval:
-        if m == '': continue
-        pred_results = eval_each(keypoint_model_fn, os.path.join(FLAGS.model_dir, m), m, run_config)
-        #print(pred_results)
-        # collect result
-        df = pd.DataFrame(columns=['image_id', 'image_category'] + config.all_keys)
-        cur_record = 0
-        gloabl2local_ind = dict(zip(config.class2global_ind_map[m], list(range(len(config.class2global_ind_map[m]))) ))
-        #print(gloabl2local_ind)
-        for pred_item in pred_results:
-            temp_list = []
-            index = 0
-            x = pred_item['pred_x'].tolist()
-            y = pred_item['pred_y'].tolist()
-            filename = pred_item['file_name'].decode('utf8')
-            for ind in list(range(config.class_num_joints['*'])):
-                if ind in gloabl2local_ind:
-                    temp_list.append('{}_{}_1'.format(round(x[gloabl2local_ind[ind]]), round(y[gloabl2local_ind[ind]])))
-                else:
-                    temp_list.append('-1_-1_-1')
-            #Images/blouse/ab669925e96490ec698af976586f0b2f.jpg
-            df.loc[cur_record] = [filename, m] + temp_list
-            cur_record = cur_record + 1
-        df.to_csv('./{}.csv'.format(m), encoding='utf-8', index=False)
-
-    # merge dataframe
-    df_list = [pd.read_csv('./{}.csv'.format(model_to_eval[0]), encoding='utf-8')]
-    for m in model_to_eval[1:]:
-        if m == '': continue
-        df_list.append(pd.read_csv('./{}.csv'.format(m), encoding='utf-8'))
-    pd.concat(df_list, ignore_index=True).to_csv('./sub.csv', encoding='utf-8', index=False)
-
-    if FLAGS.run_on_cloud:
-        tf.gfile.Copy('./sub.csv', os.path.join(FLAGS.model_dir, 'sub.csv'), overwrite=True)
-
-if __name__ == '__main__':
-  tf.logging.set_verbosity(tf.logging.INFO)
-  tf.app.run()
diff --git a/net/hourglass_old.py b/net/hourglass_old.py
deleted file mode 100644
index 68b69c2b..00000000
--- a/net/hourglass_old.py
+++ /dev/null
@@ -1,209 +0,0 @@
-# Copyright 2018 Changan Wang
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-#     http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-_BATCH_NORM_DECAY = 0.9
-_BATCH_NORM_EPSILON = 1e-5
-_USE_FUSED_BN = True
-#initializer_to_use = tf.glorot_uniform_initializer
-initializer_to_use = tf.glorot_normal_initializer
-conv_bn_initializer_to_use = tf.glorot_normal_initializer#lambda : tf.truncated_normal_initializer(mean=0.0, stddev=0.005)
-
-def batch_norm_relu(inputs, is_training, data_format, name=None):
-  """Performs a batch normalization followed by a ReLU."""
-  # We set fused=True for a significant performance boost. See
-  # https://www.tensorflow.org/performance/performance_guide#common_fused_ops
-  inputs = tf.layers.batch_normalization(
-            inputs=inputs, axis=1 if data_format == 'channels_first' else 3,
-            momentum=_BATCH_NORM_DECAY, epsilon=_BATCH_NORM_EPSILON, center=True,
-            scale=True, training=is_training, fused=_USE_FUSED_BN, name=name)
-  inputs = tf.nn.relu(inputs, name=name + '/relu' if name is not None else None)
-  return inputs
-
-def batch_norm(inputs, is_training, data_format, name=None):
-  """Performs a batch normalization followed by a ReLU."""
-  # We set fused=True for a significant performance boost. See
-  # https://www.tensorflow.org/performance/performance_guide#common_fused_ops
-  inputs = tf.layers.batch_normalization(
-            inputs=inputs, axis=1 if data_format == 'channels_first' else 3,
-            momentum=_BATCH_NORM_DECAY, epsilon=_BATCH_NORM_EPSILON, center=True,
-            scale=True, training=is_training, fused=_USE_FUSED_BN, name=name)
-  return inputs
-
-def fixed_padding(inputs, kernel_size, data_format):
-  pad_total = kernel_size - 1
-  pad_beg = pad_total // 2
-  pad_end = pad_total - pad_beg
-
-  if data_format == 'channels_first':
-    padded_inputs = tf.pad(inputs, [[0, 0], [0, 0],
-                                    [pad_beg, pad_end], [pad_beg, pad_end]])
-  else:
-    padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end],
-                                    [pad_beg, pad_end], [0, 0]])
-  return padded_inputs
-
-# this is only can be used before BN
-def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format, kernel_initializer=conv_bn_initializer_to_use, name=None):
-  """Strided 2-D convolution with explicit padding."""
-  # The padding is consistent and is based only on `kernel_size`, not on the
-  # dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
-  if strides > 1:
-    inputs = fixed_padding(inputs, kernel_size, data_format)
-
-  return tf.layers.conv2d(
-            inputs=inputs, filters=filters, kernel_size=kernel_size, strides=strides,
-            padding=('same' if strides == 1 else 'valid'), use_bias=False,
-            kernel_initializer=kernel_initializer(),
-            data_format=data_format, name=name)
-
-
-def bottleneck_block(inputs, in_filters, out_filters, is_training, data_format, name=None):
-  shortcut = inputs
-  inputs = batch_norm_relu(inputs, is_training, data_format, name=None if name is None else name+'_bn1')
-
-  # The projection shortcut should come after the first batch norm and ReLU
-  # since it performs a 1x1 convolution.
-  # different from original hourglass
-  if in_filters != out_filters:
-    shortcut = conv2d_fixed_padding(
-                inputs=inputs, filters=out_filters, kernel_size=1, strides=1,
-                data_format=data_format, name=None if name is None else name+'_skip')
-
-  inputs = conv2d_fixed_padding(
-      inputs=inputs, filters=out_filters//2, kernel_size=1, strides=1,
-      data_format=data_format, name=None if name is None else name+'_1x1_down')
-  inputs = batch_norm_relu(inputs, is_training, data_format, name=None if name is None else name+'_bn2')
-
-  inputs = conv2d_fixed_padding(
-      inputs=inputs, filters=out_filters//2, kernel_size=3, strides=1,
-      data_format=data_format, name=None if name is None else name+'_3x3_conv')
-  inputs = batch_norm_relu(inputs, is_training, data_format, name=None if name is None else name+'_bn3')
-
-  inputs = conv2d_fixed_padding(
-      inputs=inputs, filters=out_filters, kernel_size=1, strides=1,
-      data_format=data_format, name=None if name is None else name+'_1x1_up')
-
-  return tf.add(inputs, shortcut, name=None if name is None else name+'_elem_add')
-
-def dozen_bottleneck_blocks(inputs, in_filters, out_filters, num_modules, is_training, data_format, name=None):
-  for m in range(num_modules):
-    inputs = bottleneck_block(inputs, in_filters, out_filters, is_training, data_format, name=None if name is None else name.format(m))
-
-  return inputs
-
-def hourglass(inputs, filters, is_training, data_format, deep_index=1, num_modules=1, name=None):
-  upchannal1 = dozen_bottleneck_blocks(inputs, filters, filters, num_modules, is_training, data_format, name=None if name is None else name+'_up_{}')
-  # upchannal1 = inputs
-  # for m in range(num_modules):
-  #   upchannal1 = bottleneck_block(upchannal1, filters, filters, is_training, data_format, name=None if name is None else name+'_up_{}'.format(m))
-
-  downchannal1 = tf.layers.max_pooling2d(inputs=inputs, pool_size=2, strides=2, padding='valid',
-          data_format=data_format, name=None if name is None else name+'_down_pool')
-
-  downchannal1 = dozen_bottleneck_blocks(downchannal1, filters, filters, num_modules, is_training, data_format, name=None if name is None else name+'_down1_{}')
-  # for m in range(num_modules):
-  #   downchannal1 = bottleneck_block(downchannal1, filters, filters, is_training, data_format, name=None if name is None else name+'_down1_{}'.format(m))
-
-  if deep_index > 1:
-    downchannal2 = hourglass(downchannal1, filters, is_training, data_format, deep_index=deep_index-1, num_modules=num_modules, name=None if name is None else name+'_inner_{}'.format(deep_index))
-  else:
-    downchannal2 = dozen_bottleneck_blocks(downchannal1, filters, filters, num_modules, is_training, data_format, name=None if name is None else name+'_down2_{}')
-    # downchannal2 = downchannal1
-    # for m in range(num_modules):
-    #   downchannal2 = bottleneck_block(downchannal2, filters, filters, is_training, data_format, name=None if name is None else name+'_down2_{}'.format(m))
-
-  downchannal3 = dozen_bottleneck_blocks(downchannal2, filters, filters, num_modules, is_training, data_format, name=None if name is None else name+'_down3_{}')
-  # downchannal3 = downchannal2
-  # for m in range(num_modules):
-  #   downchannal3 = bottleneck_block(downchannal3, filters, filters, is_training, data_format, name=None if name is None else name+'_down3_{}'.format(m))
-
-  if data_format == 'channels_first':
-      downchannal3 = tf.transpose(downchannal3, [0, 2, 3, 1], name=None if name is None else name+'_trans')
-  # for visualise
-  with tf.name_scope(name+'_get_shape', "get_shape", [downchannal3]) as scope:
-    input_shape = tf.shape(downchannal3)[-3:-1] * 2
-  upchannal2 = tf.image.resize_bilinear(downchannal3, input_shape, name=None if name is None else name+'_resize')
-  if data_format == 'channels_first':
-    upchannal2 = tf.transpose(upchannal2, [0, 3, 1, 2], name=None if name is None else name+'_trans_inv')
-
-  return tf.add(upchannal1, upchannal2, name=None if name is None else name+'_elem_add')
-
-def create_model(inputs, num_stack, feat_channals, output_channals, num_modules, is_training, data_format):
-  inputs = conv2d_fixed_padding(inputs=inputs, filters=64, kernel_size=7, strides=2,
-            data_format=data_format, kernel_initializer=conv_bn_initializer_to_use, name='precede/conv_7x7')
-  inputs = batch_norm_relu(inputs, is_training, data_format, name='precede/inputs_bn')
-
-  inputs = bottleneck_block(inputs, 64, 128, is_training, data_format, name='precede/residual1')
-  inputs = tf.layers.max_pooling2d(inputs=inputs, pool_size=2, strides=2, padding='valid',
-              data_format=data_format, name='precede/pool')
-
-  inputs = bottleneck_block(inputs, 128, 128, is_training, data_format, name='precede/residual2')
-  inputs = bottleneck_block(inputs, 128, feat_channals, is_training, data_format, name='precede/residual3')
-
-  hg_inputs = inputs
-  outputs_list = []
-  for stack_index in range(num_stack):
-    hg = hourglass(hg_inputs, feat_channals, is_training, data_format, deep_index=4, num_modules=num_modules, name='stack_{}/hg'.format(stack_index))
-
-    hg = dozen_bottleneck_blocks(hg, feat_channals, feat_channals, num_modules, is_training, data_format, name='stack_{}/'.format(stack_index) + 'output_{}')
-    # for m in range(num_modules):
-    #   hg = bottleneck_block(hg, feat_channals, feat_channals, is_training, data_format, name='stack_{}/output_{}'.format(stack_index, m))
-
-    # produce prediction
-    output_scores = conv2d_fixed_padding(inputs=hg, filters=feat_channals, kernel_size=1, strides=1, data_format=data_format, name='stack_{}/output_1x1'.format(stack_index))
-    output_scores = batch_norm_relu(output_scores, is_training, data_format, name='stack_{}/output_bn'.format(stack_index))
-
-    # produce heatmap from prediction
-    # use variable_scope to help model resotre name filter
-    heatmap = tf.layers.conv2d(inputs=output_scores, filters=output_channals, kernel_size=1,
-                                strides=1, padding='same', use_bias=True, activation=None,
-                                kernel_initializer=initializer_to_use(),
-                                bias_initializer=tf.zeros_initializer(),
-                                data_format=data_format,
-                                name='hg_heatmap/stack_{}/heatmap_1x1'.format(stack_index))
-
-
-    outputs_list.append(heatmap)
-    # no remap conv for the last hourglass
-    if stack_index < num_stack - 1:
-      output_scores_ = tf.layers.conv2d(inputs=output_scores, filters=feat_channals, kernel_size=1,
-                          strides=1, padding='same', use_bias=True, activation=None,
-                          kernel_initializer=initializer_to_use(),
-                          bias_initializer=tf.zeros_initializer(),
-                          data_format=data_format,
-                          name='stack_{}/remap_outputs'.format(stack_index))
-      # use variable_scope to help model resotre name filter
-      heatmap_ = tf.layers.conv2d(inputs=heatmap, filters=feat_channals, kernel_size=1,
-                        strides=1, padding='same', use_bias=True, activation=None,
-                        kernel_initializer=initializer_to_use(),
-                        bias_initializer=tf.zeros_initializer(),
-                        data_format=data_format,
-                        name='hg_heatmap/stack_{}/remap_heatmap'.format(stack_index))
-
-      # next hourglass inputs
-      fused_heatmap = tf.add(output_scores_, heatmap_, 'stack_{}/fused_heatmap'.format(stack_index))
-      hg_inputs = tf.add(hg_inputs, fused_heatmap, 'stack_{}/next_inputs'.format(stack_index))
-      #hg_inputs = hg_inputs + output_scores_ + heatmap_
-
-  return outputs_list
-
-
-
-
diff --git a/net/simple_xt.py b/net/simple_xt.py
new file mode 100644
index 00000000..e39f2dbe
--- /dev/null
+++ b/net/simple_xt.py
@@ -0,0 +1,392 @@
+# Copyright 2018 Changan Wang
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+import math
+
+_BATCH_NORM_DECAY = 0.9
+_BATCH_NORM_EPSILON = 1e-5
+_USE_FUSED_BN = True
+
+################################################################################
+# Convenience functions for building the ResNet model.
+################################################################################
+def batch_norm(inputs, training, data_format, name=None):
+    """Batch-normalizes `inputs` along the channel axis selected by `data_format`."""
+    # Decay, epsilon and the fused-kernel switch are the module-level constants;
+    # 'channels_first' puts the channels on axis 1, anything else uses axis 3.
+    channel_axis = 1 if data_format == 'channels_first' else 3
+    return tf.layers.batch_normalization(
+        inputs=inputs, axis=channel_axis, momentum=_BATCH_NORM_DECAY, epsilon=_BATCH_NORM_EPSILON,
+        center=True, scale=True, training=training, name=name, fused=_USE_FUSED_BN)
+
+
+def fixed_padding(inputs, kernel_size, data_format):
+    """Zero-pads the two spatial dimensions by an amount fixed by `kernel_size`.
+
+    Args:
+      inputs: A 4-D tensor laid out as [batch, channels, height, width] when
+        data_format is 'channels_first', otherwise [batch, height, width, channels].
+      kernel_size: Positive integer size of the kernel that the following
+        conv2d / max_pool2d operation will use.
+      data_format: 'channels_first' or 'channels_last'.
+
+    Returns:
+      A tensor in the same layout as `inputs`; each spatial dimension grows by
+      kernel_size - 1 in total (so its size is unchanged when kernel_size == 1).
+    """
+    total = kernel_size - 1
+    before = total // 2
+    # When `total` is odd the extra element goes on the trailing side.
+    spatial = [before, total - before]
+    untouched = [0, 0]
+
+    if data_format == 'channels_first':
+        paddings = [untouched, untouched, spatial, spatial]
+    else:
+        paddings = [untouched, spatial, spatial, untouched]
+    return tf.pad(inputs, paddings)
+
+
+def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format, kernel_initializer=tf.glorot_uniform_initializer, name=None):
+    """Bias-free 2-D convolution whose padding depends only on `kernel_size`."""
+    # Strided case: pad explicitly and convolve 'VALID', so the padding does not
+    # vary with the spatial size of `inputs` (which 'SAME' alone would allow).
+    needs_explicit_pad = strides > 1
+    if needs_explicit_pad:
+        inputs = fixed_padding(inputs, kernel_size, data_format)
+    padding_mode = 'SAME' if strides == 1 else 'VALID'
+
+    return tf.layers.conv2d(
+        inputs=inputs, filters=filters, kernel_size=kernel_size, strides=strides, padding=padding_mode,
+        use_bias=False, kernel_initializer=kernel_initializer(), data_format=data_format, name=name)
+
+# Expected network input: images in BGR channel order with values in [0, 255].
+# Per-channel mean values: 104, 117, 123.
+# Only mean subtraction is applied.
+def constant_xavier_initializer(shape, group, dtype=tf.float32, uniform=True):
+    """Samples Xavier-style initial values, with fan-out divided by `group`.
+
+    Returns a random tensor of the given `shape`: uniform when `uniform` is
+    true, truncated normal otherwise. Raises TypeError for a non-floating `dtype`.
+    """
+    if not dtype.is_floating:
+        raise TypeError('Cannot create initializer for non-floating point type.')
+
+    # Fan estimates start from the trailing dims: fan-in from shape[-2] (or the
+    # only dim of a rank-1 shape), fan-out from shape[-1] shared across groups.
+    fan_in, fan_out = 1.0, 1.0
+    if shape:
+        fan_in = float(shape[-2 if len(shape) > 1 else -1])
+        fan_out = float(shape[-1]) / group
+    # Every leading dim (the kernel window) multiplies both estimates.
+    for extent in shape[:-2]:
+        fan_in *= float(extent)
+        fan_out *= float(extent)
+    mean_fan = (fan_in + fan_out) / 2.0
+
+    if not uniform:
+        # The 1.3 factor adjusts the stddev for truncation.
+        return tf.truncated_normal(shape, 0.0, math.sqrt(1.3 * 1.0 / mean_fan), dtype, seed=None)
+    limit = math.sqrt(3.0 * 1.0 / mean_fan)  # uniform on [-limit, limit] has variance limit**2 / 3
+    return tf.random_uniform(shape, -limit, limit, dtype, seed=None)
+
+def wrapper_initlizer(shape, dtype=None, partition_info=None):  # initializer-style signature (e.g. for tf.get_variable); group is fixed at 32, partition_info is unused
+    return constant_xavier_initializer(shape, 32, tf.float32 if dtype is None else dtype)  # fall back to float32: forwarding dtype=None fails on dtype.is_floating
+# for root block, use dummy input_filters, e.g. 128 rather than 64 for the first block
+def se_next_bottleneck_block(inputs, input_filters, name_prefix, is_training, group, data_format='channels_last', need_reduce=True, is_root=False, reduced_scale=16):
+    """SE-ResNeXt bottleneck: 1x1 reduce -> grouped 3x3 -> 1x1 increase -> SE gate -> shortcut add -> ReLU.
+
+    input_filters is the block's output channel count (the 3x3 stage runs at input_filters // 2),
+    group the number of 3x3 convolution groups, reduced_scale the channel divisor inside the SE gate,
+    and name_prefix the prefix of every layer/op name. With need_reduce the shortcut is a 1x1 projection
+    + BN, and it and the 3x3 conv use stride 2 (stride 1 when is_root); otherwise the shortcut is `inputs`.
+    """
+    bn_axis = -1 if data_format == 'channels_last' else 1
+    strides_to_use = 1
+    residuals = inputs
+    if need_reduce:
+        strides_to_use = 1 if is_root else 2
+        proj_mapping = tf.layers.conv2d(inputs, input_filters, (1, 1), use_bias=False,
+                                name=name_prefix + '_1x1_proj', strides=(strides_to_use, strides_to_use),
+                                padding='valid', data_format=data_format, activation=None,
+                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
+                                bias_initializer=tf.zeros_initializer())
+        residuals = tf.layers.batch_normalization(proj_mapping, momentum=_BATCH_NORM_DECAY,
+                                name=name_prefix + '_1x1_proj/bn', axis=bn_axis,
+                                epsilon=_BATCH_NORM_EPSILON, training=is_training, reuse=None, fused=_USE_FUSED_BN)
+    # 1x1 reduce to input_filters // 2 channels, then BN + ReLU.
+    reduced_inputs = tf.layers.conv2d(inputs, input_filters // 2, (1, 1), use_bias=False,
+                            name=name_prefix + '_1x1_reduce', strides=(1, 1),
+                            padding='valid', data_format=data_format, activation=None,
+                            kernel_initializer=tf.contrib.layers.xavier_initializer(),
+                            bias_initializer=tf.zeros_initializer())
+    reduced_inputs_bn = tf.layers.batch_normalization(reduced_inputs, momentum=_BATCH_NORM_DECAY,
+                                        name=name_prefix + '_1x1_reduce/bn', axis=bn_axis,
+                                        epsilon=_BATCH_NORM_EPSILON, training=is_training, reuse=None, fused=_USE_FUSED_BN)
+    reduced_inputs_relu = tf.nn.relu(reduced_inputs_bn, name=name_prefix + '_1x1_reduce/relu')
+    # Grouped 3x3 conv built by hand: pad H and W by 1, then split the kernel (output axis) and the activations (channel axis) into `group` slices.
+    # NOTE(review): training creates the kernel with tf.Variable, inference with tf.get_variable; the reason is not visible here - confirm.
+    if data_format == 'channels_first':
+        reduced_inputs_relu = tf.pad(reduced_inputs_relu, paddings = [[0, 0], [0, 0], [1, 1], [1, 1]])
+        weight_shape = [3, 3, reduced_inputs_relu.get_shape().as_list()[1]//group, input_filters // 2]
+        if is_training:
+            weight_ = tf.Variable(constant_xavier_initializer(weight_shape, group=group, dtype=tf.float32), trainable=is_training, name=name_prefix + '_3x3/kernel')
+        else:
+            weight_ = tf.get_variable(name_prefix + '_3x3/kernel', shape=weight_shape, initializer=wrapper_initlizer, trainable=is_training)
+        weight_groups = tf.split(weight_, num_or_size_splits=group, axis=-1, name=name_prefix + '_weight_split')
+        xs = tf.split(reduced_inputs_relu, num_or_size_splits=group, axis=1, name=name_prefix + '_inputs_split')
+    else:
+        reduced_inputs_relu = tf.pad(reduced_inputs_relu, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]])
+        weight_shape = [3, 3, reduced_inputs_relu.get_shape().as_list()[-1]//group, input_filters // 2]
+        if is_training:
+            weight_ = tf.Variable(constant_xavier_initializer(weight_shape, group=group, dtype=tf.float32), trainable=is_training, name=name_prefix + '_3x3/kernel')
+        else:
+            weight_ = tf.get_variable(name_prefix + '_3x3/kernel', shape=weight_shape, initializer=wrapper_initlizer, trainable=is_training)
+        weight_groups = tf.split(weight_, num_or_size_splits=group, axis=-1, name=name_prefix + '_weight_split')
+        xs = tf.split(reduced_inputs_relu, num_or_size_splits=group, axis=-1, name=name_prefix + '_inputs_split')
+    # One 'VALID' convolution per group (the input was padded above), concatenated back along the channel axis.
+    convolved = [tf.nn.convolution(x, weight, padding='VALID', strides=[strides_to_use, strides_to_use], name=name_prefix + '_group_conv',
+                    data_format=('NCHW' if data_format == 'channels_first' else 'NHWC')) for (x, weight) in zip(xs, weight_groups)]
+    if data_format == 'channels_first':
+        conv3_inputs = tf.concat(convolved, axis=1, name=name_prefix + '_concat')
+    else:
+        conv3_inputs = tf.concat(convolved, axis=-1, name=name_prefix + '_concat')
+    conv3_inputs_bn = tf.layers.batch_normalization(conv3_inputs, momentum=_BATCH_NORM_DECAY, name=name_prefix + '_3x3/bn',
+                                        axis=bn_axis, epsilon=_BATCH_NORM_EPSILON, training=is_training, reuse=None, fused=_USE_FUSED_BN)
+    conv3_inputs_relu = tf.nn.relu(conv3_inputs_bn, name=name_prefix + '_3x3/relu')
+    # 1x1 increase back to input_filters channels, followed by BN only (no ReLU before the SE gate).
+    increase_inputs = tf.layers.conv2d(conv3_inputs_relu, input_filters, (1, 1), use_bias=False,
+                                name=name_prefix + '_1x1_increase', strides=(1, 1),
+                                padding='valid', data_format=data_format, activation=None,
+                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
+                                bias_initializer=tf.zeros_initializer())
+    increase_inputs_bn = tf.layers.batch_normalization(increase_inputs, momentum=_BATCH_NORM_DECAY,
+                                        name=name_prefix + '_1x1_increase/bn', axis=bn_axis,
+                                        epsilon=_BATCH_NORM_EPSILON, training=is_training, reuse=None, fused=_USE_FUSED_BN)
+    # SE gate: global average pool -> 1x1 down (input_filters // reduced_scale) + ReLU -> 1x1 up + sigmoid.
+    if data_format == 'channels_first':
+        pooled_inputs = tf.reduce_mean(increase_inputs_bn, [2, 3], name=name_prefix + '_global_pool', keep_dims=True)
+    else:
+        pooled_inputs = tf.reduce_mean(increase_inputs_bn, [1, 2], name=name_prefix + '_global_pool', keep_dims=True)
+    down_inputs = tf.layers.conv2d(pooled_inputs, input_filters // reduced_scale, (1, 1), use_bias=True,
+                                name=name_prefix + '_1x1_down', strides=(1, 1),
+                                padding='valid', data_format=data_format, activation=None,
+                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
+                                bias_initializer=tf.zeros_initializer())
+    down_inputs_relu = tf.nn.relu(down_inputs, name=name_prefix + '_1x1_down/relu')
+    up_inputs = tf.layers.conv2d(down_inputs_relu, input_filters, (1, 1), use_bias=True,
+                                name=name_prefix + '_1x1_up', strides=(1, 1),
+                                padding='valid', data_format=data_format, activation=None,
+                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
+                                bias_initializer=tf.zeros_initializer())
+    prob_outputs = tf.nn.sigmoid(up_inputs, name=name_prefix + '_prob')
+    # Scale the features channel-wise by the gate, add the shortcut, then apply the final ReLU.
+    rescaled_feat = tf.multiply(prob_outputs, increase_inputs_bn, name=name_prefix + '_mul')
+    pre_act = tf.add(residuals, rescaled_feat, name=name_prefix + '_add')
+    return tf.nn.relu(pre_act, name=name_prefix + '/relu')
+
+def dilated_se_next_bottleneck_block(inputs, input_filters, name_prefix, is_training, group, data_format='channels_last', need_reduce=True, reduced_scale=16):
+    """Dilated SE-ResNeXt bottleneck; every stride is 1, so the spatial size of `inputs` is kept.
+
+    Stages: 1x1 reduce -> grouped 3x3 with dilation rate 2 and 'SAME' padding -> 1x1 increase -> SE gate -> shortcut add -> ReLU.
+    input_filters is the output channel count, group the number of 3x3 groups, reduced_scale the channel divisor in the SE gate.
+    With need_reduce the shortcut is a stride-1 1x1 projection + BN; otherwise it is `inputs` itself.
+    """
+    bn_axis = -1 if data_format == 'channels_last' else 1
+    residuals = inputs
+    if need_reduce:
+        # Stride-1 projection shortcut: 1x1 conv to input_filters channels + BN.
+        proj_mapping = tf.layers.conv2d(inputs, input_filters, (1, 1), use_bias=False,
+                                name=name_prefix + '_1x1_proj', strides=(1, 1),
+                                padding='valid', data_format=data_format, activation=None,
+                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
+                                bias_initializer=tf.zeros_initializer())
+        residuals = tf.layers.batch_normalization(proj_mapping, momentum=_BATCH_NORM_DECAY,
+                                name=name_prefix + '_1x1_proj/bn', axis=bn_axis,
+                                epsilon=_BATCH_NORM_EPSILON, training=is_training, reuse=None, fused=_USE_FUSED_BN)
+    # 1x1 reduce to input_filters // 2 channels, then BN + ReLU.
+    reduced_inputs = tf.layers.conv2d(inputs, input_filters // 2, (1, 1), use_bias=False,
+                            name=name_prefix + '_1x1_reduce', strides=(1, 1),
+                            padding='valid', data_format=data_format, activation=None,
+                            kernel_initializer=tf.contrib.layers.xavier_initializer(),
+                            bias_initializer=tf.zeros_initializer())
+    reduced_inputs_bn = tf.layers.batch_normalization(reduced_inputs, momentum=_BATCH_NORM_DECAY,
+                                        name=name_prefix + '_1x1_reduce/bn', axis=bn_axis,
+                                        epsilon=_BATCH_NORM_EPSILON, training=is_training, reuse=None, fused=_USE_FUSED_BN)
+    reduced_inputs_relu = tf.nn.relu(reduced_inputs_bn, name=name_prefix + '_1x1_reduce/relu')
+    # Grouped 3x3 kernel: split the kernel (output axis) and the activations (channel axis) into `group` slices.
+    # NOTE(review): training creates the kernel with tf.Variable, inference with tf.get_variable; the reason is not visible here - confirm.
+    if data_format == 'channels_first':
+        # No explicit tf.pad here: the dilated convolution below uses 'SAME' padding.
+        weight_shape = [3, 3, reduced_inputs_relu.get_shape().as_list()[1]//group, input_filters // 2]
+        if is_training:
+            weight_ = tf.Variable(constant_xavier_initializer(weight_shape, group=group, dtype=tf.float32), trainable=is_training, name=name_prefix + '_3x3/kernel')
+        else:
+            weight_ = tf.get_variable(name_prefix + '_3x3/kernel', shape=weight_shape, initializer=wrapper_initlizer, trainable=is_training)
+        weight_groups = tf.split(weight_, num_or_size_splits=group, axis=-1, name=name_prefix + '_weight_split')
+        xs = tf.split(reduced_inputs_relu, num_or_size_splits=group, axis=1, name=name_prefix + '_inputs_split')
+    else:
+        # No explicit tf.pad here: the dilated convolution below uses 'SAME' padding.
+        weight_shape = [3, 3, reduced_inputs_relu.get_shape().as_list()[-1]//group, input_filters // 2]
+        if is_training:
+            weight_ = tf.Variable(constant_xavier_initializer(weight_shape, group=group, dtype=tf.float32), trainable=is_training, name=name_prefix + '_3x3/kernel')
+        else:
+            weight_ = tf.get_variable(name_prefix + '_3x3/kernel', shape=weight_shape, initializer=wrapper_initlizer, trainable=is_training)
+        weight_groups = tf.split(weight_, num_or_size_splits=group, axis=-1, name=name_prefix + '_weight_split')
+        xs = tf.split(reduced_inputs_relu, num_or_size_splits=group, axis=-1, name=name_prefix + '_inputs_split')
+    # One dilated (rate 2) convolution per group with 'SAME' padding; the original note here says the padding was 'VALID' before.
+    convolved = [tf.nn.convolution(x, weight, padding='SAME', strides=[1, 1], dilation_rate=[2, 2], name=name_prefix + '_group_conv',
+                    data_format=('NCHW' if data_format == 'channels_first' else 'NHWC')) for (x, weight) in zip(xs, weight_groups)]
+    if data_format == 'channels_first':
+        conv3_inputs = tf.concat(convolved, axis=1, name=name_prefix + '_concat')
+    else:
+        conv3_inputs = tf.concat(convolved, axis=-1, name=name_prefix + '_concat')
+    conv3_inputs_bn = tf.layers.batch_normalization(conv3_inputs, momentum=_BATCH_NORM_DECAY, name=name_prefix + '_3x3/bn',
+                                        axis=bn_axis, epsilon=_BATCH_NORM_EPSILON, training=is_training, reuse=None, fused=_USE_FUSED_BN)
+    conv3_inputs_relu = tf.nn.relu(conv3_inputs_bn, name=name_prefix + '_3x3/relu')
+    # 1x1 increase back to input_filters channels, followed by BN only (no ReLU before the SE gate).
+    increase_inputs = tf.layers.conv2d(conv3_inputs_relu, input_filters, (1, 1), use_bias=False,
+                                name=name_prefix + '_1x1_increase', strides=(1, 1),
+                                padding='valid', data_format=data_format, activation=None,
+                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
+                                bias_initializer=tf.zeros_initializer())
+    increase_inputs_bn = tf.layers.batch_normalization(increase_inputs, momentum=_BATCH_NORM_DECAY,
+                                        name=name_prefix + '_1x1_increase/bn', axis=bn_axis,
+                                        epsilon=_BATCH_NORM_EPSILON, training=is_training, reuse=None, fused=_USE_FUSED_BN)
+    # SE gate: global average pool -> 1x1 down (input_filters // reduced_scale) + ReLU -> 1x1 up + sigmoid.
+    if data_format == 'channels_first':
+        pooled_inputs = tf.reduce_mean(increase_inputs_bn, [2, 3], name=name_prefix + '_global_pool', keep_dims=True)
+    else:
+        pooled_inputs = tf.reduce_mean(increase_inputs_bn, [1, 2], name=name_prefix + '_global_pool', keep_dims=True)
+    down_inputs = tf.layers.conv2d(pooled_inputs, input_filters // reduced_scale, (1, 1), use_bias=True,
+                                name=name_prefix + '_1x1_down', strides=(1, 1),
+                                padding='valid', data_format=data_format, activation=None,
+                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
+                                bias_initializer=tf.zeros_initializer())
+    down_inputs_relu = tf.nn.relu(down_inputs, name=name_prefix + '_1x1_down/relu')
+    up_inputs = tf.layers.conv2d(down_inputs_relu, input_filters, (1, 1), use_bias=True,
+                                name=name_prefix + '_1x1_up', strides=(1, 1),
+                                padding='valid', data_format=data_format, activation=None,
+                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
+                                bias_initializer=tf.zeros_initializer())
+    prob_outputs = tf.nn.sigmoid(up_inputs, name=name_prefix + '_prob')
+    # Scale the features channel-wise by the gate, add the shortcut, then apply the final ReLU.
+    rescaled_feat = tf.multiply(prob_outputs, increase_inputs_bn, name=name_prefix + '_mul')
+    pre_act = tf.add(residuals, rescaled_feat, name=name_prefix + '_add')
+    return tf.nn.relu(pre_act, name=name_prefix + '/relu')
+
+# The input image should be in BGR order; note that this is not the common case in TensorFlow.
+def sext_backbone(input_image, istraining, data_format, net_depth=101, group=32):
+    """Builds the SE-ResNeXt backbone and returns the outputs of its four stages.
+
+    The three channels of `input_image` are reversed, then the stem and the conv2..conv5 stages
+    are built; net_depth (50 or 101) selects 6 or 23 units for conv4 and `group` is the number of
+    groups of every grouped 3x3 convolution. istraining is the batch-norm training flag.
+
+    Returns the list [conv2, conv3, conv4, conv5] of stage outputs.
+    Raises TypeError when net_depth is neither 50 nor 101.
+    """
+    # Validate before building anything so that an unsupported depth adds no ops to the graph.
+    if net_depth not in [50, 101]:
+        raise TypeError('Only ResNeXt50 or ResNeXt101 is supported now.')
+
+    bn_axis = -1 if data_format == 'channels_last' else 1
+    # Reverse the channel order (2, 1, 0); the channel axis is the same one batch norm uses.
+    image_channels = tf.unstack(input_image, axis=bn_axis)
+    swaped_input_image = tf.stack([image_channels[2], image_channels[1], image_channels[0]], axis=bn_axis)
+
+    input_depth = [256, 512, 1024] # input_filters passed to the conv2, conv3 and conv4 blocks
+    num_units = [3, 4, 6] if net_depth==50 else [3, 4, 23]
+    block_name_prefix = ['conv2_{}', 'conv3_{}', 'conv4_{}']
+
+    # Stem: explicit 3-pixel padding, 7x7 stride-2 'valid' conv, BN, ReLU, 3x3 stride-2 max pool.
+    if data_format == 'channels_first':
+        swaped_input_image = tf.pad(swaped_input_image, paddings = [[0, 0], [0, 0], [3, 3], [3, 3]])
+    else:
+        swaped_input_image = tf.pad(swaped_input_image, paddings = [[0, 0], [3, 3], [3, 3], [0, 0]])
+    inputs_features = tf.layers.conv2d(swaped_input_image, input_depth[0]//4, (7, 7), use_bias=False,
+                                name='conv1/7x7_s2', strides=(2, 2),
+                                padding='valid', data_format=data_format, activation=None,
+                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
+                                bias_initializer=tf.zeros_initializer())
+    inputs_features = tf.layers.batch_normalization(inputs_features, momentum=_BATCH_NORM_DECAY,
+                                        name='conv1/7x7_s2/bn', axis=bn_axis,
+                                        epsilon=_BATCH_NORM_EPSILON, training=istraining, reuse=None, fused=_USE_FUSED_BN)
+    inputs_features = tf.nn.relu(inputs_features, name='conv1/relu_7x7_s2')
+    inputs_features = tf.layers.max_pooling2d(inputs_features, [3, 3], [2, 2], padding='same', data_format=data_format, name='pool1/3x3_s2')
+
+    end_points = []
+    is_root = True  # only the very first block keeps stride 1 in its projection
+    for ind, num_unit in enumerate(num_units):
+        need_reduce = True  # the first unit of each stage projects its shortcut
+        for unit_index in range(1, num_unit+1):
+            inputs_features = se_next_bottleneck_block(inputs_features, input_depth[ind], block_name_prefix[ind].format(unit_index), is_training=istraining, group=group, data_format=data_format, need_reduce=need_reduce, is_root=is_root)
+            need_reduce = False
+            is_root = False
+        end_points.append(inputs_features)
+
+    # conv5: three dilated blocks with 2048 output channels; only the first projects its shortcut.
+    need_reduce = True
+    for unit_index in range(1, 4):
+        inputs_features = dilated_se_next_bottleneck_block(inputs_features, 2048, 'conv5_{}'.format(unit_index), is_training=istraining, group=group, data_format=data_format, need_reduce=need_reduce)
+        need_reduce = False
+    end_points.append(inputs_features)
+    return end_points
+
+
+def simple_net(inputs, output_channals, heatmap_size, istraining, data_format, net_depth=101):
+    """Deconvolution-head network: SE-ResNeXt backbone, three 2x transposed convs, 1x1 heatmap conv.
+
+    Args:
+      inputs: image batch handed to sext_backbone.
+      output_channals: number of channels of the predicted heatmap.
+      heatmap_size: not used by this function.
+      istraining: batch-norm training flag, also the `trainable` flag of the transposed convs.
+      data_format: 'channels_last' or 'channels_first'.
+      net_depth: backbone depth forwarded to sext_backbone.
+
+    Returns:
+      A one-element list holding the heatmap tensor.
+    """
+    end_points = sext_backbone(inputs, istraining, data_format, net_depth=net_depth)
+    bn_axis = -1 if data_format == 'channels_last' else 1
+
+    with tf.variable_scope('additional_layer', 'additional_layer', values=end_points, reuse=None):
+        inputs_features = end_points[-1]
+        # Three identical stages on top of the last backbone output:
+        # 4x4 stride-2 transposed conv (256 filters, no bias) -> BN -> ReLU.
+        for stage_name in ('deconv_1', 'deconv_2', 'deconv_3'):
+            inputs_features = tf.layers.conv2d_transpose(inputs_features, 256, 4, strides=(2, 2), padding='same',
+                                data_format=data_format, activation=None, use_bias=False, kernel_initializer=tf.contrib.layers.xavier_initializer(), bias_initializer=None,
+                                kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None,
+                                kernel_constraint=None, bias_constraint=None,
+                                trainable=istraining, name=stage_name, reuse=None)
+            inputs_features = tf.layers.batch_normalization(inputs_features, momentum=_BATCH_NORM_DECAY, name=stage_name + '_bn',
+                                            axis=bn_axis, epsilon=_BATCH_NORM_EPSILON, training=istraining, reuse=None, fused=_USE_FUSED_BN)
+            inputs_features = tf.nn.relu(inputs_features, name=stage_name + '_relu')
+
+        heatmap = tf.layers.conv2d(inputs=inputs_features, filters=output_channals, kernel_size=1,
+                            strides=1, padding='same', use_bias=True, activation=None,
+                            kernel_initializer=tf.contrib.layers.xavier_initializer(),
+                            bias_initializer=tf.zeros_initializer(),
+                            data_format=data_format,
+                            name='heatmap_1x1')
+
+        return [heatmap]
diff --git a/test.py b/test.py
deleted file mode 100644
index d1e3deb6..00000000
--- a/test.py
+++ /dev/null
@@ -1,181 +0,0 @@
-
-# import requests
-
-# def download_file_from_google_drive(id, destination):
-#     def get_confirm_token(response):
-#         for key, value in response.cookies.items():
-#             if key.startswith('download_warning'):
-#                 return value
-
-#         return None
-
-#     def save_response_content(response, destination):
-#         CHUNK_SIZE = 32768
-
-#         with open(destination, "wb") as f:
-#             for chunk in response.iter_content(CHUNK_SIZE):
-#                 if chunk: # filter out keep-alive new chunks
-#                     f.write(chunk)
-
-#     URL = "https://docs.google.com/uc?export=download"
-
-#     session = requests.Session()
-
-#     response = session.get(URL, params = { 'id' : id }, stream = True)
-#     token = get_confirm_token(response)
-
-#     if token:
-#         params = { 'id' : id, 'confirm' : token }
-#         response = session.get(URL, params = params, stream = True)
-
-#     save_response_content(response, destination)
-
-# #?id=
-# if __name__ == "__main__":
-
-#     # TAKE ID FROM SHAREABLE LINK
-#     file_id = '1AwG0nWFUrikd17xQpTmAj2LcwK-MbNqJ'
-#     # DESTINATION FILE ON YOUR DISK
-#     destination = './dd.txt'
-#     download_file_from_google_drive(file_id, destination)
-
-import torch.nn.functional as F
-import torch
-print(F.smooth_l1_loss(torch.Tensor([[21]]), torch.Tensor([[0]]), size_average=False).data[0])
-print(F.smooth_l1_loss(torch.Tensor([[21,22,23,24]]), torch.Tensor([[0,0,0,0]]), size_average=False).data[0])
-print(F.smooth_l1_loss(torch.Tensor([[11,12,13,14]]), torch.Tensor([[0,0,0,0]]), size_average=False).data[0])
-print(F.smooth_l1_loss(torch.Tensor([[21,22,23,24], [11,12,13,14]]), torch.Tensor([[0,0,0,0],[0,0,0,0]]), size_average=False).data[0])
-
-
-import pandas as pd
-
-df = pd.read_csv("G:/preds.csv", header=0)
-
-df['real_class'] = df['real_class'].astype('int')
-df['pred_class'] = df['pred_class'].astype('int')
-
-df['equal'] = df['real_class'] == df['pred_class']
-
-acc_by_cls = df.groupby(['real_class'])['equal'].mean().reset_index()
-acc_by_cls.columns=['class', 'acc']
-
-print(acc_by_cls)
-import tensorflow as tf
-import numpy as np
-
-mask = np.array([[True, False, True], [False, True, True]])
-
-
-
-targets = tf.constant([[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[115, 116], [117, 118]]],
-                       [[[11, 12], [13, 14]], [[15, 16], [17, 18]], [[25, 26], [27, 28]]]])
-
-a = tf.boolean_mask(targets, mask)  # [[1, 2], [5, 6]]
-
-sess = tf.Session()
-
-with sess.as_default():
-    print(a.eval())
-
-
-pred_outputs = tf.zeros_like(targets)
-
-sss = tf.losses.mean_squared_error(targets, pred_outputs, weights=1.0,
-                                    loss_collection=None,
-                                    reduction=tf.losses.Reduction.NONE)
-num_topk = 2
-sss = tf.reduce_mean(tf.reshape(sss, [2, 2, -1]), axis=-1)
-gather_col = tf.nn.top_k(sss, k=num_topk, sorted=True)[1]
-
-gather_row = tf.reshape(tf.tile(tf.reshape(tf.range(2), [-1, 1]), [1, num_topk]), [-1, 1])
-
-gather_indcies = tf.stack([gather_row, tf.reshape(gather_col, [-1, 1])], axis=-1)
-
-select_heatmap = tf.gather_nd(targets, gather_indcies)
-
-sess = tf.Session()
-table = tf.contrib.lookup.HashTable(
-    tf.contrib.lookup.KeyValueTensorInitializer(tf.constant([0,1,2], dtype=tf.int64), tf.constant([1,2,-1], dtype=tf.int64)), 0)
-out = table.lookup(tf.constant([0,1,2,3,4], dtype=tf.int64))
-sess.run(tf.group([tf.local_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()]))
-with sess.as_default():
-    #table.init.run()
-    print(sss.eval())
-    print(gather_col.eval())
-    print(gather_row.eval())
-    print(gather_indcies.eval())
-    print(select_heatmap.eval())
-
-
-
-heatmap_sigma = 1.
-ksize = 6 * heatmap_sigma + 1.
-
-x = tf.expand_dims(tf.range(tf.round(ksize), delta=1, dtype=tf.float32), axis=1)
-y = tf.transpose(x, [1, 0])
-hhh = tf.exp(- ((x - ksize/2.) ** 2 + (y - ksize/2.) ** 2) / (2 * heatmap_sigma ** 2))
-
-
-
-hhh_filter = tf.reshape(hhh, [tf.round(ksize), tf.round(ksize), 1, 1])
-hhh_filter = tf.transpose(hhh_filter, [1, 0, 2, 3])
-
-
-filtered_x = tf.nn.conv2d(image_resized, sobel_x_filter,
-                          strides=[1, 1, 1, 1], padding='SAME', data_format='NHWC',
-    dilations=[1, 1, 1, 1],
-    name=None)
-
-
-
-
-
-
-
-pred = tf.constant([[[[1.11,1.12],[1.13,1.14]], [[1.21,1.22],[1.23,1.24]]], [[[2.11,2.12],[2.13,2.14]], [[2.21,2.22],[2.23,2.24]]]])
-pred = tf.segment_mean(pred, [0,0])
-
-im = tf.constant([[[[1.11,1.12,1.],[1.13,1.14,1.]], [[1.21,1.22,1.],[1.23,1.24,1.]]], [[[2.11,2.12,1.],[2.13,2.14,1.]], [[2.21,2.22,1.],[2.23,2.24,1.]]]])
-#pred = tf.reshape(tf.stack([im, im], axis=1), [-1, 2, 2, 3])
-aaaaaaa1 = tf.tile(tf.reshape(tf.range(tf.shape(pred)[0]), [-1, 1]), [1, 2])
-aaaaaaa2=tf.stack([aaaaaaa1,tf.constant([[1,0],[0,1]])], axis=-1)
-indcesss = tf.constant([[[0,1],[0,0]],[[1,1],[1,0]]])
-
-aaaaaaa = tf.gather_nd(pred, aaaaaaa2)
-
-d = {'d':12, 'f':0.12344, 'c':555.5767}
-items = d.items()
-print(', '.join(['%s=%.6f' % (k, v) for k,v in items]))
-
-heatmap_size=64
-pred_heatmap = tf.one_hot([34*heatmap_size+23, 1*heatmap_size+60, 32*heatmap_size+1], heatmap_size*heatmap_size, on_value=1., off_value=0., axis=-1, dtype=tf.float32)
-pred_max = tf.reduce_max(pred_heatmap, axis=-1)
-pred_indices = tf.argmax(pred_heatmap, axis=-1)
-pred_x, pred_y = tf.cast(tf.floormod(pred_indices, heatmap_size), tf.float32), tf.cast(tf.floordiv(pred_indices, heatmap_size), tf.float32)
-
-
-a = tf.losses.mean_squared_error(tf.constant([[0,1],[1,2],[2,3],[3,4],[4,5]], dtype=tf.int64),tf.constant([[1,2],[2,3],[3,4],[4,5],[5,6]], dtype=tf.int64), weights=1.0/5., loss_collection=None, reduction=tf.losses.Reduction.MEAN)
-
-
-aa = tf.reduce_sum(tf.squared_difference(tf.constant([[0,1],[1,2],[2,3],[3,4],[4,5]], dtype=tf.int64),tf.constant([[1,2],[2,3],[3,4],[4,5],[5,6]], dtype=tf.int64)), axis=-1)
-b = tf.metrics.mean_absolute_error(aa, tf.zeros_like(aa))
-#tf.metrics.mean_squared_error(,
-                                #weights=1.0*2,
-                                #name='last_pred_mse')
-sess = tf.Session()
-table = tf.contrib.lookup.HashTable(
-    tf.contrib.lookup.KeyValueTensorInitializer(tf.constant([0,1,2], dtype=tf.int64), tf.constant([1,2,-1], dtype=tf.int64)), 0)
-out = table.lookup(tf.constant([0,1,2,3,4], dtype=tf.int64))
-sess.run(tf.group([tf.local_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()]))
-with sess.as_default():
-    #table.init.run()
-    print(pred.eval())
-    print(aaaaaaa1.eval())
-    print(aaaaaaa2.eval())
-    #print(b[0].eval())
-    # print(a.eval())
-    # print(b[1].eval())
-    # print(pred_x.eval())
-
-    # print(pred_y.eval())
-
diff --git a/train_simplenet_onebyone.py b/train_simplenet_onebyone.py
new file mode 100644
index 00000000..b0e26b09
--- /dev/null
+++ b/train_simplenet_onebyone.py
@@ -0,0 +1,553 @@
+# Copyright 2018 Changan Wang
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import numpy as np
+#from scipy.misc import imread, imsave, imshow, imresize
+import tensorflow as tf
+
+import tf_replicate_model_fn
+
+from net import simple_xt
+
+from utility import train_helper
+from utility import mertric
+
+from preprocessing import preprocessing
+from preprocessing import dataset
+import config
+
+# hardware related configuration (reader/preprocessing parallelism, CPU threads, GPU memory)
+tf.app.flags.DEFINE_integer(
+    'num_readers', 16,#16
+    'The number of parallel readers that read data from the dataset.')
+tf.app.flags.DEFINE_integer(
+    'num_preprocessing_threads', 48,#48
+    'The number of threads used to create the batches.')
+tf.app.flags.DEFINE_integer(
+    'num_cpu_threads', 0,
+    'The number of cpu cores used to train.')
+tf.app.flags.DEFINE_float(
+    'gpu_memory_fraction', 1., 'GPU memory fraction to use.')
+# scaffold related configuration (data location, output directory, logging/checkpoint cadence)
+tf.app.flags.DEFINE_string(
+    'data_dir', '../Datasets/tfrecords',#'/media/rs/0E06CD1706CD0127/Kapok/Chi/Datasets/tfrecords',
+    'The directory where the dataset input data is stored.')
+tf.app.flags.DEFINE_string(
+    'dataset_name', '{}_????', 'The pattern of the dataset name to load.')
+tf.app.flags.DEFINE_string(
+    'model_dir', './logs_simple_net/',
+    'The parent directory where the model will be stored.')
+tf.app.flags.DEFINE_integer(
+    'log_every_n_steps', 10,
+    'The frequency with which logs are printed, in steps.')
+tf.app.flags.DEFINE_integer(
+    'save_summary_steps', 100,
+    'The frequency with which summaries are saved, in steps.')
+tf.app.flags.DEFINE_integer(
+    'save_checkpoints_steps', 8000,
+    'The frequency with which the model is saved, in steps.')
+# model related configuration
+tf.app.flags.DEFINE_integer(
+    'train_image_size', 384,
+    'The size of the input image for the model to use.')
+tf.app.flags.DEFINE_integer(
+    'heatmap_size', 192,
+    'The size of the output heatmap of the model.')
+tf.app.flags.DEFINE_float(
+    'heatmap_sigma', 1.,
+    'The sigma of Gaussian which generate the target heatmap.')
+tf.app.flags.DEFINE_float(
+    'bbox_border', 25.,
+    'The nearest distance of the crop border to all keypoints.')
+tf.app.flags.DEFINE_integer(
+    'batch_size', 8,
+    'Batch size for training and evaluation.')
+tf.app.flags.DEFINE_boolean(
+    'use_ohkm', True,
+    'Whether we will use the ohkm for hard keypoints.')
+tf.app.flags.DEFINE_string(
+    'data_format', 'channels_first', # 'channels_first' or 'channels_last'
+    'A flag to override the data format used in the model. channels_first '
+    'provides a performance boost on GPU but is not always compatible '
+    'with CPU. If left unspecified, the data format will be chosen '
+    'automatically based on whether TensorFlow was built for CPU or GPU.')
+# optimizer related configuration
+tf.app.flags.DEFINE_integer(
+    'tf_random_seed', 20180417, 'Random seed for TensorFlow initializers.')
+tf.app.flags.DEFINE_float(
+    'weight_decay', 1e-5, 'The weight decay on the model weights.')
+tf.app.flags.DEFINE_float(
+    'mse_weight', 1., 'The weight applied to the MSE loss.')
+tf.app.flags.DEFINE_float(
+    'momentum', 0.9,
+    'The momentum for the MomentumOptimizer and RMSPropOptimizer.')
+tf.app.flags.DEFINE_float('learning_rate', 8e-3, 'Initial learning rate.')#1e-3
+tf.app.flags.DEFINE_float(
+    'end_learning_rate', 0.0000001,
+    'The minimal end learning rate used by a polynomial decay learning rate.')
+tf.app.flags.DEFINE_float(
+    'warmup_learning_rate', 0.0002,
+    'The start warm-up learning rate to avoid NAN.')
+tf.app.flags.DEFINE_integer(
+    'warmup_steps', 100,
+    'The total steps to warm-up.')
+# for learning rate piecewise_constant decay (boundaries are multiplied by steps_per_epoch in the model_fn)
+tf.app.flags.DEFINE_string(
+    'decay_boundaries', '2, 3',
+    'Learning rate decay boundaries by epoch (comma-separated list).')
+tf.app.flags.DEFINE_string(
+    'lr_decay_factors', '1, 0.5, 0.1',
+    'The values of learning_rate decay factor for each segment between boundaries (comma-separated list).')
+# checkpoint related configuration
+tf.app.flags.DEFINE_string(
+    'checkpoint_path', './model/seresnext101',
+    'The path to a checkpoint from which to fine-tune.')
+tf.app.flags.DEFINE_string(
+    'checkpoint_model_scope', '',
+    'Model scope in the checkpoint. None if the same as the trained model.')
+tf.app.flags.DEFINE_string(
+    #'blouse', 'dress', 'outwear', 'skirt', 'trousers', 'all'
+    'model_scope', None,
+    'Model scope name used to replace the name_scope in checkpoint.')
+tf.app.flags.DEFINE_string(
+    'checkpoint_exclude_scopes', None,
+    'Comma-separated list of scopes of variables to exclude when restoring from a checkpoint.')
+tf.app.flags.DEFINE_boolean(
+    'ignore_missing_vars', True,
+    'When restoring a checkpoint would ignore missing variables.')
+tf.app.flags.DEFINE_boolean(
+    'run_on_cloud', True,
+    'Whether we will train on cloud.')
+tf.app.flags.DEFINE_boolean(
+    'multi_gpu', True,
+    'Whether we will use multi-GPUs to train.')
+tf.app.flags.DEFINE_string(
+    'cloud_checkpoint_path', 'seresnext101',
+    'The path (relative to data_dir) to a checkpoint from which to fine-tune when run_on_cloud is set.')
+tf.app.flags.DEFINE_string(
+    'model_to_train', 'blouse, dress, outwear, skirt, trousers', #'all, blouse, dress, outwear, skirt, trousers', 'skirt, dress, outwear, trousers',
+    'The sub-model to train (comma-separated list).')
+
+FLAGS = tf.app.flags.FLAGS
+# Example arguments: --model_scope=blouse --checkpoint_path=./logs/all --data_format=channels_last --batch_size=1
+
+def validate_batch_size_for_multi_gpu(batch_size):
+    """Check that `batch_size` can be split evenly across the local GPUs.
+
+    Returns 0 when FLAGS.multi_gpu is off, otherwise the number of local GPUs.
+    Raises ValueError when no GPU is found or when `batch_size` is not a
+    multiple of the GPU count. Note that this should eventually be handled
+    by replicate_model_fn directly.
+    """
+    if not FLAGS.multi_gpu:
+        return 0
+
+    from tensorflow.python.client import device_lib
+
+    local_device_protos = device_lib.list_local_devices()
+    num_gpus = sum(1 for d in local_device_protos if d.device_type == 'GPU')
+    if not num_gpus:
+        raise ValueError('Multi-GPU mode was specified, but no GPUs '
+                        'were found. To use CPU, run with --multi_gpu=False.')
+
+    remainder = batch_size % num_gpus
+    if remainder:
+        err = ('When running with multiple GPUs, batch size '
+                'must be a multiple of the number of available GPUs. '
+                'Found {} GPUs with a batch size of {}; try --batch_size={} instead.'
+                ).format(num_gpus, batch_size, max(batch_size - remainder, num_gpus))
+        raise ValueError(err)
+    return num_gpus
+
+def input_pipeline(is_training=True, model_scope=FLAGS.model_scope, num_epochs=None):
+    """Return (images, labels dict) from dataset.slim_get_split for `model_scope`; a scope containing 'all' selects category '*'."""
+    train_on_all = 'all' in model_scope
+    category = '*' if train_on_all else model_scope
+    norm_keys = config.global_norm_key if train_on_all else config.local_norm_key
+    left_values = config.global_norm_lvalues if train_on_all else config.local_norm_lvalues
+    right_values = config.global_norm_rvalues if train_on_all else config.local_norm_rvalues
+
+    def make_norm_table(values, default_value):
+        return tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(norm_keys, dtype=tf.int64), tf.constant(values, dtype=tf.int64)), default_value)
+    norm_tables = (make_norm_table(left_values, 0), make_norm_table(right_values, 1))
+
+    def preprocessing_fn(org_image, classid, shape, key_x, key_y, key_v):
+        return preprocessing.preprocess_image(org_image, classid, shape, FLAGS.train_image_size, FLAGS.train_image_size, key_x, key_y, key_v, norm_tables, is_training=is_training, data_format=('NCHW' if FLAGS.data_format=='channels_first' else 'NHWC'), category=category, bbox_border=FLAGS.bbox_border, heatmap_sigma=FLAGS.heatmap_sigma, heatmap_size=FLAGS.heatmap_size)
+    images, shape, classid, targets, key_v, isvalid, norm_value = dataset.slim_get_split(FLAGS.data_dir, preprocessing_fn, FLAGS.batch_size, FLAGS.num_readers, FLAGS.num_preprocessing_threads, num_epochs=num_epochs, is_training=is_training, file_pattern=FLAGS.dataset_name, category=category, reader=None)
+    labels = dict(targets=targets, key_v=key_v, shape=shape, classid=classid, isvalid=isvalid, norm_value=norm_value)
+    return images, labels
+
+if config.PRED_DEBUG:
+  from scipy.misc import imread, imsave, imshow, imresize
+  def save_image_with_heatmap(image, height, width, heatmap_size, targets, pred_heatmap, indR, indG, indB):
+      # Debug dump: writes 'targets_<n>.jpg' (image overlaid with the indR/indG/indB groups of `targets`) and one 'heatmap_<n>_<i>.jpg' per map in `pred_heatmap`; returns the call counter n.
+      if not hasattr(save_image_with_heatmap, "counter"):
+          save_image_with_heatmap.counter = 0  # it doesn't exist yet, so initialize it
+      save_image_with_heatmap.counter += 1
+      img_to_save = np.array(image.tolist()) + 128
+      out_size = (int(height), int(width))  # get_keypoint passes height/width as float32; make the resize target integral
+
+      img_to_save = img_to_save.astype(np.uint8)
+
+      heatmap0 = np.sum(targets[indR, ...], axis=0).astype(np.uint8)
+      heatmap1 = np.sum(targets[indG, ...], axis=0).astype(np.uint8)
+      heatmap2 = np.sum(targets[indB, ...], axis=0).astype(np.uint8) if len(indB) > 0 else np.zeros((heatmap_size, heatmap_size), dtype=np.float32)
+
+      img_to_save = imresize(img_to_save, out_size, interp='lanczos')
+      heatmap0 = imresize(heatmap0, out_size, interp='lanczos')
+      heatmap1 = imresize(heatmap1, out_size, interp='lanczos')
+      heatmap2 = imresize(heatmap2, out_size, interp='lanczos')
+
+      img_to_save = img_to_save/2
+      img_to_save[:,:,0] = np.clip((img_to_save[:,:,0] + heatmap0 + heatmap2), 0, 255)
+      img_to_save[:,:,1] = np.clip((img_to_save[:,:,1] + heatmap1 + heatmap2), 0, 255)
+      # The third channel keeps the halved image only: group R goes to channel 0, G to channel 1, B to both.
+      file_name = 'targets_{}.jpg'.format(save_image_with_heatmap.counter)
+      imsave(os.path.join(config.DEBUG_DIR, file_name), img_to_save.astype(np.uint8))
+
+      pred_heatmap = np.array(pred_heatmap.tolist())
+      # Each map is stretched to 0..255 independently before being written.
+      for ind in range(pred_heatmap.shape[0]):
+        img = pred_heatmap[ind]
+        img = img - img.min()
+        img *= 255.0 / (img.max() or 1.)  # a constant map has max 0 here; skip the scaling instead of dividing by zero
+        file_name = 'heatmap_{}_{}.jpg'.format(save_image_with_heatmap.counter, ind)
+        imsave(os.path.join(config.DEBUG_DIR, file_name), img.astype(np.uint8))
+      return save_image_with_heatmap.counter
+
+def get_keypoint(image, targets, predictions, heatmap_size, height, width, category, clip_at_zero=True, data_format='channels_last', name=None):
+    """Return (pred_x, pred_y): the argmax of every heatmap rescaled to a `width` x `height` image. `targets` and `name` are unused."""
+    predictions = tf.reshape(predictions, [1, -1, heatmap_size*heatmap_size])
+    pred_max = tf.reduce_max(predictions, axis=-1)
+    pred_indices = tf.argmax(predictions, axis=-1)
+    pred_x, pred_y = tf.cast(tf.floormod(pred_indices, heatmap_size), tf.float32), tf.cast(tf.floordiv(pred_indices, heatmap_size), tf.float32)
+
+    width, height = tf.cast(width, tf.float32), tf.cast(height, tf.float32)
+    pred_x, pred_y = pred_x * width / tf.cast(heatmap_size, tf.float32), pred_y * height / tf.cast(heatmap_size, tf.float32)
+
+    if clip_at_zero:
+      found, missing = tf.cast(pred_max>0, tf.float32), tf.cast(pred_max<=0, tf.float32)  # computed once; the masks are 0/1 so one multiply suffices
+      pred_x = pred_x * found + missing * (width / 2.)  # maps whose peak is <= 0 fall back to the image centre
+      pred_y = pred_y * found + missing * (height / 2.)
+
+    if config.PRED_DEBUG:
+      pred_indices_ = tf.squeeze(pred_indices)
+      image_ = tf.squeeze(image) * 255.
+      pred_heatmap = tf.one_hot(pred_indices_, heatmap_size*heatmap_size, on_value=1., off_value=0., axis=-1, dtype=tf.float32)
+
+      pred_heatmap = tf.reshape(pred_heatmap, [-1, heatmap_size, heatmap_size])
+      if data_format == 'channels_first':
+        image_ = tf.transpose(image_, perm=(1, 2, 0))
+      save_image_op = tf.py_func(save_image_with_heatmap,
+                                  [image_, height, width,
+                                  heatmap_size,
+                                  tf.reshape(pred_heatmap * 255., [-1, heatmap_size, heatmap_size]),
+                                  tf.reshape(predictions, [-1, heatmap_size, heatmap_size]),
+                                  config.left_right_group_map[category][0],
+                                  config.left_right_group_map[category][1],
+                                  config.left_right_group_map[category][2]],
+                                  tf.int64, stateful=True)
+      with tf.control_dependencies([save_image_op]):
+        pred_x, pred_y = pred_x * 1., pred_y * 1.
+    return pred_x, pred_y
+
+def gaussian_blur(inputs, inputs_filters, sigma, data_format, name=None):
+    """Depthwise-blur the `inputs_filters` channels of `inputs` with an unnormalised int(6*sigma+1)-tap Gaussian; input and output are channels-first."""
+    with tf.name_scope(name, "gaussian_blur", [inputs]):
+        data_format_ = 'NHWC' if data_format=='channels_last' else 'NCHW'
+        if data_format_ == 'NHWC':
+            inputs = tf.transpose(inputs, [0, 2, 3, 1])
+        ksize = int(6 * sigma + 1.)
+        center = (ksize - 1) / 2.  # middle of taps 0..ksize-1; ksize/2. would put the peak half a pixel off-centre
+        x = tf.expand_dims(tf.range(ksize, delta=1, dtype=tf.float32), axis=1)
+        y = tf.transpose(x, [1, 0])
+        kernel_matrix = tf.exp(- ((x - center) ** 2 + (y - center) ** 2) / (2 * sigma ** 2))
+        kernel_filter = tf.reshape(kernel_matrix, [ksize, ksize, 1, 1])
+        kernel_filter = tf.tile(kernel_filter, [1, 1, inputs_filters, 1])
+        outputs = tf.nn.depthwise_conv2d(inputs, kernel_filter, strides=[1, 1, 1, 1], padding='SAME', data_format=data_format_, name='blur')
+        if data_format_ == 'NHWC':
+            outputs = tf.transpose(outputs, [0, 3, 1, 2])
+        return outputs
+
+def keypoint_model_fn(features, labels, mode, params):
+    """Estimator model_fn: runs simple_net on `features` and builds the heatmap MSE loss (optionally on the hardest joints only), the metrics and the train op."""
+    targets = labels['targets']
+    shape = labels['shape']  # not referenced below
+    classid = labels['classid']  # not referenced below
+    key_v = labels['key_v']
+    isvalid = labels['isvalid']
+    norm_value = labels['norm_value']
+
+    cur_batch_size = tf.shape(features)[0]
+
+    with tf.variable_scope(params['model_scope'], default_name=None, values=[features], reuse=tf.AUTO_REUSE):
+        pred_outputs = simple_xt.simple_net(features, config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')], params['heatmap_size'], (mode == tf.estimator.ModeKeys.TRAIN), params['data_format'])[0]
+    # channels_last outputs are transposed to [batch, joint, height, width]; channels_first outputs are presumably already in that layout.
+    if params['data_format'] == 'channels_last':
+        pred_outputs = tf.transpose(pred_outputs, [0, 3, 1, 2], name='outputs_trans')
+
+    score_map = pred_outputs
+    # pred_x / pred_y are not referenced again in this function.
+    pred_x, pred_y = get_keypoint(features, targets, score_map, params['heatmap_size'], params['train_image_size'], params['train_image_size'], (params['model_scope'] if 'all' not in params['model_scope'] else '*'), clip_at_zero=True, data_format=params['data_format'])
+
+    # Targets are scaled by 255 before the metric and the loss (presumably they arrive in [0, 1] - TODO confirm against preprocessing).
+    targets = 255. * targets
+
+    # Normalized-error metric; it sees the scaled targets and the raw score map, before the visibility mask below is applied.
+    ne_mertric = mertric.normalized_error(targets, score_map, norm_value, key_v, isvalid,
+                             cur_batch_size,
+                             config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')],
+                             params['heatmap_size'],
+                             params['train_image_size'])
+    # Zero the target and predicted maps of every keypoint with key_v <= 0 or isvalid <= 0 so they add nothing to the loss.
+    all_visible = tf.expand_dims(tf.expand_dims(tf.cast(tf.logical_and(key_v>0, isvalid>0), tf.float32), axis=-1), axis=-1)
+    targets = targets * all_visible
+    pred_outputs = pred_outputs * all_visible
+    # Despite its name, 'last_pred_mse' streams the mean of the row-summed squared differences (mean_absolute_error against zeros).
+    sq_diff = tf.reduce_sum(tf.squared_difference(targets, pred_outputs), axis=-1)
+    last_pred_mse = tf.metrics.mean_absolute_error(sq_diff, tf.zeros_like(sq_diff), name='last_pred_mse')
+
+    metrics = {'normalized_error': ne_mertric, 'last_pred_mse':last_pred_mse}
+    predictions = {'normalized_error': ne_mertric[1]}
+    ne_mertric = tf.identity(ne_mertric[1], name='ne_mertric')
+
+    base_learning_rate = params['learning_rate']
+    mse_loss_list = []
+    if params['use_ohkm']:
+        base_learning_rate = 1. * base_learning_rate  # multiplying by 1. leaves the rate unchanged
+        temp_loss = tf.reduce_mean(tf.reshape(tf.losses.mean_squared_error(targets, pred_outputs, weights=1.0, loss_collection=None, reduction=tf.losses.Reduction.NONE), [cur_batch_size, config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')], -1]), axis=-1)
+        # Keep, per sample, only the half of the joints with the largest mean squared error; the indices carry no gradient.
+        num_topk = config.class_num_joints[(params['model_scope'] if 'all' not in params['model_scope'] else '*')] // 2
+        gather_col = tf.nn.top_k(temp_loss, k=num_topk, sorted=True)[1]
+        gather_row = tf.reshape(tf.tile(tf.reshape(tf.range(cur_batch_size), [-1, 1]), [1, num_topk]), [-1, 1])
+        gather_indcies = tf.stop_gradient(tf.stack([gather_row, tf.reshape(gather_col, [-1, 1])], axis=-1))
+
+        select_targets = tf.gather_nd(targets, gather_indcies)
+        select_heatmap = tf.gather_nd(pred_outputs, gather_indcies)
+        # NOTE(review): with Reduction.MEAN the scalar `weights` appears to cancel out of the result - confirm against the tf.losses docs.
+        mse_loss_list.append(tf.losses.mean_squared_error(select_targets, select_heatmap,
+                                weights=1.0 / tf.cast(cur_batch_size, tf.float32),
+                                scope='loss',
+                                loss_collection=None,#tf.GraphKeys.LOSSES,
+                                # mean all elements of all pixels in all batch
+                                reduction=tf.losses.Reduction.MEAN))
+    else:
+        mse_loss_list.append(tf.losses.mean_squared_error(targets, pred_outputs,
+                            weights=1.0 / tf.cast(cur_batch_size, tf.float32),
+                            scope='loss',
+                            loss_collection=None,#tf.GraphKeys.LOSSES,
+                            # mean all elements of all pixels in all batch
+                            reduction=tf.losses.Reduction.MEAN))# SUM, SUM_OVER_BATCH_SIZE, default mean by all elements
+
+    mse_loss = tf.multiply(params['mse_weight'], tf.add_n(mse_loss_list), name='mse_loss')
+    tf.summary.scalar('mse', mse_loss)
+    tf.losses.add_loss(mse_loss)
+
+    # bce_loss_list = []
+    # for pred_ind in list(range(len(pred_outputs))):
+    #     bce_loss_list.append(tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=pred_outputs[pred_ind], labels=targets_list[pred_ind]/255., name='loss_{}'.format(pred_ind)), name='loss_mean_{}'.format(pred_ind)))
+
+    # mse_loss = tf.multiply(params['mse_weight'] / params['num_stacks'], tf.add_n(bce_loss_list), name='mse_loss')
+    # tf.summary.scalar('mse', mse_loss)
+    # tf.losses.add_loss(mse_loss)
+    # NOTE(review): the filter below only skips names containing 'batch_normalization'; layers named like 'deconv_2_bn' would still be decayed - confirm intent.
+    # Add weight decay to the loss. We exclude the batch norm variables because
+    # doing so leads to a small improvement in accuracy.
+    loss = mse_loss + params['weight_decay'] * tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'batch_normalization' not in v.name])
+    total_loss = tf.identity(loss, name='total_loss')
+    tf.summary.scalar('loss', total_loss)
+
+    if mode == tf.estimator.ModeKeys.EVAL:
+        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, predictions=predictions, eval_metric_ops=metrics)
+
+    if mode == tf.estimator.ModeKeys.TRAIN:
+        global_step = tf.train.get_or_create_global_step()
+        # Piecewise-constant schedule: the warm-up rate until warmup_steps, then base rate * factor, switching at each decay boundary (epochs * steps_per_epoch).
+        lr_values = [params['warmup_learning_rate']] + [base_learning_rate * decay for decay in params['lr_decay_factors']]
+        learning_rate = tf.train.piecewise_constant(tf.cast(global_step, tf.int32),
+                                                    [params['warmup_steps']] + [int(float(ep)*params['steps_per_epoch']) for ep in params['decay_boundaries']],
+                                                    lr_values)
+        truncated_learning_rate = tf.maximum(learning_rate, tf.constant(params['end_learning_rate'], dtype=learning_rate.dtype), name='learning_rate')
+        tf.summary.scalar('lr', truncated_learning_rate)
+
+        optimizer = tf.train.MomentumOptimizer(learning_rate=truncated_learning_rate,
+                                                momentum=params['momentum'])
+
+        optimizer = tf_replicate_model_fn.TowerOptimizer(optimizer)
+
+        # Batch norm requires update_ops to be added as a train_op dependency.
+        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+        with tf.control_dependencies(update_ops):
+            train_op = optimizer.minimize(loss, global_step)
+    else:  # any mode other than TRAIN that reaches this point (EVAL has already returned above)
+        train_op = None
+
+    return tf.estimator.EstimatorSpec(
+                          mode=mode,
+                          predictions=predictions,
+                          loss=loss,
+                          train_op=train_op,
+                          eval_metric_ops=metrics,
+                          scaffold=tf.train.Scaffold(init_fn=train_helper.get_init_fn_for_scaffold_(params['checkpoint_path'], params['model_dir'], params['checkpoint_exclude_scopes'], params['model_scope'], params['checkpoint_model_scope'], params['ignore_missing_vars'])))
+
+def parse_comma_list(args):  # e.g. '1, 0.5' -> [1.0, 0.5]
+    return list(map(lambda piece: float(piece.strip()), args.split(',')))
+
+def sub_loop(model_fn, model_scope, model_dir, run_config, train_epochs, epochs_per_eval, lr_decay_factors, decay_boundaries, checkpoint_path=None, checkpoint_exclude_scopes='', checkpoint_model_scope='', ignore_missing_vars=True):
+    """Train the `model_scope` sub-model for train_epochs // epochs_per_eval cycles, running an evaluation after each cycle."""
+    category = '*' if 'all' in model_scope else model_scope
+    steps_per_epoch = config.split_size[category]['train'] // FLAGS.batch_size
+    total_steps = steps_per_epoch * train_epochs
+    tower_model_fn = tf_replicate_model_fn.replicate_model_fn(model_fn, loss_reduction=tf.losses.Reduction.MEAN)
+    estimator_config = run_config.replace(save_checkpoints_steps=2*steps_per_epoch)  # checkpoint every two epochs
+
+    estimator_params = dict(
+        checkpoint_path=checkpoint_path,
+        model_dir=model_dir,
+        checkpoint_exclude_scopes=checkpoint_exclude_scopes,
+        model_scope=model_scope,
+        checkpoint_model_scope=checkpoint_model_scope,
+        ignore_missing_vars=ignore_missing_vars,
+        train_image_size=FLAGS.train_image_size,
+        heatmap_size=FLAGS.heatmap_size,
+        data_format=FLAGS.data_format,
+        steps_per_epoch=steps_per_epoch,
+        use_ohkm=FLAGS.use_ohkm,
+        batch_size=FLAGS.batch_size,
+        weight_decay=FLAGS.weight_decay,
+        mse_weight=FLAGS.mse_weight,
+        momentum=FLAGS.momentum,
+        learning_rate=FLAGS.learning_rate,
+        end_learning_rate=FLAGS.end_learning_rate,
+        warmup_learning_rate=FLAGS.warmup_learning_rate,
+        warmup_steps=FLAGS.warmup_steps,
+        decay_boundaries=parse_comma_list(decay_boundaries),
+        lr_decay_factors=parse_comma_list(lr_decay_factors),
+    )
+    fashionAI = tf.estimator.Estimator(model_fn=tower_model_fn, model_dir=model_dir, config=estimator_config, params=estimator_params)
+
+    tensors_to_log = {'lr': 'learning_rate', 'loss': 'total_loss', 'mse': 'mse_loss', 'ne': 'ne_mertric'}
+
+    def format_log(values):
+        return '{}:'.format(model_scope) + ', '.join(['%s=%.6f' % pair for pair in values.items()])
+
+    tf.gfile.MakeDirs(model_dir)
+    tf.logging.info('Starting to train model {}.'.format(model_scope))
+    for _ in range(train_epochs // epochs_per_eval):
+        logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=FLAGS.log_every_n_steps, formatter=format_log)
+
+        tf.logging.info('Starting a training cycle.')
+        fashionAI.train(input_fn=lambda: input_pipeline(True, model_scope, epochs_per_eval), hooks=[logging_hook], max_steps=total_steps)
+
+        tf.logging.info('Starting to evaluate.')
+        eval_results = fashionAI.evaluate(input_fn=lambda: input_pipeline(False, model_scope, 1))
+        tf.logging.info(eval_results)
+    tf.logging.info('Finished model {}.'.format(model_scope))
+
+def main(_):
+    """Train one keypoint model per category listed in ``FLAGS.model_to_train``.
+
+    Builds the session and estimator run configuration from ``FLAGS``, then
+    calls ``sub_loop`` once for each requested category, in the order given.
+
+    Args:
+        _: Unused positional argument.
+
+    Raises:
+        ValueError: If ``FLAGS.model_to_train`` names a category that has no
+            entry in the per-category settings. Raised before any training
+            starts.
+    """
+    # Using the Winograd non-fused algorithms provides a small performance boost.
+    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
+
+    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
+    sess_config = tf.ConfigProto(
+        allow_soft_placement=True,
+        log_device_placement=False,
+        intra_op_parallelism_threads=FLAGS.num_cpu_threads,
+        inter_op_parallelism_threads=FLAGS.num_cpu_threads,
+        gpu_options=gpu_options)
+
+    # Checkpoints are triggered by step count (FLAGS.save_checkpoints_steps);
+    # the time-based trigger is explicitly disabled first.
+    run_config = tf.estimator.RunConfig().replace(
+                                        save_checkpoints_secs=None).replace(
+                                        save_checkpoints_steps=FLAGS.save_checkpoints_steps).replace(
+                                        save_summary_steps=FLAGS.save_summary_steps).replace(
+                                        keep_checkpoint_max=5).replace(
+                                        tf_random_seed=FLAGS.tf_random_seed).replace(
+                                        log_step_count_steps=FLAGS.log_every_n_steps).replace(
+                                        session_config=sess_config)
+
+    # The return value is not used in this function; presumably the call
+    # validates FLAGS.batch_size against the available GPUs -- TODO confirm.
+    validate_batch_size_for_multi_gpu(FLAGS.batch_size)
+
+    # NOTE(review): cloud and local runs both use FLAGS.model_dir unchanged.
+    full_model_dir = FLAGS.model_dir
+
+    # The initial checkpoint location is the same for every category.
+    if FLAGS.run_on_cloud:
+        checkpoint_path = os.path.join(FLAGS.data_dir, FLAGS.cloud_checkpoint_path)
+    else:
+        checkpoint_path = FLAGS.checkpoint_path
+
+    # All categories share the same schedule; only the scope name and the
+    # paths derived from it differ.
+    categories = ('blouse', 'dress', 'outwear', 'skirt', 'trousers')
+    detail_params = {
+        category: {
+            'model_dir': os.path.join(full_model_dir, category),
+            'train_epochs': 30,
+            'epochs_per_eval': 30,
+            'lr_decay_factors': '1, 0.5, 0.1, 0.01',
+            'decay_boundaries': '15, 20, 28',
+            'model_scope': category,
+            'checkpoint_path': checkpoint_path,
+            'checkpoint_model_scope': '',
+            'checkpoint_exclude_scopes': category + '/additional_layer',
+            'ignore_missing_vars': True,
+        }
+        for category in categories
+    }
+    model_to_train = [s.strip() for s in FLAGS.model_to_train.split(',')]
+
+    # Fail fast: a misspelled name should not surface only after the models
+    # listed before it have already been trained.
+    unknown = [m for m in model_to_train if m not in detail_params]
+    if unknown:
+        raise ValueError('Unknown model(s) in model_to_train: {}; expected any of: {}.'.format(
+            ', '.join(repr(m) for m in unknown), ', '.join(categories)))
+
+    for m in model_to_train:
+        settings = detail_params[m]
+        sub_loop(keypoint_model_fn, m, settings['model_dir'], run_config,
+                 settings['train_epochs'], settings['epochs_per_eval'],
+                 settings['lr_decay_factors'], settings['decay_boundaries'],
+                 settings['checkpoint_path'], settings['checkpoint_exclude_scopes'],
+                 settings['checkpoint_model_scope'], settings['ignore_missing_vars'])
+
+if __name__ == '__main__':
+    # Script entry point: enable INFO-level TensorFlow logging, then let
+    # tf.app.run() take over.
+    tf.logging.set_verbosity(tf.logging.INFO)
+    tf.app.run()