mindspore-lab · hqkate · Jun 12, 2023 · Jun 12, 2023 · Jun 15, 2023 · Jun 16, 2023
diff --git a/configs/rec/crnn/crnn_icdar15.yaml b/configs/rec/crnn/crnn_icdar15.yaml
@@ -96,16 +96,12 @@ train:
           character_dict_path: *character_dict_path
           use_space_char: *use_space_char
           lower: True
-      - RecResizeImg: # different from paddle (paddle converts image from HWC to CHW and rescale to [-1, 1] after resize.
+      - RecResizeNormImg:
           image_shape: [32, 100] # H, W
           infer_mode: *infer_mode
           character_dict_path: *character_dict_path
           padding: False # aspect ratio will be preserved if true.
-      - NormalizeImage:  # different from paddle (paddle wrongly normalize BGR image with RGB mean/std from ImageNet for det, and simple rescale to [-1, 1] in rec.
-          bgr_to_rgb: True
-          is_hwc: True
-          mean : [127.0, 127.0, 127.0]
-          std : [127.0, 127.0, 127.0]
+          norm_before_pad: False
       - ToCHWImage:
     #  the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visaulize
     output_columns: ['image', 'text_seq'] #, 'length'] #'img_path']

diff --git a/configs/rec/crnn/crnn_resnet34.yaml b/configs/rec/crnn/crnn_resnet34.yaml
@@ -80,23 +80,19 @@ train:
     shuffle: True
     transform_pipeline:
       - DecodeImage:
-          img_mode: BGR
+          img_mode: RGB
           to_float32: False
       - RecCTCLabelEncode:
           max_text_len: *max_text_len
           character_dict_path: *character_dict_path
           use_space_char: *use_space_char
           lower: True
-      - RecResizeImg: # different from paddle (paddle converts image from HWC to CHW and rescale to [-1, 1] after resize.
+      - RecResizeNormImg:
           image_shape: [32, 100] # H, W
           infer_mode: *infer_mode
           character_dict_path: *character_dict_path
           padding: False # aspect ratio will be preserved if true.
-      - NormalizeImage:  # different from paddle (paddle wrongly normalize BGR image with RGB mean/std from ImageNet for det, and simple rescale to [-1, 1] in rec.
-          bgr_to_rgb: True
-          is_hwc: True
-          mean : [127.0, 127.0, 127.0]
-          std : [127.0, 127.0, 127.0]
+          norm_before_pad: False
       - ToCHWImage:
     #  the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visaulize
     output_columns: ['image', 'text_seq'] #, 'length'] #'img_path']

diff --git a/configs/rec/crnn/crnn_resnet34_ch.yaml b/configs/rec/crnn/crnn_resnet34_ch.yaml
@@ -84,7 +84,7 @@ train:
     max_text_len: *max_text_len
     transform_pipeline:
       - DecodeImage:
-          img_mode: BGR
+          img_mode: RGB
           to_float32: False
       - RecCTCLabelEncode:
           max_text_len: *max_text_len
@@ -94,16 +94,12 @@ train:
       - Rotate90IfVertical:
           threshold: 2.0
           direction: counterclockwise
-      - RecResizeImg:
-          image_shape: [32, 320]
+      - RecResizeNormImg:
+          image_shape: [32, 320] # H, W
           infer_mode: *infer_mode
           character_dict_path: *character_dict_path
-          padding: True
-      - NormalizeImage:
-          bgr_to_rgb: True
-          is_hwc: True
-          mean: [127.0, 127.0, 127.0]
-          std: [127.0, 127.0, 127.0]
+          padding: True # aspect ratio will be preserved if true.
+          norm_before_pad: False
       - ToCHWImage:
     output_columns: ["image", "text_seq"]
     net_input_column_index: [0]

diff --git a/configs/rec/crnn/crnn_resnet34_server.yaml b/configs/rec/crnn/crnn_resnet34_server.yaml
@@ -0,0 +1,150 @@
+system:  # run-level options for the crnn_resnet34_server config
+  mode: 0 # 0 for graph mode, 1 for pynative mode in MindSpore
+  distribute: True  # NOTE(review): presumably enables multi-device training - confirm against the launcher
+  amp_level: 'O3'  # quoted, so it is always read as a string
+  seed: 42
+  log_interval: 100  # NOTE(review): unit (steps vs. epochs) is not visible here - confirm
+  val_while_train: True
+  drop_overflow_update: False
+
+common:  # shared values; each one is anchored (&name) and reused below through *name aliases
+  character_dict_path: &character_dict_path  mindocr/utils/dict/en_dict.txt
+  num_classes: &num_classes 96 # num_chars_in_dict+1,  TODO: retrieve it from dict or check correctness
+  max_text_len: &max_text_len 24  # aliased by loss.max_label_len and RecCTCLabelEncode
+  infer_mode: &infer_mode False  # aliased by RecResizeNormImg in train and eval
+  use_space_char: &use_space_char True
+  lower: &lower False  # aliased by RecCTCLabelEncode in train and eval
+  batch_size: &batch_size 64  # aliased by loss.batch_size and train.loader.batch_size
+
+model:  # rec_resnet34 backbone -> RNNEncoder neck -> CTCHead
+  type: rec
+  transform: null  # explicit null: no transform module is configured
+  backbone:
+    name: rec_resnet34
+    pretrained: False
+  neck:
+    name: RNNEncoder
+    hidden_size: 256
+  head:
+    name: CTCHead
+    weight_init: crnn_customised
+    bias_init: crnn_customised
+    out_channels: *num_classes  # 96, from common.num_classes
+
+postprocess:  # decoder settings; both options below are aliases of values in common
+  name: RecCTCLabelDecode
+  character_dict_path: *character_dict_path
+  use_space_char: *use_space_char
+
+metric:  # evaluation metric settings
+  name: RecMetric
+  main_indicator: acc  # NOTE(review): presumably the score used to select best.ckpt - confirm
+  character_dict_path: *character_dict_path  # same dictionary as the label encoder and decoder
+  ignore_space: True
+  print_flag: False
+
+loss:  # CTC loss settings
+  name: CTCLoss
+  pred_seq_len: 25 # TODO: retrieve from the network output shape.
+  max_label_len: *max_text_len  # this value should be smaller than pred_seq_len (24 < 25 here)
+  batch_size: *batch_size  # 64, from common.batch_size
+
+scheduler:  # learning-rate schedule
+  scheduler: warmup_cosine_decay
+  min_lr: 0.000001
+  lr: 0.001
+  num_epochs: 30  # equals warmup_epochs + decay_epochs (2 + 28)
+  warmup_epochs: 2
+  decay_epochs: 28
+
+optimizer:  # optimizer settings
+  opt: adamw
+  filter_bias_and_bn: True  # NOTE(review): presumably excludes bias/BN params from weight decay - confirm
+  momentum: 0.95  # NOTE(review): whether adamw consumes momentum/nesterov is not visible here - confirm
+  weight_decay: 0.0001
+  nesterov: False
+
+loss_scaler:  # loss-scaling settings (system.amp_level is 'O3')
+  type: dynamic
+  loss_scale: 512  # NOTE(review): presumably the initial scale for dynamic scaling - confirm
+  scale_factor: 2.0
+  scale_window: 1000
+train:  # training checkpoints, dataset, transform pipeline and loader
+  ckpt_save_dir: './crnn_resnet34_server'
+  pred_cast_fp32: False # let CTCLoss cast internally
+  ema: True  # added in this config; NOTE(review): presumably weight EMA - confirm
+  dataset_sink_mode: False
+  dataset:
+    type: LMDBDataset
+    dataset_root: /path/to/data_lmdb_release/  # placeholder; replace with the local dataset location
+    data_dir: training/
+    # label_file: # not required when using LMDBDataset
+    sample_ratio: 1.0
+    shuffle: True
+    transform_pipeline:  # steps are listed in order: decode -> encode label -> resize/normalize -> to CHW
+      - DecodeImage:
+          img_mode: RGB
+          to_float32: False
+      - RecCTCLabelEncode:
+          max_text_len: *max_text_len
+          character_dict_path: *character_dict_path
+          use_space_char: *use_space_char
+          lower: *lower
+      - RecResizeNormImg:
+          image_shape: [32, 100] # H, W
+          infer_mode: *infer_mode
+          character_dict_path: *character_dict_path
+          padding: True # aspect ratio will be preserved if true.
+          norm_before_pad: True  # same value as the eval pipeline
+      - ToCHWImage:  # no arguments
+    #  the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visualize
+    output_columns: ['image', 'text_seq'] #, 'length'] #'img_path']
+    net_input_column_index: [0] # input indices for network forward func in output_columns
+    label_column_index: [1] # input indices marked as label
+    #keys_for_loss: 4 # num labels for loss func
+
+  loader:  # children are indented 4 spaces here (2 elsewhere); still valid YAML
+      shuffle: True
+      batch_size: *batch_size  # 64, from common.batch_size
+      drop_remainder: True
+      max_rowsize: 12
+      num_workers: 8
+
+eval:  # evaluation checkpoint, dataset, transform pipeline and loader
+  ckpt_load_path: ./crnn_resnet34_server/best.ckpt  # located under train.ckpt_save_dir
+  dataset_sink_mode: False
+  dataset:
+    type: LMDBDataset
+    dataset_root: /path/to/data_lmdb_release/  # placeholder; replace with the local dataset location
+    data_dir: validation/
+    # label_file: # not required when using LMDBDataset
+    sample_ratio: 1.0
+    shuffle: False
+    transform_pipeline:  # same steps and arguments as the train pipeline
+      - DecodeImage:
+          img_mode: RGB
+          to_float32: False
+      - RecCTCLabelEncode:
+          max_text_len: *max_text_len
+          character_dict_path: *character_dict_path
+          use_space_char: *use_space_char
+          lower: *lower
+      - RecResizeNormImg:
+          image_shape: [32, 100] # H, W
+          infer_mode: *infer_mode
+          character_dict_path: *character_dict_path
+          padding: True # aspect ratio will be preserved if true.
+          norm_before_pad: True  # same value as the train pipeline
+      - ToCHWImage:  # no arguments
+    #  the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visualize
+    output_columns: ['image', 'text_padded', 'text_length']  # TODO return text string padding w/ fixed length, and a scalar to indicate the length
+    net_input_column_index: [0] # input indices for network forward func in output_columns
+    label_column_index: [1, 2] # input indices marked as label
+
+  loader:  # children are indented 4 spaces here (2 elsewhere); still valid YAML
+      shuffle: False # TODO: tbc
+      batch_size: 64  # literal rather than the *batch_size alias; same value as common.batch_size
+      drop_remainder: False
+      max_rowsize: 12
+      num_workers: 8
diff --git a/configs/rec/crnn/crnn_vgg7.yaml b/configs/rec/crnn/crnn_vgg7.yaml
@@ -81,23 +81,19 @@ train:
     shuffle: True
     transform_pipeline:
       - DecodeImage:
-          img_mode: BGR
+          img_mode: RGB
           to_float32: False
       - RecCTCLabelEncode:
           max_text_len: *max_text_len
           character_dict_path: *character_dict_path
           use_space_char: *use_space_char
           lower: True
-      - RecResizeImg: # different from paddle (paddle converts image from HWC to CHW and rescale to [-1, 1] after resize.
+      - RecResizeNormImg:
           image_shape: [32, 100] # H, W
           infer_mode: *infer_mode
           character_dict_path: *character_dict_path
           padding: False # aspect ratio will be preserved if true.
-      - NormalizeImage:  # different from paddle (paddle wrongly normalize BGR image with RGB mean/std from ImageNet for det, and simple rescale to [-1, 1] in rec.
-          bgr_to_rgb: True
-          is_hwc: True
-          mean : [127.0, 127.0, 127.0]
-          std : [127.0, 127.0, 127.0]
+          norm_before_pad: False
       - ToCHWImage:
     #  the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visaulize
     output_columns: ['image', 'text_seq'] #, 'length'] #'img_path']

diff --git a/configs/rec/rare/rare_resnet34.yaml b/configs/rec/rare/rare_resnet34.yaml
@@ -83,16 +83,12 @@ train:
           character_dict_path: *character_dict_path
           use_space_char: *use_space_char
           lower: True
-      - RecResizeImg: # different from paddle (paddle converts image from HWC to CHW and rescale to [-1, 1] after resize.
+      - RecResizeNormImg:
           image_shape: [32, 100] # H, W
           infer_mode: *infer_mode
           character_dict_path: *character_dict_path
           padding: False # aspect ratio will be preserved if true.
-      - NormalizeImage: # different from paddle (paddle wrongly normalize BGR image with RGB mean/std from ImageNet for det, and simple rescale to [-1, 1] in rec.
-          bgr_to_rgb: True
-          is_hwc: True
-          mean: [127.0, 127.0, 127.0]
-          std: [127.0, 127.0, 127.0]
+          norm_before_pad: False
       - ToCHWImage:
     output_columns: ["image", "text_seq"]
     net_input_column_index: [0, 1] # input indices for network forward func in output_columns

diff --git a/configs/rec/rare/rare_resnet34_ch.yaml b/configs/rec/rare/rare_resnet34_ch.yaml
@@ -93,16 +93,12 @@ train:
       - Rotate90IfVertical:
           threshold: 2.0
           direction: counterclockwise
-      - RecResizeImg:
-          image_shape: [32, 320]
+      - RecResizeNormImg:
+          image_shape: [32, 320] # H, W
           infer_mode: *infer_mode
           character_dict_path: *character_dict_path
-          padding: True
-      - NormalizeImage:
-          bgr_to_rgb: True
-          is_hwc: True
-          mean: [127.0, 127.0, 127.0]
-          std: [127.0, 127.0, 127.0]
+          padding: True # aspect ratio will be preserved if true.
+          norm_before_pad: False
       - ToCHWImage:
     output_columns: ["image", "text_seq"]
     net_input_column_index: [0, 1]