diff --git a/preprocess.sh b/preprocess.sh index c5ca311..fe1e004 100644 --- a/preprocess.sh +++ b/preprocess.sh @@ -2,4 +2,6 @@ python3 crop.py python3 create_classification_dataset.py python3 classification_vgg.py python3 create_PSOL_dataset.py -python3 PSOL/generate_box_imagenet_crop.py \ No newline at end of file +python3 PSOL/generate_box_imagenet_crop.py +python3 tools/train.py -c svtr_large_train_stn.yml # Distributed Training: python -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c svtr_large_train_stn.yml +python3 tools/train.py -c svtr_large_train_stn.yml -o Global.pretrained_model=svtr_large_stn/best_accuracy \ No newline at end of file diff --git a/rec_PP-OCRv3.yml b/rec_PP-OCRv3.yml deleted file mode 100644 index b27c6d5..0000000 --- a/rec_PP-OCRv3.yml +++ /dev/null @@ -1,130 +0,0 @@ -Global: - debug: false - use_gpu: true - epoch_num: 500 - log_smooth_window: 20 - print_batch_step: 10 - save_model_dir: ./ppocrv3 - save_epoch_step: 200 - eval_batch_step: [0, 2000] - cal_metric_during_train: true - pretrained_model: - checkpoints: - save_inference_dir: - use_visualdl: false - infer_img: ./data/DDT_crop/test/ - character_dict_path: ppocr/utils/EN_symbol_dict.txt - max_text_length: &max_text_length 25 - infer_mode: false - use_space_char: true - distributed: true - save_res_path: ./ppocrv3.txt - - -Optimizer: - name: Adam - beta1: 0.9 - beta2: 0.999 - lr: - name: Cosine - learning_rate: 0.001 - warmup_epoch: 5 - regularizer: - name: L2 - factor: 3.0e-05 - - -Architecture: - model_type: rec - algorithm: SVTR - Transform: - Backbone: - name: MobileNetV1Enhance - scale: 0.5 - last_conv_stride: [1, 2] - last_pool_type: avg - Head: - name: MultiHead - head_list: - - CTCHead: - Neck: - name: svtr - dims: 64 - depth: 2 - hidden_dims: 120 - use_guide: True - Head: - fc_decay: 0.00001 - - SARHead: - enc_dim: 512 - max_text_length: *max_text_length - -Loss: - name: MultiLoss - loss_config_list: - - CTCLoss: - - SARLoss: - -PostProcess: - name: CTCLabelDecode - 
-Metric: - name: RecMetric - main_indicator: acc - ignore_space: False - -Train: - dataset: - name: SimpleDataSet - label_file_list: ['./data/train.txt'] - data_dir: './data/train_set_random/' - ratio_list: 1.0 - transforms: - - DecodeImage: - img_mode: BGR - channel_first: false - - RecConAug: - prob: 0.5 - ext_data_num: 2 - image_shape: [48, 480, 3] - - RecAug: - - MultiLabelEncode: - - RecResizeImg: - image_shape: [3, 48, 480] - - KeepKeys: - keep_keys: - - image - - label_ctc - - label_sar - - length - - valid_ratio - loader: - shuffle: true - batch_size_per_card: 256 - drop_last: true - num_workers: 4 -Eval: - dataset: - name: SimpleDataSet - label_file_list: ['./data/val.txt'] - data_dir: './data/train_set_random/' - ratio_list: 1.0 - transforms: - - DecodeImage: - img_mode: BGR - channel_first: false - - MultiLabelEncode: - - RecResizeImg: - image_shape: [3, 48, 480] - - KeepKeys: - keep_keys: - - image - - label_ctc - - label_sar - - length - - valid_ratio - loader: - shuffle: false - drop_last: false - batch_size_per_card: 128 - num_workers: 4 diff --git a/rec_svtrnet.yml b/rec_svtrnet.yml deleted file mode 100644 index f0bfa85..0000000 --- a/rec_svtrnet.yml +++ /dev/null @@ -1,118 +0,0 @@ -Global: - use_gpu: True - epoch_num: 500 - log_smooth_window: 20 - print_batch_step: 20 - save_model_dir: ./svtr/ - save_epoch_step: 200 - # evaluation is run every 2000 iterations after the 0th iteration - eval_batch_step: [0, 50000] - cal_metric_during_train: True - pretrained_model: - checkpoints: - save_inference_dir: - use_visualdl: False - infer_img: data/DDT_crop/test/ - # for data or label process - character_dict_path: ./ppocr/utils/EN_symbol_dict.txt - max_text_length: 25 - infer_mode: False - use_space_char: False - save_res_path: ./svtr.csv - - -Optimizer: - name: AdamW - beta1: 0.9 - beta2: 0.99 - epsilon: 0.00000008 - weight_decay: 0.05 - no_weight_decay_name: norm pos_embed - one_dim_param_no_weight_decay: true - lr: - name: Cosine - learning_rate: 0.0005 
- warmup_epoch: 2 - -Architecture: - model_type: rec - algorithm: SVTR - Transform: - name: STN_ON - tps_inputsize: [32, 64] - tps_outputsize: [32, 100] - num_control_points: 20 - tps_margins: [0.05,0.05] - stn_activation: none - Backbone: - name: SVTRNet - img_size: [32, 100] - out_char_num: 25 - out_channels: 192 - patch_merging: 'Conv' - embed_dim: [64, 128, 256] - depth: [3, 6, 3] - num_heads: [2, 4, 8] - mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] - local_mixer: [[7, 11], [7, 11], [7, 11]] - last_stage: True - prenorm: false - Neck: - name: SequenceEncoder - encoder_type: reshape - Head: - name: CTCHead - -Loss: - name: CTCLoss - -PostProcess: - name: CTCLabelDecode - -Metric: - name: RecMetric - main_indicator: acc - -Train: - dataset: - name: SimpleDataSet - label_file_list: ['./data/train.txt'] - data_dir: './data/DDT_crop/train' - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - CTCLabelEncode: # Class handling label - - RecResizeImg: - character_dict_path: - image_shape: [3, 24, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: True - batch_size_per_card: 256 - drop_last: True - num_workers: 6 - -Eval: - dataset: - name: SimpleDataSet - label_file_list: ['./data/val.txt'] - data_dir: './data/DDT_crop/train' - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - CTCLabelEncode: # Class handling label - - RecResizeImg: - character_dict_path: - image_shape: [3, 24, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: False - drop_last: False - batch_size_per_card: 64 - num_workers: 4 diff --git a/requirements.txt b/requirements.txt index 0d47333..1321a02 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,5 @@ cython lxml premailer 
openpyxl -attrdict \ No newline at end of file +attrdict +pyyaml \ No newline at end of file diff --git a/svtr_large.yml b/svtr_large.yml deleted file mode 100644 index 7e04955..0000000 --- a/svtr_large.yml +++ /dev/null @@ -1,119 +0,0 @@ -Global: - use_gpu: True - epoch_num: 500 - log_smooth_window: 20 - print_batch_step: 10 - save_model_dir: ./svtr_large - save_epoch_step: 200 - # evaluation is run every 2000 iterations after the 0th iteration - eval_batch_step: [0, 20000] - cal_metric_during_train: True - pretrained_model: - checkpoints: - save_inference_dir: - use_visualdl: False - infer_img: data/DDT_crop/test/ - # for data or label process - character_dict_path: ./ppocr/utils/EN_symbol_dict.txt - character_type: en - max_text_length: 25 - infer_mode: False - use_space_char: False - save_res_path: ./svtr.csv - - -Optimizer: - name: AdamW - beta1: 0.9 - beta2: 0.99 - epsilon: 0.00000008 - weight_decay: 0.05 - no_weight_decay_name: norm pos_embed - one_dim_param_no_weight_decay: true - lr: - name: Cosine - learning_rate: 0.000125 - warmup_epoch: 2 - -Architecture: - model_type: rec - algorithm: SVTR - Transform: - name: STN_ON - tps_inputsize: [32, 64] - tps_outputsize: [48, 160] - num_control_points: 20 - tps_margins: [0.05,0.05] - stn_activation: none - Backbone: - name: SVTRNet - img_size: [48, 160] - out_char_num: 40 - out_channels: 384 - patch_merging: 'Conv' - embed_dim: [192, 256, 512] - depth: [3, 9, 9] - num_heads: [6, 8, 16] - mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] - local_mixer: [[7, 11], [7, 11], [7, 11]] - prenorm: false - Neck: - name: SequenceEncoder - encoder_type: reshape - Head: - name: CTCHead - -Loss: - name: CTCLoss - -PostProcess: - name: SVTRLabelDecode # SVTRLabelDecode is used for eval after train, please change to CTCLabelDecode when training - -Metric: - name: RecMetric - main_indicator: 
acc - -Train: - dataset: - name: SimpleDataSet - label_file_list: ['./data/train.txt'] - data_dir: './data/DDT_crop/train' - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - RecAug: - - CTCLabelEncode: # Class handling label - - RecResizeImg: - character_dict_path: - image_shape: [3, 32, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: True - batch_size_per_card: 256 - drop_last: True - num_workers: 6 - -Eval: - dataset: - name: SimpleDataSet - label_file_list: ['./data/val.txt'] - data_dir: './data/DDT_crop/train' - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - CTCLabelEncode: # Class handling label - - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval after train, please change to RecResizeImg when training - character_dict_path: - image_shape: [3, 32, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: False - drop_last: False - batch_size_per_card: 128 - num_workers: 4 diff --git a/svtr_large_train_stn.yml b/svtr_large_train_stn.yml new file mode 100644 index 0000000..f6e5eb3 --- /dev/null +++ b/svtr_large_train_stn.yml @@ -0,0 +1,158 @@ +Global: + use_gpu: True + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./svtr_large_stn + save_epoch_step: 400 + # evaluation is run every 5000 iterations after the 0th iteration + eval_batch_step: [0, 5000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: data/DDT_crop/test/ + # for data or label process + character_dict_path: ./ppocr/utils/EN_symbol_dict.txt + max_text_length: 25 + infer_mode: False + use_space_char: False + save_res_path: ./svtr_large_stn.csv + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.99 + epsilon: 0.00000008 + weight_decay: 0.05 
+ no_weight_decay_name: norm pos_embed + one_dim_param_no_weight_decay: true + lr: + name: Cosine + learning_rate: 0.0005 + warmup_epoch: 10 +Architecture: + model_type: rec + algorithm: SVTR + Backbone: + name: SVTRNet + img_size: + - 32 + - 200 + out_char_num: 40 + out_channels: 384 + patch_merging: Conv + embed_dim: + - 192 + - 256 + - 512 + depth: + - 3 + - 9 + - 9 + num_heads: + - 6 + - 8 + - 16 + mixer: + - Local + - Local + - Local + - Local + - Local + - Local + - Local + - Local + - Local + - Local + - Global + - Global + - Global + - Global + - Global + - Global + - Global + - Global + - Global + - Global + - Global + local_mixer: + - - 7 + - 11 + - - 7 + - 11 + - - 7 + - 11 + prenorm: false + Neck: + name: SequenceEncoder + encoder_type: reshape + Head: + name: CTCHead +Loss: + name: CTCLoss +PostProcess: + name: CTCLabelDecode +Metric: + name: RecMetric + main_indicator: acc +Train: + dataset: + name: SimpleDataSet + label_file_list: + - ./data/train.txt + data_dir: ./data/DDT_crop/train + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecAug: null + - CTCLabelEncode: null + - RecResizeImg: + character_dict_path: null + image_shape: + - 3 + - 32 + - 200 + padding: false + - KeepKeys: + keep_keys: + - image + - label + - length + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 20 +Eval: + dataset: + name: SimpleDataSet + label_file_list: + - ./data/val.txt + data_dir: ./data/DDT_crop/train + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - CTCLabelEncode: null + - RecResizeImg: + character_dict_path: null + image_shape: + - 3 + - 32 + - 200 + padding: false + - KeepKeys: + keep_keys: + - image + - label + - length + loader: + shuffle: false + drop_last: false + batch_size_per_card: 64 + num_workers: 4 +profiler_options: null