From 97716c579a7b2a4c99910c629c5b4a95497baa8c Mon Sep 17 00:00:00 2001
From: Shank2358
Date: Tue, 27 Jul 2021 20:35:09 +0800
Subject: [PATCH] debug3

---
 .../events.out.tfevents.1627388720.Shank |  Bin 0 -> 40 bytes
 .../events.out.tfevents.1627388833.Shank |  Bin 0 -> 40 bytes
 modelR/lodet.py                          |    7 +-
 modelR/lodet_hbb.py                      |   42 +++
 modelR/loss/loss.py                      |    2 +-
 modelR/loss/loss_hbb.py                  |  107 ++++++++
 trainHBB.py                              |  241 ++++++++++++++++++
 trainR.py                                |   89 ++++---
 8 files changed, 446 insertions(+), 42 deletions(-)
 create mode 100644 log/event/events.out.tfevents.1627388720.Shank
 create mode 100644 log/event/events.out.tfevents.1627388833.Shank
 create mode 100644 modelR/lodet_hbb.py
 create mode 100644 modelR/loss/loss_hbb.py
 create mode 100644 trainHBB.py

diff --git a/log/event/events.out.tfevents.1627388720.Shank b/log/event/events.out.tfevents.1627388720.Shank
new file mode 100644
index 0000000000000000000000000000000000000000..814dad9bf4797cfbd692a410423e69a541c1900e
GIT binary patch
literal 40
rcmb1OfPlsI-b$QL-

+        if torch.cuda.device_count() >1: ## multi GPUs
+            print("Let's use", torch.cuda.device_count(), "GPUs!")
+            net_model = torch.nn.DataParallel(net_model)
+            self.model = net_model.to(self.device)
+        elif torch.cuda.device_count() ==1:
+            self.model = net_model.to(self.device) ## Single GPU
+
+        #self.optimizer = optim.SGD(self.model.parameters(), lr=cfg.TRAIN["LR_INIT"], momentum=cfg.TRAIN["MOMENTUM"], weight_decay=cfg.TRAIN["WEIGHT_DECAY"])
+        self.optimizer = optim.Adam(self.model.parameters(), lr=cfg.TRAIN["LR_INIT"])
+
+
+        self.criterion = Loss(anchors=cfg.MODEL["ANCHORS"], strides=cfg.MODEL["STRIDES"],
+                              iou_threshold_loss=cfg.TRAIN["IOU_THRESHOLD_LOSS"])
+
+        if resume:
+            self.__load_model_weights(weight_path)
+
+        self.scheduler = cosine_lr_scheduler.CosineDecayLR(self.optimizer,
+                                                           T_max=self.epochs*len(self.train_dataloader),
+                                                           lr_init=cfg.TRAIN["LR_INIT"],
+                                                           lr_min=cfg.TRAIN["LR_END"],
+                                                           warmup=cfg.TRAIN["WARMUP_EPOCHS"] * len(self.train_dataloader))
+
+
+
+
+    def __load_model_weights(self, weight_path):
+        last_weight = os.path.join(os.path.split(weight_path)[0], "last.pt")
+        chkpt = torch.load(last_weight, map_location=self.device)
+        self.model.load_state_dict(chkpt['model'])#, False
+        self.start_epoch = chkpt['epoch'] + 1
+        if chkpt['optimizer'] is not None:
+            self.optimizer.load_state_dict(chkpt['optimizer'])
+            self.best_mAP = chkpt['best_mAP']
+        del chkpt
+
+
+    def __save_model_weights(self, epoch, mAP):
+        if mAP > self.best_mAP:
+            self.best_mAP = mAP
+        best_weight = os.path.join(os.path.split(self.weight_path)[0], "best.pt")
+        last_weight = os.path.join(os.path.split(self.weight_path)[0], "last.pt")
+        chkpt = {'epoch': epoch,
+                 'best_mAP': self.best_mAP,
+                 'model': self.model.state_dict(),
+                 'optimizer': self.optimizer.state_dict()}
+        torch.save(chkpt, last_weight,_use_new_zipfile_serialization=False)
+
+        if self.best_mAP == mAP:
+            torch.save(chkpt['model'], best_weight,_use_new_zipfile_serialization=False)
+        if epoch > 0 and epoch % 5 == 0:
+            torch.save(chkpt, os.path.join(os.path.split(self.weight_path)[0], 'backup_epoch%g.pt'%epoch))
+        #
+        del chkpt
+
+    def __save_model_weights1(self, epoch, mAP):
+        if mAP > self.best_mAP:
+            self.best_mAP = mAP
+        best_weight = os.path.join(os.path.split(self.weight_path)[0], "best1.pt")
+        last_weight = os.path.join(os.path.split(self.weight_path)[0], "last1.pt")
+        chkpt = {'epoch': epoch,
+                 'best_mAP': self.best_mAP,
+                 'model': self.model.state_dict(),
+                 'optimizer': self.optimizer.state_dict()}
+        torch.save(chkpt, last_weight,_use_new_zipfile_serialization=False)
+
+        torch.save(chkpt['model'], best_weight, _use_new_zipfile_serialization=False)
+        torch.save(chkpt, os.path.join(os.path.split(self.weight_path)[0], 'backup_epoch%g.pt'%epoch))
+        #
+        del chkpt
+
+    def train(self):
+        global writer
+        logger.info(self.model)
+        logger.info(" Training start!  Img size:{:d},  Batchsize:{:d},  Number of workers:{:d}".format(
+            cfg.TRAIN["TRAIN_IMG_SIZE"], cfg.TRAIN["BATCH_SIZE"], cfg.TRAIN["NUMBER_WORKERS"]))
+        logger.info(" Train datasets number is : {}".format(len(self.train_dataset)))
+
+        for epoch in range(self.start_epoch, self.epochs):
+            start = time.time()
+            self.model.train()
+
+            '''
+            ##################################################################################
+            sr_flag = get_sr_flag(epoch, self.sr)
+            if self.prune == 1:
+                CBL_idx, _, prune_idx, shortcut_idx, _ = parse_module_defs2(self.model) ############
+                if self.sr:
+                    print('shortcut sparse training')
+            elif self.prune == 0:
+                CBL_idx, _, prune_idx = parse_module_defs(self.model) ############ model.cfg -> idx
+                if self.sr:
+                    print('normal sparse training ')
+            print(prune_idx)#[1, 3, 7, 10, 14, 17, 20, 23, 26, 29, 32, 35, 39, 42, 45, 48, 51, 54, 57, 60, 64, 67, 70, 73, 76, 77, 78, 79, 80, 81, 88, 89, 90, 91, 92, 93, 100, 101, 102, 103, 104, 105]
+            ###################################################################################
+            '''
+
+            mloss = torch.zeros(4)
+            mAP = 0
+            self.__save_model_weights1(epoch, mAP)
+            for i, (imgs, label_sbbox, label_mbbox, label_lbbox,
+                    sbboxes, mbboxes, lbboxes) in enumerate(self.train_dataloader):
+
+                self.scheduler.step(len(self.train_dataloader)*epoch + i)
+                imgs = imgs.to(self.device)
+                label_sbbox = label_sbbox.to(self.device)
+                label_mbbox = label_mbbox.to(self.device)
+                label_lbbox = label_lbbox.to(self.device)
+                sbboxes = sbboxes.to(self.device)
+                mbboxes = mbboxes.to(self.device)
+                lbboxes = lbboxes.to(self.device)
+                p, p_d = self.model(imgs)
+
+                loss, loss_iou, loss_conf, loss_cls = self.criterion(p, p_d, label_sbbox, label_mbbox,
+                                                                     label_lbbox, sbboxes, mbboxes, lbboxes)
+                self.optimizer.zero_grad()
+
+
+                loss.backward()
+                self.optimizer.step()
+
+                '''
+                ########################
+                idx2mask = None
+                # if opt.sr and opt.prune==1 and epoch > opt.epochs * 0.5:
+                #     idx2mask = get_mask2(model, prune_idx, 0.85)
+                ##self.model.module_list = self.model.module.module_list
+                BNOptimizer.updateBN(sr_flag, self.model, 0.001, prune_idx, epoch, idx2mask) ########### the part that actually applies the pruning update
+                ###################################################
+                '''
+
+                loss_items = torch.tensor([loss_iou, loss_conf, loss_cls, loss])
+                mloss = (mloss * i + loss_items) / (i + 1)
+
+                if i % 50 == 0:
+                    logger.info(
+                        " Epoch:[{:3}/{}] Batch:[{:3}/{}] Img_size:[{:3}] Loss:{:.4f} "
+                        "Loss_IoU:{:.4f} | Loss_Conf:{:.4f} | Loss_Cls:{:.4f} LR:{:g}".format(
+                            epoch, self.epochs, i, len(self.train_dataloader) - 1, self.train_dataset.img_size,
+                            mloss[3], mloss[0], mloss[1], mloss[2], self.optimizer.param_groups[0]['lr']
+                        ))
+                    writer.add_scalar('loss_iou', mloss[0], len(self.train_dataloader)
+                                      / (cfg.TRAIN["BATCH_SIZE"]) * epoch + i)
+                    writer.add_scalar('loss_conf', mloss[1], len(self.train_dataloader)
+                                      / (cfg.TRAIN["BATCH_SIZE"]) * epoch + i)
+                    writer.add_scalar('loss_cls', mloss[2], len(self.train_dataloader)
+                                      / (cfg.TRAIN["BATCH_SIZE"]) * epoch + i)
+                    writer.add_scalar('train_loss', mloss[3], len(self.train_dataloader)
+                                      / (cfg.TRAIN["BATCH_SIZE"]) * epoch + i)
+
+                if self.multi_scale_train and (i+1) % 10 == 0:
+                    self.train_dataset.img_size = random.choice(range(
+                        cfg.TRAIN["MULTI_TRAIN_RANGE"][0],
cfg.TRAIN["MULTI_TRAIN_RANGE"][1], + cfg.TRAIN["MULTI_TRAIN_RANGE"][2])) * 32 + + + if epoch >= 60 and epoch % 5 == 0 and cfg.TRAIN["EVAL_TYPE"] == 'VOC': + logger.info("===== Validate =====".format(epoch, self.epochs)) + with torch.no_grad(): + APs, inference_time = Evaluator(self.model).APs_voc() + for i in APs: + logger.info("{} --> mAP : {}".format(i, APs[i])) + mAP += APs[i] + mAP = mAP / self.train_dataset.num_classes + logger.info("mAP : {}".format(mAP)) + logger.info("inference time: {:.2f} ms".format(inference_time)) + writer.add_scalar('mAP', mAP, epoch) + + elif epoch >= 60 and epoch % 5 == 0 and cfg.TRAIN["EVAL_TYPE"] == 'COCO': + logger.info("===== Validate =====".format(epoch, self.epochs)) + with torch.no_grad(): + evaluator = COCOEvaluator(data_dir=cfg.DATA_PATH, + img_size=cfg.TEST["TEST_IMG_SIZE"], + confthre=cfg.TEST["CONF_THRESH"], + nmsthre=cfg.TEST["NMS_THRESH"]) + ap50_95, ap50, inference_time = evaluator.evaluate(self.model) + mAP = ap50 + logger.info('ap50_95:{} | ap50:{}'.format(ap50_95, ap50)) + logger.info("inference time: {:.2f} ms".format(inference_time)) + writer.add_scalar('val/COCOAP50', ap50, epoch) + writer.add_scalar('val/COCOAP50_95', ap50_95, epoch) + + self.__save_model_weights(epoch, mAP) + logger.info('Save weights Done') + logger.info("mAP: {:.3f}".format(mAP)) + end = time.time() + logger.info("Inference time: {:.4f}s".format(end - start)) + + logger.info("Training finished. Best_mAP: {:.3f}%".format(self.best_mAP)) + +if __name__ == "__main__": + global logger, writer + parser = argparse.ArgumentParser() + parser.add_argument('--weight_path', type=str, default='weight/mobilenetv2_1.0-0c6065bc.pth', + help='weight file path') #default=None + parser.add_argument('--resume', action='store_true',default=False, help='resume training flag') + parser.add_argument('--gpu_id', type=int, default=0, help='gpu id') + parser.add_argument('--log_path', type=str, default='log/', help='log path') + opt = parser.parse_args() + writer = SummaryWriter(logdir=opt.log_path + '/event') + logger = Logger(log_file_name=opt.log_path + '/log.txt', log_level=logging.DEBUG, logger_name='NPMMRDet').get_log() + + Trainer(weight_path=opt.weight_path, resume=opt.resume, gpu_id=opt.gpu_id).train() \ No newline at end of file diff --git a/trainR.py b/trainR.py index 9437967..3cd94aa 100644 --- a/trainR.py +++ b/trainR.py @@ -16,8 +16,6 @@ class Trainer(object): def __init__(self, weight_path, resume, gpu_id): init_seeds(0) - self.prune=0 - self.sr=True self.device = gpu.select_device(gpu_id) print(self.device) self.start_epoch = 0 @@ -32,8 +30,7 @@ def __init__(self, weight_path, resume, gpu_id): self.train_dataloader = DataLoader(self.train_dataset, batch_size=cfg.TRAIN["BATCH_SIZE"], num_workers=cfg.TRAIN["NUMBER_WORKERS"], - shuffle=True, - pin_memory=True) + shuffle=True) net_model = LODet() if torch.cuda.device_count() >1: ## multi GPUs @@ -43,15 +40,15 @@ def __init__(self, weight_path, resume, gpu_id): elif torch.cuda.device_count() ==1: self.model = net_model.to(self.device) ## Single GPU - #self.optimizer = optim.SGD(self.model.parameters(), lr=cfg.TRAIN["LR_INIT"], momentum=cfg.TRAIN["MOMENTUM"], weight_decay=cfg.TRAIN["WEIGHT_DECAY"]) - self.optimizer = optim.Adam(self.model.parameters(), lr=cfg.TRAIN["LR_INIT"]) - + self.optimizer = optim.SGD(self.model.parameters(), lr=cfg.TRAIN["LR_INIT"], + momentum=cfg.TRAIN["MOMENTUM"], weight_decay=cfg.TRAIN["WEIGHT_DECAY"]) self.criterion = Loss(anchors=cfg.MODEL["ANCHORS"], strides=cfg.MODEL["STRIDES"], 
                               iou_threshold_loss=cfg.TRAIN["IOU_THRESHOLD_LOSS"])
 
         if resume:
             self.__load_model_weights(weight_path)
+            #self.__save_model_weights_best(160)
 
         self.scheduler = cosine_lr_scheduler.CosineDecayLR(self.optimizer,
                                                            T_max=self.epochs*len(self.train_dataloader),
@@ -59,6 +56,21 @@ def __init__(self, weight_path, resume, gpu_id):
                                                            lr_min=cfg.TRAIN["LR_END"],
                                                            warmup=cfg.TRAIN["WARMUP_EPOCHS"] * len(self.train_dataloader))
 
+    '''
+    def __load_model_weights(self, weight_path, resume):
+        if resume:
+            last_weight = os.path.join(os.path.split(weight_path)[0], "last.pt")
+            chkpt = torch.load(last_weight, map_location=self.device)
+            self.model.load_state_dict(chkpt['model'])#, False
+            self.start_epoch = chkpt['epoch'] + 1
+            if chkpt['optimizer'] is not None:
+                self.optimizer.load_state_dict(chkpt['optimizer'])
+                self.best_mAP = chkpt['best_mAP']
+            del chkpt
+        #else:
+            #self.model.load_darknet_weights(weight_path) ## Single GPU
+            #self.model.module.load_darknet_weights(weight_path) ## multi GPUs
+    '''
     def __load_model_weights(self, weight_path):
         last_weight = os.path.join(os.path.split(weight_path)[0], "last.pt")
         chkpt = torch.load(last_weight, map_location=self.device)
@@ -79,24 +91,21 @@ def __save_model_weights(self, epoch, mAP):
                  'model': self.model.state_dict(),
                  'optimizer': self.optimizer.state_dict()}
         torch.save(chkpt, last_weight,_use_new_zipfile_serialization=False)
+
         if self.best_mAP == mAP:
             torch.save(chkpt['model'], best_weight,_use_new_zipfile_serialization=False)
         if epoch > 0 and epoch % 5 == 0:
             torch.save(chkpt, os.path.join(os.path.split(self.weight_path)[0], 'backup_epoch%g.pt'%epoch))
+        #
         del chkpt
 
-    def __save_model_weights1(self, epoch, mAP):
-        if mAP > self.best_mAP:
-            self.best_mAP = mAP
-        best_weight = os.path.join(os.path.split(self.weight_path)[0], "best1.pt")
-        last_weight = os.path.join(os.path.split(self.weight_path)[0], "last1.pt")
+    def __save_model_weights_best(self, epoch):
+        best_weight = os.path.join(os.path.split(self.weight_path)[0], "best.pt")
         chkpt = {'epoch': epoch,
                  'best_mAP': self.best_mAP,
                  'model': self.model.state_dict(),
                  'optimizer': self.optimizer.state_dict()}
-        torch.save(chkpt, last_weight,_use_new_zipfile_serialization=False)
-        torch.save(chkpt['model'], best_weight, _use_new_zipfile_serialization=False)
-        torch.save(chkpt, os.path.join(os.path.split(self.weight_path)[0], 'backup_epoch%g.pt'%epoch))
+        torch.save(chkpt['model'], best_weight,_use_new_zipfile_serialization=False)
         del chkpt
 
     def train(self):
@@ -109,12 +118,9 @@ def train(self):
         for epoch in range(self.start_epoch, self.epochs):
             start = time.time()
             self.model.train()
+            mloss = torch.zeros(7)
 
-            mloss = torch.zeros(4)
-            mAP = 0
-            #self.__save_model_weights1(epoch, mAP)
-            for i, (imgs, label_sbbox, label_mbbox, label_lbbox,
-                    sbboxes, mbboxes, lbboxes) in enumerate(self.train_dataloader):
+            for i, (imgs, label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes) in enumerate(self.train_dataloader):
 
                 self.scheduler.step(len(self.train_dataloader)*epoch + i)
                 imgs = imgs.to(self.device)
@@ -125,39 +131,43 @@ def train(self):
                 mbboxes = mbboxes.to(self.device)
                 lbboxes = lbboxes.to(self.device)
                 p, p_d = self.model(imgs)
-
-                loss, loss_iou, loss_conf, loss_cls = self.criterion(p, p_d, label_sbbox, label_mbbox,
+                loss, loss_iou, loss_conf, loss_cls, loss_a, loss_r, loss_s = self.criterion(p, p_d, label_sbbox, label_mbbox,
                                                                      label_lbbox, sbboxes, mbboxes, lbboxes)
                 self.optimizer.zero_grad()
-
                 loss.backward()
                 self.optimizer.step()
 
-                loss_items = torch.tensor([loss_iou, loss_conf, loss_cls, loss])
+                loss_items = torch.tensor([loss_iou, loss_conf, loss_cls, loss_a, loss_r, loss_s, loss])
                 mloss = (mloss * i + loss_items) / (i + 1)
-
+                mAP = 0
                 if i % 50 == 0:
                     logger.info(
                         " Epoch:[{:3}/{}] Batch:[{:3}/{}] Img_size:[{:3}] Loss:{:.4f} "
-                        "Loss_IoU:{:.4f} | Loss_Conf:{:.4f} | Loss_Cls:{:.4f} LR:{:g}".format(
+                        "Loss_IoU:{:.4f} | Loss_Conf:{:.4f} | Loss_Cls:{:.4f} | Loss_avgA:{:.4f} | Loss_R:{:.4f} | Loss_S:{:.4f} LR:{:g}".format(
                             epoch, self.epochs, i, len(self.train_dataloader) - 1, self.train_dataset.img_size,
-                            mloss[3], mloss[0], mloss[1], mloss[2], self.optimizer.param_groups[0]['lr']
+                            mloss[6], mloss[0], mloss[1], mloss[2], mloss[3], mloss[4], mloss[5], self.optimizer.param_groups[0]['lr']
                         ))
                     writer.add_scalar('loss_iou', mloss[0], len(self.train_dataloader)
-                                      * (cfg.TRAIN["BATCH_SIZE"]) * epoch + i)
+                                      / (cfg.TRAIN["BATCH_SIZE"]) * epoch + i)
                     writer.add_scalar('loss_conf', mloss[1], len(self.train_dataloader)
-                                      * (cfg.TRAIN["BATCH_SIZE"]) * epoch + i)
+                                      / (cfg.TRAIN["BATCH_SIZE"]) * epoch + i)
                     writer.add_scalar('loss_cls', mloss[2], len(self.train_dataloader)
-                                      * (cfg.TRAIN["BATCH_SIZE"]) * epoch + i)
-                    writer.add_scalar('train_loss', mloss[3], len(self.train_dataloader)
-                                      * (cfg.TRAIN["BATCH_SIZE"]) * epoch + i)
+                                      / (cfg.TRAIN["BATCH_SIZE"]) * epoch + i)
+                    writer.add_scalar('loss_a', mloss[3], len(self.train_dataloader)
+                                      / (cfg.TRAIN["BATCH_SIZE"]) * epoch + i)
+                    writer.add_scalar('loss_r', mloss[4], len(self.train_dataloader)
+                                      / (cfg.TRAIN["BATCH_SIZE"]) * epoch + i)
+                    writer.add_scalar('loss_s', mloss[5], len(self.train_dataloader)
+                                      / (cfg.TRAIN["BATCH_SIZE"]) * epoch + i)
+                    writer.add_scalar('train_loss', mloss[6], len(self.train_dataloader)
+                                      / (cfg.TRAIN["BATCH_SIZE"]) * epoch + i)
 
                 if self.multi_scale_train and (i+1) % 10 == 0:
                     self.train_dataset.img_size = random.choice(range(
                         cfg.TRAIN["MULTI_TRAIN_RANGE"][0], cfg.TRAIN["MULTI_TRAIN_RANGE"][1],
                         cfg.TRAIN["MULTI_TRAIN_RANGE"][2])) * 32
-
-            if epoch >= 30 and epoch % 5 == 0 and cfg.TRAIN["EVAL_TYPE"] == 'VOC':
+            self.__save_model_weights(epoch, mAP)
+            if epoch >= 70 and epoch % 5 == 0 and cfg.TRAIN["EVAL_TYPE"] == 'VOC':
                 logger.info("===== Validate =====".format(epoch, self.epochs))
                 with torch.no_grad():
                     APs, inference_time = Evaluator(self.model).APs_voc()
@@ -169,7 +179,7 @@ def train(self):
                     logger.info("inference time: {:.2f} ms".format(inference_time))
                     writer.add_scalar('mAP', mAP, epoch)
 
-            elif epoch >= 30 and epoch % 5 == 0 and cfg.TRAIN["EVAL_TYPE"] == 'COCO':
+            elif epoch >= 50 and epoch % 5 == 0 and cfg.TRAIN["EVAL_TYPE"] == 'COCO':
                 logger.info("===== Validate =====".format(epoch, self.epochs))
                 with torch.no_grad():
                     evaluator = COCOEvaluator(data_dir=cfg.DATA_PATH,
@@ -179,6 +189,7 @@ def train(self):
                     ap50_95, ap50, inference_time = evaluator.evaluate(self.model)
                     mAP = ap50
                     logger.info('ap50_95:{} | ap50:{}'.format(ap50_95, ap50))
+                    logger.info("mAP: {:.3f}".format(mAP))
                     logger.info("inference time: {:.2f} ms".format(inference_time))
                     writer.add_scalar('val/COCOAP50', ap50, epoch)
                     writer.add_scalar('val/COCOAP50_95', ap50_95, epoch)
@@ -194,12 +205,14 @@ def train(self):
 if __name__ == "__main__":
     global logger, writer
     parser = argparse.ArgumentParser()
-    parser.add_argument('--weight_path', type=str, default='weight/mobilenetv2_1.0-0c6065bc.pth',
-                        help='weight file path') #default=None
+    parser.add_argument('--weight_path', type=str, default=None#'weight/mobilenetv2_1.0-0c6065bc.pth'
+                        , help='weight file path') #default=None
+    #parser.add_argument('--weight_path', type=str, default='weight/yolov4.conv.137', help='weight file path') # default=None
     parser.add_argument('--resume', action='store_true',default=False, help='resume training flag')
     parser.add_argument('--gpu_id', type=int, default=0, help='gpu id')
     parser.add_argument('--log_path', type=str, default='log/', help='log path')
     opt = parser.parse_args()
     writer = SummaryWriter(logdir=opt.log_path + '/event')
-    logger = Logger(log_file_name=opt.log_path + '/log.txt', log_level=logging.DEBUG, logger_name='LODet').get_log()
+    logger = Logger(log_file_name=opt.log_path + '/log.txt', log_level=logging.DEBUG, logger_name='NPMMRDet').get_log()
+
     Trainer(weight_path=opt.weight_path, resume=opt.resume, gpu_id=opt.gpu_id).train()
\ No newline at end of file
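
Note on the learning-rate schedule used above: both trainHBB.py and trainR.py construct cosine_lr_scheduler.CosineDecayLR(optimizer, T_max=epochs*len(dataloader), lr_init=..., lr_min=..., warmup=WARMUP_EPOCHS*len(dataloader)) and call self.scheduler.step(len(self.train_dataloader)*epoch + i) once per batch. The scheduler implementation itself is not part of this patch; the code below is only a minimal sketch with the same constructor arguments and per-step step(t) interface, assuming linear warmup followed by cosine decay from lr_init to lr_min.

import math
from torch import nn, optim

class CosineDecayLR:
    """Minimal stand-in: linear warmup, then cosine decay from lr_init to lr_min."""
    def __init__(self, optimizer, T_max, lr_init, lr_min, warmup=0):
        self.optimizer = optimizer
        self.T_max = T_max      # total optimizer steps: epochs * iterations per epoch
        self.lr_init = lr_init
        self.lr_min = lr_min
        self.warmup = warmup    # warmup length in steps

    def step(self, t):
        if self.warmup and t < self.warmup:
            lr = self.lr_init * t / self.warmup                    # linear warmup
        else:
            T = max(self.T_max - self.warmup, 1)
            tt = min(max(t - self.warmup, 0), T)
            lr = self.lr_min + 0.5 * (self.lr_init - self.lr_min) * (1 + math.cos(math.pi * tt / T))
        for group in self.optimizer.param_groups:                  # apply to all param groups
            group['lr'] = lr
        return lr

# usage mirroring the patch: one scheduler step per batch, indexed by the global iteration
model = nn.Linear(8, 4)
optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
epochs, iters_per_epoch, warmup_epochs = 2, 10, 1
scheduler = CosineDecayLR(optimizer, T_max=epochs * iters_per_epoch,
                          lr_init=1e-3, lr_min=1e-6,
                          warmup=warmup_epochs * iters_per_epoch)
for epoch in range(epochs):
    for i in range(iters_per_epoch):
        scheduler.step(iters_per_epoch * epoch + i)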
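
Note on checkpointing: __save_model_weights writes the full training state (epoch, best_mAP, model and optimizer state_dicts) to last.pt every epoch, copies the bare state_dict to best.pt whenever the current mAP equals the running best, and keeps a backup_epoch%g.pt every five epochs; __load_model_weights reverses this for --resume. The sketch below restates that convention outside the Trainer class; the function names and the weight_dir argument are illustrative, not part of the patch, and the patch's _use_new_zipfile_serialization=False argument to torch.save is omitted.

import os
import torch

def save_checkpoint(model, optimizer, epoch, mAP, best_mAP, weight_dir):
    # Sketch of the last.pt / best.pt convention used by __save_model_weights.
    best_mAP = max(best_mAP, mAP)
    chkpt = {'epoch': epoch,
             'best_mAP': best_mAP,
             'model': model.state_dict(),
             'optimizer': optimizer.state_dict()}
    torch.save(chkpt, os.path.join(weight_dir, "last.pt"))        # overwritten every epoch
    if best_mAP == mAP:                                           # new best: weights only
        torch.save(chkpt['model'], os.path.join(weight_dir, "best.pt"))
    if epoch > 0 and epoch % 5 == 0:                              # periodic full backup
        torch.save(chkpt, os.path.join(weight_dir, 'backup_epoch%g.pt' % epoch))
    return best_mAP

def load_checkpoint(model, optimizer, weight_dir, device):
    # Sketch of the --resume path in __load_model_weights.
    chkpt = torch.load(os.path.join(weight_dir, "last.pt"), map_location=device)
    model.load_state_dict(chkpt['model'])
    if chkpt.get('optimizer') is not None:
        optimizer.load_state_dict(chkpt['optimizer'])
    return chkpt['epoch'] + 1, chkpt.get('best_mAP', 0.0)         # start_epoch, best_mAP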
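
Note on the logged losses: inside the batch loop both scripts keep mloss as an incremental mean of the per-batch loss terms, mloss = (mloss * i + loss_items) / (i + 1), so the values printed every 50 batches and written to TensorBoard are epoch-so-far averages rather than single-batch losses. A small self-contained sketch of that update, with dummy values standing in for the real criterion outputs:

import torch

num_terms = 7                      # iou, conf, cls, a, r, s, total in the updated trainR.py
mloss = torch.zeros(num_terms)
dummy_batches = [torch.rand(num_terms) for _ in range(5)]   # stand-ins for per-batch losses
for i, loss_items in enumerate(dummy_batches):
    mloss = (mloss * i + loss_items) / (i + 1)              # running mean over batches 0..i
# after the loop, mloss equals the arithmetic mean of all batches seen so far
assert torch.allclose(mloss, torch.stack(dummy_batches).mean(dim=0))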