diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..a1fd792 Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore index 82f9275..b37a559 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ parts/ sdist/ var/ wheels/ +pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg @@ -49,7 +50,6 @@ coverage.xml *.py,cover .hypothesis/ .pytest_cache/ -cover/ # Translations *.mo @@ -72,7 +72,6 @@ instance/ docs/_build/ # PyBuilder -.pybuilder/ target/ # Jupyter Notebook @@ -83,9 +82,7 @@ profile_default/ ipython_config.py # pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version +.python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. @@ -94,24 +91,7 @@ ipython_config.py # install all needed dependencies. #Pipfile.lock -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/latest/usage/project/#working-with-version-control -.pdm.toml -.pdm-python -.pdm-build/ - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +# PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff @@ -148,15 +128,13 @@ dmypy.json # Pyre type checker .pyre/ -# pytype static type analyzer -.pytype/ +# Tensorflow event emitter +events.out.* +training_log.txt -# Cython debug symbols -cython_debug/ +# +.vscode -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-#.idea/ +*.ckpt +Models/Lseg/lseg/* +PCAonGPU/PCA_instance/* \ No newline at end of file diff --git a/Config/LatentBKI_default.yaml b/Config/LatentBKI_default.yaml new file mode 100644 index 0000000..2efa819 --- /dev/null +++ b/Config/LatentBKI_default.yaml @@ -0,0 +1,13 @@ +dataset: "mp3d" +meas_result: True +with_variance: False +use_relative_pose: True +pseduo_discrete: True +save_map: True +result_split: "val" +grid_params: + grid_size: [ 100.0, 100.0, 100.0 ] + min_bound: [-5.0, -5.0, -5.0] + max_bound: [ 5.0, 5.0, 5.0] +filter_size: 3 +ell: 0.5 \ No newline at end of file diff --git a/Config/LatentBKI_kitti.yaml b/Config/LatentBKI_kitti.yaml new file mode 100644 index 0000000..016dc4a --- /dev/null +++ b/Config/LatentBKI_kitti.yaml @@ -0,0 +1,12 @@ +dataset: "semantic_kitti" +meas_result: True +with_variance: False +use_relative_pose: True +pseduo_discrete: True +save_map: True +grid_params: + grid_size: [ 400.0, 400.0, 26.0 ] + min_bound: [ -40.0, -40.0, -2.6 ] + max_bound: [ 40.0, 40.0, 2.6 ] +filter_size: 3 +ell: 0.5 \ No newline at end of file diff --git a/Config/LatentBKI_realworld.yaml b/Config/LatentBKI_realworld.yaml new file mode 100644 index 0000000..5521d18 --- /dev/null +++ b/Config/LatentBKI_realworld.yaml @@ -0,0 +1,13 @@ +dataset: "realworld" +meas_result: True +with_variance: False +use_relative_pose: True +pseduo_discrete: True +save_map: True +result_split: "val" +grid_params: + grid_size: [ 200.0, 200.0, 80.0,] + min_bound: [-5.0, -5.0, -2.0, ] + max_bound: [ 5.0, 5.0, 2.0,] +filter_size: 3 +ell: 0.5 \ No newline at end of file diff --git a/Config/LatentBKI_vlmap.yaml b/Config/LatentBKI_vlmap.yaml new file mode 100644 index 0000000..e9ffcc9 --- /dev/null +++ b/Config/LatentBKI_vlmap.yaml @@ -0,0 +1,12 @@ +dataset: "mp3d" +meas_result: True +with_variance: False +use_relative_pose: True +pseduo_discrete: True +save_map: True +grid_params: + grid_size: [ 240.0, 240.0, 80.0 ] + min_bound: [-6.0, -6.0, -2.0] + max_bound: [ 6.0, 6.0, 2.0] +filter_size: 1 +ell: 0.5 \ No newline at end of file diff --git a/Config/mp3d.yaml b/Config/mp3d.yaml new file mode 100644 index 0000000..4952dc6 --- /dev/null +++ b/Config/mp3d.yaml @@ -0,0 +1,59 @@ +num_classes: 40 +data_dir: "/mp3d/vlmaps_data_dir/vlmaps_dataset/" +feature_dir: 'lseg_feature' +pca_path: "PCAonGPU/PCA_instance/mp3d_pca_64.pkl" +feature_size: 64 +grid_mask: False +down_sample_feature: True +raw_data: True +subsample_points: 1 +intrinsic: [540, 0, 540, 0, 540, 360, 0, 0, 1] + +sequences: [ + '5LpN3gDmAk7_1', + 'gTV8FGcVJC9_1', +] + +category: [ + "void", + "wall", + "floor", + "chair", + "door", + "table", + "picture", + "cabinet", + "cushion", + "window", + "sofa", + "bed", + "curtain", + "chest_of_drawers", + "plant", + "sink", + "stairs", + "ceiling", + "toilet", + "stool", + "towel", + "mirror", + "tv_monitor", + "shower", + "column", + "bathtub", + "counter", + "fireplace", + "lighting", + "beam", + "railing", + "shelving", + "blinds", + "gym_equipment", + "seating", + "board_panel", + "furniture", + "appliances", + "clothes", + "objects", + # "misc", +] \ No newline at end of file diff --git a/Config/realworld.yaml b/Config/realworld.yaml new file mode 100644 index 0000000..9f19cd9 --- /dev/null +++ b/Config/realworld.yaml @@ -0,0 +1,27 @@ +num_classes: 10 +data_dir: "/mp3d/real_world" +feature_dir: 'lseg_feature' +pca_path: "PCAonGPU/PCA_instance/mp3d_pca_64.pkl" +feature_size: 64 +grid_mask: True +down_sample_feature: True +raw_data: False +subsample_points: 1 +intrinsic: [8.009776000976562500e+02, 0, 
4.731800537109375000e+02, 0, 8.009776000976562500e+02, 3.628266601562500000e+02, 0, 0, 1] + +sequences: [ + "my_house_long", +] + +category: [ + "wall", + "floor", + "ceiling", + "chair", + "table", + "TV_screen", + "bed", + "window", + "lightings", + "other", +] \ No newline at end of file diff --git a/Config/semantic_kitti.yaml b/Config/semantic_kitti.yaml new file mode 100644 index 0000000..f28fd68 --- /dev/null +++ b/Config/semantic_kitti.yaml @@ -0,0 +1,12 @@ +num_classes: 20 +data_dir: "" # defined in SPVCNN kitti data config +feature_dir: "" # defined in SPVCNN kitti data config +pca_path: "" # never used +feature_size: 96 +grid_mask: True +down_sample_feature: False +subsample_points: 1 +raw_data: False +intrinsic: [] # don't use intrinsic +sequences: [] # defined in SPVCNN kitti data, choose val which only contain sequence 08 +category: [] # defined in SPVCNN kitti data config \ No newline at end of file diff --git a/Data/KITTI_SPVCNN.py b/Data/KITTI_SPVCNN.py new file mode 100644 index 0000000..a4e5df5 --- /dev/null +++ b/Data/KITTI_SPVCNN.py @@ -0,0 +1,118 @@ +import torch +import yaml +import importlib + +from easydict import EasyDict + +from torch.utils.data import Dataset +from TwoDPASS.dataloader.dataset import get_model_class, get_collate_class +from TwoDPASS.dataloader.pc_dataset import get_pc_model_class + +def load_yaml(file_name): + with open(file_name, 'r') as f: + try: + config = yaml.load(f, Loader=yaml.FullLoader) + except: + config = yaml.load(f) + return config + + +def parse_config(): + args = {} + args['config_path'] = 'TwoDPASS/config/SPVCNN-semantickitti.yaml' + args['seed'] = 0 + args['gpu'] = (0,) + + # training + args['log_dir'] = 'default' + args['monitor'] = 'val/mIoU' + args['stop_patience'] = 50 + args['save_top_k'] = 1 + args['check_val_every_n_epoch'] = 1 + args['SWA'] = False + args['baseline_only'] = False + # testing + args['test'] = True + args['fine_tune'] = False + args['pretrain2d'] = False + args['num_vote'] = 1 + args['submit_to_server'] = False + args['checkpoint'] = 'TwoDPASS/pretrained/SPVCNN/best_model.ckpt' + # debug + args['debug'] = False + + config = load_yaml(args['config_path']) + config.update(args) # override the configuration using the value in args + + # voting test + if args['test']: + config['dataset_params']['val_data_loader']['batch_size'] = args['num_vote'] + if args['num_vote'] > 1: + config['dataset_params']['val_data_loader']['rotate_aug'] = True + config['dataset_params']['val_data_loader']['transform_aug'] = True + if args['debug']: + config['dataset_params']['val_data_loader']['batch_size'] = 2 + config['dataset_params']['val_data_loader']['num_workers'] = 0 + + return EasyDict(config) + +class KITTI_SPVCNN_config(): + def __init__(self) -> None: + self.config = parse_config() + +class KITTI_SPVCNN(Dataset): + def __init__(self, device, grid_params, grid_mask=True) -> None: + super().__init__() + self.device = device + self.grid_mask = grid_mask + self._grid_size = grid_params['grid_size'] + self.coor_ranges = grid_params['min_bound'] + grid_params['max_bound'] + self.voxel_sizes = [abs(self.coor_ranges[3] - self.coor_ranges[0]) / self._grid_size[0], + abs(self.coor_ranges[4] - self.coor_ranges[1]) / self._grid_size[1], + abs(self.coor_ranges[5] - self.coor_ranges[2]) / self._grid_size[2]] + self.min_bound = torch.tensor(self.coor_ranges[:3]) + self.max_bound = torch.tensor(self.coor_ranges[3:]) + self.config = parse_config() + self.init_dataset() + self.init_model() + + def init_dataset(self): + pc_dataset = 
get_pc_model_class(self.config['dataset_params']['pc_dataset_type']) + dataset_type = get_model_class(self.config['dataset_params']['dataset_type']) + val_config = self.config['dataset_params']['val_data_loader'] + val_pt_dataset = pc_dataset(self.config, data_path=val_config['data_path'], imageset='val', num_vote=val_config["batch_size"]) + + self.kitti_dataset = dataset_type(val_pt_dataset, self.config, val_config, num_vote=val_config["batch_size"]) + self.collate_fn = get_collate_class(self.config['dataset_params']['collate_type']) + + def init_model(self): + model_file = importlib.import_module('TwoDPASS.network.' + self.config['model_params']['model_architecture']) + my_model = model_file.get_model(self.config) ######## get model ############ + my_model = my_model.load_from_checkpoint(self.config.checkpoint, config=self.config, strict=(not self.config.pretrain2d)) + my_model = my_model.eval() + + self.my_model = my_model.to(self.device) + + def __len__(self): + return len(self.kitti_dataset) + + def __getitem__(self, idx): + return self.get_test_item(idx) + + def get_test_item(self, idx): + data_dict = self.collate_fn([self.kitti_dataset[idx]]) + with torch.no_grad(): + features = self.my_model.encode_points(data_dict, self.device) + + points = data_dict['points'].F + gt_labels = data_dict['targets_mapped'].F.reshape(-1,1) + + # only take points in the grid + if self.grid_mask: + grid_point_mask = torch.all( (points < self.max_bound) & (points >= self.min_bound), axis=1) + points = points[grid_point_mask] + gt_labels = gt_labels[grid_point_mask] + features = features[grid_point_mask] + + return data_dict['global_pose'][0], points, features, gt_labels, data_dict['scene_id'][0], data_dict['frame_id'][0] + \ No newline at end of file diff --git a/Data/MP3D.py b/Data/MP3D.py new file mode 100644 index 0000000..9946892 --- /dev/null +++ b/Data/MP3D.py @@ -0,0 +1,201 @@ +import os +import numpy as np +from torch.utils.data import Dataset +from torchvision.io import read_image +import torch +from scipy.spatial.transform import Rotation as R +from Data.utils import depth2pc + +class MP3D(Dataset): + """Matterport3D Dataset for Neural BKI project + + Access to the processed data, currently the predicted labels are the same as the ground truth + """ + + def __init__(self, + grid_params, + intrinsics, + segmentation_encode = None, + pca_downsample = None, + feature_dir = 'lseg_feature', + directory="/home/jason/kitti", + device='cuda', + num_classes = 42, + latent_size = 512, + down_sample_feature = False, + sequences = [], + subsample_points=0.5, + grid_mask=True, + raw=False, # wether to sotre data in [rgb, depth] or [point cloud, feagures] + ): + if raw and segmentation_encode == None: + raise ValueError("If want to load raw rgb data, must specify the segmentation ecnoding funciton") + + self.raw = raw + self.grid_mask = grid_mask + + self._grid_size = grid_params['grid_size'] + self.grid_dims = np.asarray(self._grid_size) + self._eval_size = list(np.uint32(self._grid_size)) + self.coor_ranges = grid_params['min_bound'] + grid_params['max_bound'] + self.voxel_sizes = [abs(self.coor_ranges[3] - self.coor_ranges[0]) / self._grid_size[0], + abs(self.coor_ranges[4] - self.coor_ranges[1]) / self._grid_size[1], + abs(self.coor_ranges[5] - self.coor_ranges[2]) / self._grid_size[2]] + self.min_bound = np.asarray(self.coor_ranges[:3]) + self.max_bound = np.asarray(self.coor_ranges[3:]) + self.voxel_sizes = np.asarray(self.voxel_sizes) + + self.segmentation_encode = segmentation_encode + 
self.pca_downsample = pca_downsample + + self.feature_dir = feature_dir + self._directory = directory + self.device = device + self.num_classes = num_classes + self.latent_size = latent_size + self.down_sample_feature = down_sample_feature + self.sequences = sequences + self.subsample_points = subsample_points + + if raw: + self._rgb_list = [] + self._depth_list = [] + self._intrinsic_matrix = np.array(intrinsics).reshape(3,3) + else: + self._velodyne_list = [] + self._pred_list = [] # feature/categorical predictions + + self._label_list = [] # categorical semantic labels + self._poses = np.empty((0,7)) + + self._seqs = self.sequences + self._frames_list = [] # store the name of the frames + self._num_frames_per_scene = [] # number of frames per sequence + self._scene_id = [] # repeat the scene id/name for the number of frames of times + self._base_pose0_list = [] # repeat the first base pose of each scene for the number of frames of times + + for seq in self._seqs: + label_dir = os.path.join(self._directory, seq, 'semantic') + label_path_list = sorted(os.listdir(label_dir)) # contain all file path to the frame labels + ##### debug purpose ##### + # label_path_list = label_path_list[50:60] # only take the first ten sequence + ##### debug purpose ##### + frames_list = [os.path.splitext(filename)[0] for filename in label_path_list] # contain all frame number of the sequence eg 000000, 000019, etc. + self._frames_list.extend(frames_list) + + # sequence statistics + self._num_frames_per_scene.append(len(frames_list)) + self._scene_id += [seq] * len(frames_list) + + # categorical label for each point/pixel + self._label_list.extend([os.path.join(label_dir, str(frame).zfill(6)+'.npy') for frame in frames_list]) + + if raw: + rgb_dir = os.path.join(self._directory, seq, 'rgb') + depth_dir = os.path.join(self._directory, seq, 'depth') + self._rgb_list.extend([os.path.join(rgb_dir, str(frame).zfill(6)+'.png') for frame in frames_list]) + self._depth_list.extend([os.path.join(depth_dir, str(frame).zfill(6)+'.npy') for frame in frames_list]) + else: + velodyne_dir = os.path.join(self._directory, seq, 'point_cloud') + preds_dir = os.path.join(self._directory, seq, self.feature_dir) + self._velodyne_list.extend([os.path.join(velodyne_dir, str(frame).zfill(6)+'.npy') for frame in frames_list]) + self._pred_list.extend([os.path.join(preds_dir, str(frame).zfill(6)+'.npy') for frame in frames_list]) + + pose = np.loadtxt(os.path.join(self._directory, seq, 'poses.txt'))[:(len(frames_list))] # xyz + quaternion + self._base_pose0_list += [pose[0]] * len(frames_list) + self._poses = np.vstack((self._poses, pose)) + + def collate_fn(self, data): + points_batch = [bi[0] for bi in data] + label_batch = [bi[1] for bi in data] + gt_label_batch = [bi[2] for bi in data] + return points_batch, label_batch, gt_label_batch + + def get_pose(self, frame_id): + q_pose = self._poses[frame_id,:] + xyz = q_pose[:3] + q = q_pose[3:] # (x, y, z, w) + + ### coordinate transformation matrix, openGL to x forward, z upward ### + pose = np.zeros((4, 4)) + pos_quat_vec = np.array([q[0], q[1], q[2], q[3]]) + pose[:3, :3] = R.from_quat(pos_quat_vec.flatten()).as_matrix() + pose[:3, 3] = xyz.reshape(1,3) + pose[3, 3] = 1 + + CT = np.array([[0,0,-1,0], + [-1,0,0,0], + [0,1,0,0], + [0,0,0,1]]) + + pose = CT @ pose @ np.linalg.inv(CT) + global_pose = pose.astype(np.float32) + ### coordinate transformation matrix, openGL to x forward, z upward ### + + return global_pose + + # Use all frames, if there is no data then zero pad + def 
__len__(self): + return sum(self._num_frames_per_scene) + + def __getitem__(self, idx): + self.get_test_item(idx) + + def get_test_item(self, idx): + ''' + Load one data point of everything + pose: in global (rviz) frame x(forwad), y (left), z (up) + point_cloud: defined in terms of camera position, but switched to rviz coordinate + ''' + scene_name = self._scene_id[idx] + scene_id = scene_name #int(scene_name) # Scene ID + frame_id = int(self._frames_list[idx]) # Frame ID in current scene ID + + pose = self.get_pose(idx) + gt_labels = np.load(self._label_list[idx]).astype(np.uint8).reshape((-1, 1)) + + if self.raw: + img = read_image(self._rgb_list[idx]) #.to(torch.float) / 255 + img = img.permute(1,2,0) + pred_labels = self.segmentation_encode(img) #self.segmentation_model.encoding_feature(img) + pred_labels = pred_labels.detach().cpu().numpy() + with open(self._depth_list[idx], "rb") as f: + depth_map = np.load(f) + # depth_map = np.load(self._depth_list[idx]) + points, _ = depth2pc(depth_map, intr_mat=self._intrinsic_matrix, min_depth=0.1, max_depth=6) + else: + points = np.load(self._velodyne_list[idx]).astype(np.float32).reshape(-1,3)[:, :3] + pred_labels = np.load(self._pred_list[idx]).astype(np.float32) + + pred_labels = pred_labels.squeeze(0).transpose(1,2,0) + pred_labels = pred_labels.reshape(-1,pred_labels.shape[-1]) + + # change points coordinate from camera to rviz + # camera x (right), y (down), z (forward) + # rviz x (forward), y (left), z (up) + points_x = points[:,0] + points_y = points[:,1] + points_z = points[:,2] + points = np.vstack((points_z, -points_x, -points_y)).T + + ### TODO: filter point outside the grid ### + if self.grid_mask: + grid_point_mask = np.all( (points < self.max_bound) & (points >= self.min_bound), axis=1) + points = points[grid_point_mask] + gt_labels = gt_labels[grid_point_mask] + pred_labels = pred_labels[grid_point_mask] + + # filter out points that's too close to camera + close_point_mask = points[:,0] > 0.2 + points = points[close_point_mask] + gt_labels = gt_labels[close_point_mask] + pred_labels = pred_labels[close_point_mask] + ### TODO: filter point outside the grid ### + + if self.down_sample_feature: + pred_labels = torch.from_numpy(pred_labels) + pred_labels = self.pca_downsample(pred_labels) + pred_labels = pred_labels.detach().cpu().numpy().reshape(-1, self.latent_size) + pred_labels = pred_labels.reshape(-1, self.latent_size) # dummy way + + return torch.from_numpy(pose), torch.from_numpy(points), torch.from_numpy(pred_labels), torch.from_numpy(gt_labels), scene_id, frame_id diff --git a/Data/RealWorldData.py b/Data/RealWorldData.py new file mode 100644 index 0000000..7a57add --- /dev/null +++ b/Data/RealWorldData.py @@ -0,0 +1,220 @@ +import os +import torch +from torch.utils.data import Dataset +from torchvision.io import read_image +import numpy as np +from glob import glob +from scipy.spatial.transform import Rotation as R +from plyfile import PlyData + +class RealWorldData(Dataset): + """Matterport3D Dataset for Neural BKI project + + Access to the processed data, currently the predicted labels are the same as the ground truth + """ + + def __init__(self, + grid_params, + intrinsics, + segmentation_encode = None, + pca_downsample = None, + feature_dir = 'lseg_feature', + directory="/home/jason/kitti", + device='cuda', + latent_size = 512, + down_sample_feature = False, + sequences = [], + subsample_points=0.5, + ): + + self._grid_size = grid_params['grid_size'] + self.coor_ranges = grid_params['min_bound'] + 
grid_params['max_bound'] + self.voxel_sizes = [abs(self.coor_ranges[3] - self.coor_ranges[0]) / self._grid_size[0], + abs(self.coor_ranges[4] - self.coor_ranges[1]) / self._grid_size[1], + abs(self.coor_ranges[5] - self.coor_ranges[2]) / self._grid_size[2]] + self.min_bound = np.asarray(self.coor_ranges[:3]) + self.max_bound = np.asarray(self.coor_ranges[3:]) + self.voxel_sizes = np.asarray(self.voxel_sizes) + + # segmentation module + self.segmentation_encode = segmentation_encode + self.pca_downsample = pca_downsample + + self.feature_dir = feature_dir + self._directory = directory + self.device = device + self.latent_size = latent_size + self.down_sample_feature = down_sample_feature + self.subsample_points = subsample_points + + # if raw: + self._rgb_list = [] + self._depth_list = [] + self._conf_list = [] + # TODO: Change according to the output from Record3D software + self._intrinsic_matrix = np.array(intrinsics).reshape(3,3) + # else: + self._velodyne_list = [] + self._pred_list = [] # feature/categorical predictions + self._poses = np.empty((0,7)) + + self._seqs = sequences + self._frames_list = [] # store the name of the frames + self._num_frames_per_scene = [] # number of frames per sequence + self._scene_id = [] # repeat the scene id/name for the number of frames of times + + for seq in self._seqs: + # each sequence must have rgb data + rgb_dir = os.path.join(self._directory, seq, 'rgb') + rgb_list = glob(os.path.join(rgb_dir, "*")) + rgb_list.sort() + frames_list = [os.path.basename(path).split('.')[0] for path in rgb_list] # contain all frame number of the sequence eg 000000, 000019, etc. + self._frames_list.extend(frames_list) + + # sequence statistics + self._num_frames_per_scene.append(len(frames_list)) + self._scene_id += [seq] * len(frames_list) + + # point cloud + velodyne_dir = os.path.join(self._directory, seq, 'point_cloud') + if os.path.exists(velodyne_dir): + velodyne_list = glob(os.path.join(velodyne_dir, "*")) + velodyne_list.sort() + self._velodyne_list.extend(velodyne_list) + else: + depth_dir = os.path.join(self._directory, seq, 'depth') + self._depth_list.extend([os.path.join(depth_dir, str(frame).zfill(6)+'.npy') for frame in frames_list]) + + # depth confident + conf_dir = os.path.join(self._directory, seq, 'conf') + self._conf_list.extend([os.path.join(conf_dir, str(frame).zfill(6)+'.npy') for frame in frames_list]) + + # per pixel feature + preds_dir = os.path.join(self._directory, seq, self.feature_dir) + if os.path.exists(preds_dir): + pred_list = glob(os.path.join(preds_dir,"*")) + pred_list.sort() + self._pred_list.extend(pred_list) + else: + self._rgb_list.extend(rgb_list) + + pose = np.loadtxt(os.path.join(self._directory, seq, 'poses.txt'))[:(len(frames_list))] # xyz + quaternion + self._poses = np.vstack((self._poses, pose)) + + def collate_fn(self, data): + points_batch = [bi[0] for bi in data] + label_batch = [bi[1] for bi in data] + gt_label_batch = [bi[2] for bi in data] + return points_batch, label_batch, gt_label_batch + + def get_pose(self, frame_id): + q_pose = self._poses[frame_id,:] + xyz = q_pose[:3] + q = q_pose[3:] # (x, y, z, w) + + pos_quat_vec = np.array([q[1], q[2], q[3], q[0]]) + rot = R.from_quat(pos_quat_vec.flatten()).as_matrix() + + pose = np.zeros((4,4)) + pose[3,3] = 1 + pose[:3,:3] = rot + pose[:3,3] = xyz.reshape(-1) + + CT = np.array([[0,0,-1,0], + [-1,0,0,0], + [0,1,0,0], + [0,0,0,1]]) + + pose = CT @ pose @ np.linalg.inv(CT) + global_pose = pose.astype(np.float32) + + return global_pose + + def 
create_point_cloud_depth(self, depth, conf, intrinsics): + fx, fy, cx, cy = intrinsics[0,0], intrinsics[1,1], intrinsics[0,2], intrinsics[1,2] + depth_shape = depth.shape + [x_d, y_d] = np.meshgrid(range(0, depth_shape[1]), range(0, depth_shape[0])) + x3 = np.divide(np.multiply((x_d-cx), depth), fx) + y3 = np.divide(np.multiply((y_d-cy), depth), fy) + z3 = depth + + coord = np.stack((x3, y3, z3), axis=2) + + valid_depth = ~np.isnan(depth.reshape(-1,)) + conf_mask = conf.reshape(-1) >= 2 + valid_depth = np.logical_and(valid_depth, conf_mask) + + return coord.reshape(-1,3)[valid_depth], valid_depth + + # Use all frames, if there is no data then zero pad + def __len__(self): + return sum(self._num_frames_per_scene) + + def __getitem__(self, idx): + return self.get_test_item(idx) + + def get_test_item(self, idx): + ''' + Load one data point of everything + pose: in global (rviz) frame x(forwad), y (left), z (up) + point_cloud: defined in terms of camera position, but switched to rviz coordinate + ''' + scene_name = self._scene_id[idx] + scene_id = scene_name #int(scene_name) # Scene ID + frame_id = int(self._frames_list[idx]) # Frame ID in current scene ID + + pose = self.get_pose(idx) + gt_labels = np.empty(0) + + if len(self._velodyne_list) != 0: + # read ply file + plydata = PlyData.read(self._velodyne_list[idx]) + points = np.stack((np.asarray(plydata.elements[0]["x"]), + np.asarray(plydata.elements[0]["y"]), + np.asarray(plydata.elements[0]["z"])), axis=1).astype(np.float32) + else: + with open(self._depth_list[idx], "rb") as f: + depth_map = np.load(f) + conf_map = np.load(self._conf_list[idx]) + points, valid_depth = self.create_point_cloud_depth(depth_map, conf_map, self._intrinsic_matrix) + + if len(self._pred_list) != 0: + pred_labels = np.load(self._pred_list[idx]).astype(np.float32) + else: + img = read_image(self._rgb_list[idx]) + img = img[:3] # images may have alpha channel + img = img.permute(1,2,0) + pred_labels = self.segmentation_encode(img) + pred_labels = pred_labels.detach().cpu().numpy() + + pred_labels = pred_labels.squeeze(0).transpose(1,2,0) + pred_labels = pred_labels.reshape(-1,pred_labels.shape[-1]) + pred_labels = pred_labels[valid_depth] + + # change points coordinate from camera to rviz + # camera x (right), y (down), z (forward) + # mp3d global pose x (right), y(up), z(back) + # rviz x (forward), y (left), z (up) + ### TODO: Debug purpose commet out ### + points_x = points[:,0] + points_y = points[:,1] + points_z = points[:,2] + points = np.vstack((points_z, -points_x, -points_y)).T + + ### filter point outside the grid ### + grid_point_mask = np.all( (points < self.max_bound) & (points >= self.min_bound), axis=1) + points = points[grid_point_mask] + pred_labels = pred_labels[grid_point_mask] + + # filter out points that's too close to camera + close_point_mask = points[:,0] > 0.1 # z forward + points = points[close_point_mask] + pred_labels = pred_labels[close_point_mask] + + if self.down_sample_feature: + pred_labels = torch.from_numpy(pred_labels) + pred_labels = self.pca_downsample(pred_labels) + pred_labels = pred_labels.detach().cpu().numpy().reshape(-1, self.latent_size) + pred_labels = pred_labels.reshape(-1, self.latent_size) # dummy way + + return torch.from_numpy(pose), torch.from_numpy(points), torch.from_numpy(pred_labels), torch.from_numpy(gt_labels), scene_id, frame_id \ No newline at end of file diff --git a/Data/generate_64_pred.py b/Data/generate_64_pred.py new file mode 100644 index 0000000..e0af384 --- /dev/null +++ 
b/Data/generate_64_pred.py @@ -0,0 +1,61 @@ +# %% +import sys +import os +import clip +from pathlib import Path +sys.path.append('/workspace/LatentBKI') +import numpy as np +import torch +from tqdm import tqdm +from glob import glob +from Models.Lseg.Lseg_module import Lseg_module +from torchvision.io import read_image + +# %% +PCA_PATH = '/mp3d/PCA/64/ipca_7.pkl' +seg_module = Lseg_module(pca_path=PCA_PATH) + +data_path = '/mp3d/vlmaps_data_dir/vlmaps_dataset/gTV8FGcVJC9_1' +pred_64_save = os.path.join(data_path, 'lseg_pred_64_new') +Path(pred_64_save).mkdir(parents=True, exist_ok=True) + +# %% +# category_clip = torch.tensor(np.load("/workspace/LatentBKI/Data/category_vlmap_features.npy"),device='cuda').to(torch.float32) + +CATEGORY = np.loadtxt("/workspace/LatentBKI/Data/category_vlmap.txt", dtype='str') +device = "cuda" if torch.cuda.is_available() else "cpu" +clip_model, preprocess = clip.load("ViT-B/32", device=device) +text = clip.tokenize(CATEGORY).to(device) +with torch.no_grad(): + text_features = clip_model.encode_text(text) + text_features = text_features.to(torch.float32) + CATEGORY_CLIP = text_features / text_features.norm(dim=-1, keepdim=True) + +# %% +img_path_list = glob(os.path.join(data_path,'rgb/*.png')) +img_path_list.sort() +print(img_path_list) +print(len(img_path_list)) + +# %% +for i, img_path in tqdm(enumerate(img_path_list), total=len(img_path_list)): + img = read_image(img_path) #.to(torch.float) / 255 + img = img[:3] + img = img.permute(1,2,0) + # encode image + clip_features = seg_module.encoding_feature(img) + clip_features = clip_features.squeeze(0).permute(1,2,0) + clip_features = clip_features.reshape(-1,512) + # project to 64 and back to 512 + clip_features_64 = seg_module.down_sampling(clip_features) + clip_featreu_bp = seg_module.backproject_to_clip(clip_features_64) + # to category + lseg_logits_64 = seg_module.decoding_feature(clip_featreu_bp, CATEGORY_CLIP) + lseg_logits_64 = lseg_logits_64.softmax(dim=-1) # convert to probability + lseg_pred_64 = lseg_logits_64.argmax(dim=-1) + lseg_pred_64 = lseg_pred_64.reshape(720,1080).cpu().numpy() + # save image + save_path = os.path.join(pred_64_save, '%06i.npy' % i) + np.save(save_path, lseg_pred_64) + + diff --git a/Data/select_r3d_frames.py b/Data/select_r3d_frames.py new file mode 100644 index 0000000..68e1e23 --- /dev/null +++ b/Data/select_r3d_frames.py @@ -0,0 +1,89 @@ +import os +import sys +import yaml +import numpy as np +import cv2 +import liblzfse +import shutil +from glob import glob +from pathlib import Path +from tqdm import tqdm + +def load_depth(filepath): + with open(filepath, 'rb') as depth_fh: + raw_bytes = depth_fh.read() + decompressed_bytes = liblzfse.decompress(raw_bytes) + depth_img = np.frombuffer(decompressed_bytes, dtype=np.float32) + depth_img = depth_img.reshape((192, 256)) + return depth_img + +def load_conf(filepath): + with open(filepath, 'rb') as depth_fh: + raw_bytes = depth_fh.read() + decompressed_bytes = liblzfse.decompress(raw_bytes) + conf = np.frombuffer(decompressed_bytes, dtype=np.int8) + conf = conf.reshape((192, 256)) + return np.float32(conf) + +def main(args): + # set data directory + data_dir = "/Users/multyxu/Desktop/Programming/LatentBKI data/record3d/my_house_long" + output_dir = os.path.join('/Users/multyxu/Desktop/Programming/LatentBKI data/', os.path.basename(data_dir)) + + # create directoris + rgb_dir = os.path.join(output_dir, "rgb") + dpeth_dir = os.path.join(output_dir, "depth") + conf_dir = os.path.join(output_dir, "conf") + 
Path(output_dir).mkdir(parents=True, exist_ok=True) + Path(rgb_dir).mkdir(parents=True, exist_ok=True) + Path(dpeth_dir).mkdir(parents=True, exist_ok=True) + Path(conf_dir).mkdir(parents=True, exist_ok=True) + + # read meata data: camera intrinsics and pose + metadata_path = os.path.join(data_dir, 'metadata') + metadata = yaml.safe_load(open(metadata_path, 'r')) + K = np.asarray(metadata['K']).reshape(3,3).T + poses = np.array(metadata['poses']) + q_list = np.hstack((poses[:, 3].reshape(-1,1), poses[:, :3])) # (w, x, y, z) + xyz_list = poses[:, 4:] + + # write intrinsics to file + np.savetxt(os.path.join(output_dir, 'intrinsics.txt'), K) + + # count total number of frames + num_frames = len(glob(os.path.join(data_dir,'rgbd/*.depth' ))) + print("Num Frames:", num_frames) + + pose_list = [] + frame_count = 0 + # fps is very high, take images every 5 frames, making it 15fps + for i in tqdm(range(0, num_frames, 5)): + depth_filepath = os.path.join(data_dir, f'rgbd/{i}.depth') + rgb_filepath = os.path.join(data_dir, f'rgbd/{i}.jpg') + conf_filepath = os.path.join(data_dir, f'rgbd/{i}.conf') + + depth_img = load_depth(str(depth_filepath)) + conf = load_conf(str(conf_filepath)) + + depth_resized = cv2.resize(depth_img, (960, 720)) + conf_resized = cv2.resize(conf, (960, 720), cv2.INTER_NEAREST_EXACT) + + # save to file + rgb_output = os.path.join(rgb_dir, '%06i.jpg' % frame_count) + depth_output = os.path.join(dpeth_dir, '%06i.npy' % frame_count) + conf_output = os.path.join(conf_dir, '%06i.npy' % frame_count) + + shutil.copyfile(rgb_filepath, rgb_output) + np.save(depth_output, depth_resized) + np.save(conf_output, conf_resized) + + # append OpenGL camera pose + pose_list.append(np.hstack((xyz_list[i], q_list[i])).tolist()) + + frame_count += 1 + + # save poses + np.savetxt(os.path.join(output_dir, "poses.txt"), np.array(pose_list)) + +if __name__ == '__main__': + main(sys.argv) \ No newline at end of file diff --git a/Data/utils.py b/Data/utils.py new file mode 100644 index 0000000..5c5549e --- /dev/null +++ b/Data/utils.py @@ -0,0 +1,295 @@ +import os +import pdb +from matplotlib import markers +# import rospy # comment out for initial trial becuase no ros in container +import numpy as np +import time +import os +import pdb +import torch +# comment out for initial trial becuase no ros in container +# from visualization_msgs.msg import * +# from geometry_msgs.msg import Point32 +# from std_msgs.msg import ColorRGBA +from scipy.spatial.transform import Rotation as R + + +def get_sim_cam_mat_with_fov(h, w, fov): + cam_mat = np.eye(3) + cam_mat[0, 0] = cam_mat[1, 1] = w / (2.0 * np.tan(np.deg2rad(fov / 2))) + cam_mat[0, 2] = w / 2.0 + cam_mat[1, 2] = h / 2.0 + return cam_mat + +def depth2pc(depth, fov=90, intr_mat=None, min_depth=0.1, max_depth=10): + """ + Return Nx3 array and the mask of valid points in [min_depth, max_depth]. 
+ """ + + h, w = depth.shape + + cam_mat = intr_mat + if intr_mat is None: + cam_mat = get_sim_cam_mat_with_fov(h, w, fov) + # cam_mat[:2, 2] = 0 + cam_mat_inv = np.linalg.inv(cam_mat) + + y, x = np.meshgrid(np.arange(h), np.arange(w), indexing="ij") + x = x.reshape((1, -1))[:, :] + 0.5 + y = y.reshape((1, -1))[:, :] + 0.5 + z = depth.reshape((1, -1))[:, :] + + p_2d = np.vstack([x, y, np.ones_like(x)]) + pc = cam_mat_inv @ p_2d + pc = pc * z + mask = pc[2, :] > min_depth + + mask = np.logical_and(mask, pc[2, :] < max_depth) + # pc = pc[:, mask] + pc = pc.T.astype(np.float32) + mask = mask.T + return pc, mask + +# def qvec2rotmat(qvec): +# # input (x, y, z, w) +# w = qvec[3] +# qvec[1:] = qvec[:3] +# qvec[0] = w +# # (w, x, y, z) +# return np.array( +# [ +# [ +# 1 - 2 * qvec[2] ** 2 - 2 * qvec[3] ** 2, +# 2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3], +# 2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2], +# ], +# [ +# 2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3], +# 1 - 2 * qvec[1] ** 2 - 2 * qvec[3] ** 2, +# 2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1], +# ], +# [ +# 2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2], +# 2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1], +# 1 - 2 * qvec[1] ** 2 - 2 * qvec[2] ** 2, +# ], +# ] +# ) + +def rot2eul(R): + beta = -np.arcsin(R[2,0]) + alpha = np.arctan2(R[2,1]/np.cos(beta),R[2,2]/np.cos(beta)) + gamma = np.arctan2(R[1,0]/np.cos(beta),R[0,0]/np.cos(beta)) + return np.array((alpha, beta, gamma)) + +# Intersection, union for one frame +def iou_one_frame(pred, target, n_classes=21): + pred = pred.reshape(-1) + target = target.reshape(-1) + intersection = np.zeros(n_classes) + union = np.zeros(n_classes) + + for cls in range(n_classes): + pred_inds = pred == cls + target_inds = target == cls + + intersection[cls] = (pred_inds[target_inds]).long().sum().item() # Cast to long to prevent overflows + union[cls] = pred_inds.long().sum().item() + target_inds.long().sum().item() - intersection[cls] + return intersection, union + +def points_to_voxels_torch(voxel_grid, points, min_bound, grid_dims, voxel_sizes): + voxels = torch.floor((points - min_bound) / voxel_sizes).to(dtype=torch.int) + # Clamp to account for any floating point errors + maxes = (grid_dims - 1).reshape(1, 3) + mins = torch.zeros_like(maxes) + voxels = torch.clip(voxels, mins, maxes).to(dtype=torch.long) + + voxel_grid = voxel_grid[voxels[:, 0], voxels[:, 1], voxels[:, 2]] + return voxel_grid + + +# Remap colors to np array 0 to 1 +def remap_colors(colors): + # color + colors_temp = np.zeros((len(colors), 3)) + for i in range(len(colors)): + colors_temp[i, :] = colors[i] + colors = colors_temp.astype("int") + colors = colors / 255.0 + return colors + + +def publish_voxels(map_object, min_dim, max_dim, grid_dims, colors, next_map): + next_map.markers.clear() + marker = Marker() + marker.id = 0 + marker.ns = "Global_Semantic_Map" + marker.header.frame_id = "map" # change this to match model + scene name LMSC_000001 + marker.type = marker.CUBE_LIST + marker.action = marker.ADD + marker.lifetime.secs = 0 + marker.header.stamp = rospy.Time.now() + + marker.pose.orientation.x = 0.0 + marker.pose.orientation.y = 0.0 + marker.pose.orientation.z = 0.0 + marker.pose.orientation.w = 1 + + marker.scale.x = (max_dim[0] - min_dim[0]) / grid_dims[0] + marker.scale.y = (max_dim[1] - min_dim[1]) / grid_dims[1] + marker.scale.z = (max_dim[2] - min_dim[2]) / grid_dims[2] + + semantic_labels = map_object.global_map[:,3:] + centroids = map_object.global_map[:, :3] + + # Threshold here + total_probs = 
np.sum(semantic_labels, axis=-1, keepdims=False) + not_prior = total_probs > 1 + semantic_labels = semantic_labels[not_prior, :] + centroids = centroids[not_prior, :] + + semantic_labels = np.argmax(semantic_labels, axis=-1) + semantic_labels = semantic_labels.reshape(-1, 1) + + for i in range(semantic_labels.shape[0]): + pred = semantic_labels[i] + point = Point32() + color = ColorRGBA() + point.x = centroids[i, 0] + point.y = centroids[i, 1] + point.z = centroids[i, 2] + color.r, color.g, color.b = colors[pred].squeeze() + + color.a = 1.0 + marker.points.append(point) + marker.colors.append(color) + + next_map.markers.append(marker) + return next_map + + +def publish_local_map(labeled_grid, centroids, grid_params, colors, next_map): + max_dim = grid_params["max_bound"] + min_dim = grid_params["min_bound"] + grid_dims = grid_params["grid_size"] + + next_map.markers.clear() + marker = Marker() + marker.id = 0 + marker.ns = "Local Semantic Map" + marker.header.frame_id = "map" + marker.type = marker.CUBE_LIST + marker.action = marker.ADD + marker.lifetime.secs = 0 + marker.header.stamp = rospy.Time.now() + + marker.pose.orientation.x = 0.0 + marker.pose.orientation.y = 0.0 + marker.pose.orientation.z = 0.0 + marker.pose.orientation.w = 1 + + marker.scale.x = (max_dim[0] - min_dim[0]) / grid_dims[0] + marker.scale.y = (max_dim[1] - min_dim[1]) / grid_dims[1] + marker.scale.z = (max_dim[2] - min_dim[2]) / grid_dims[2] + + X, Y, Z, C = labeled_grid.shape + semantic_labels = labeled_grid.view(-1, C).detach().cpu().numpy() + centroids = centroids.detach().cpu().numpy() + + semantic_sums = np.sum(semantic_labels, axis=-1, keepdims=False) + valid_mask = semantic_sums >= 1 + + semantic_labels = semantic_labels[valid_mask, :] + centroids = centroids[valid_mask, :] + + semantic_labels = np.argmax(semantic_labels / np.sum(semantic_labels, axis=-1, keepdims=True), axis=-1) + semantic_labels = semantic_labels.reshape(-1, 1) + + for i in range(semantic_labels.shape[0]): + pred = semantic_labels[i] + point = Point32() + color = ColorRGBA() + point.x = centroids[i, 0] + point.y = centroids[i, 1] + point.z = centroids[i, 2] + color.r, color.g, color.b = colors[pred].squeeze() + + color.a = 1.0 + marker.points.append(point) + marker.colors.append(color) + + next_map.markers.append(marker) + return next_map + +#################################################################### + +def base_pos2grid_id_3d(x_base, y_base, z_base): + gs = 1000 + cs = 0.05 + row = int(gs / 2 - int(x_base / cs)) + col = int(gs / 2 - int(y_base / cs)) + h = int(z_base / cs) + return [row, col, h] + +def out_of_range(row: int, col: int, height: int) -> bool: + gs = 1000 + camera_height = 1.5 + cs = 0.05 + vh = int(camera_height / cs) + return col >= gs or row >= gs or height >= vh or col < 0 or row < 0 or height < 0 + + +def transform_pc(pc, pose): + """ + pose: the pose of the camera coordinate where the pc is in + pc: (3, N) + """ + # pose_inv = np.linalg.inv(pose) + + pc_homo = np.vstack([pc, np.ones((1, pc.shape[1]))]) + + pc_global_homo = pose @ pc_homo + + return pc_global_homo[:3, :] + + + +def cvt_pose_vec2tf(pos_quat_vec: np.ndarray) -> np.ndarray: + """ + pos_quat_vec: (px, py, pz, qx, qy, qz, qw) + """ + pose_tf = np.eye(4) + pose_tf[:3, 3] = pos_quat_vec[:3].flatten() + rot = R.from_quat(pos_quat_vec[3:].flatten()) + pose_tf[:3, :3] = rot.as_matrix() + return pose_tf + + +def get_pc_transform(base_posevec, base_pose0): + camera_height = 1.5 + base2cam_rot = [1, 0, 0, 0, -1, 0, 0, 0, -1] + base_forward_axis = [0, 0, -1] + 
base_left_axis = [-1, 0, 0] + base_up_axis = [0, 1, 0] + + base2cam_tf = np.eye(4) + base2cam_tf[:3, :3] = np.array([base2cam_rot]).reshape((3, 3)) + base2cam_tf[1, 3] = camera_height + # transform the base coordinate such that x is forward, y is leftward, z is upward + base_transform = np.eye(4) + base_transform[0, :3] = base_forward_axis + base_transform[1, :3] = base_left_axis + base_transform[2, :3] = base_up_axis + + init_base_tf = ( + base_transform @ cvt_pose_vec2tf(base_pose0) @ np.linalg.inv(base_transform) + ) + inv_init_base_tf = np.linalg.inv(init_base_tf) + habitat_base_pose = cvt_pose_vec2tf(base_posevec) + base_pose = base_transform @ habitat_base_pose @ np.linalg.inv(base_transform) + tf = inv_init_base_tf @ base_pose + pc_transform = tf @ base_transform @ base2cam_tf + return pc_transform + + +#################################################################### diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f273fb5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 UMich-CURLY + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
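The Models/LatentBKI.py module added below keeps a per-voxel latent mean, variance, and confidence, and folds each new point into its neighboring voxels with a weight given by a sparse kernel of the point-to-centroid distance. A minimal single-voxel sketch of that update, following the same formulas as sparse_kernel and latent_map_update in the patch (the distances, the 4-dimensional features, and ell = 0.5 are illustrative values, not taken from the data):

import torch

def sparse_kernel(d, ell=0.5, sigma=1.0):
    # k(d) = sigma * [(1/3)(2 + cos(2*pi*d/ell))(1 - d/ell) + sin(2*pi*d/ell)/(2*pi)], zero for d >= ell
    k = sigma * ((1.0 / 3) * (2 + torch.cos(2 * torch.pi * d / ell)) * (1 - d / ell)
                 + torch.sin(2 * torch.pi * d / ell) / (2 * torch.pi))
    k[d >= ell] = 0
    return torch.clamp(k, min=0.0, max=1.0)

d = torch.tensor([[0.1], [0.3], [0.6]])      # distances from three observed points to one voxel centroid
y = torch.randn(3, 4)                        # their latent features (latent_dim = 4 here for brevity)
k = sparse_kernel(d)                         # per-point kernel weights; the 0.6 m point falls outside ell
k_bar = k.sum()                              # total kernel mass added to this voxel in the current frame
y_bar = (k * y).sum(dim=0) / (k_bar + 1e-6)  # kernel-weighted feature average

mu, lam = torch.zeros(4), torch.tensor(0.0)             # prior mean and confidence of the voxel
mu = (lam * mu + k_bar * y_bar) / (lam + k_bar + 1e-6)  # posterior mean, as in latent_map_update
lam = lam + k_bar                                       # posterior confidence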
diff --git a/Models/.DS_Store b/Models/.DS_Store new file mode 100644 index 0000000..f51ac5f Binary files /dev/null and b/Models/.DS_Store differ diff --git a/Models/LatentBKI.py b/Models/LatentBKI.py new file mode 100755 index 0000000..b28a975 --- /dev/null +++ b/Models/LatentBKI.py @@ -0,0 +1,252 @@ +import torch +torch.backends.cudnn.deterministic = True + +class LatentBKI(torch.nn.Module): + def __init__(self, grid_size, min_bound, max_bound, filter_size=3, + num_classes=42, latent_dim=512, device="cpu", datatype=torch.float32, + max_dist=0.5, kernel="sparse", pseduo_discrete = True): + ''' + Input: + grid_size: (x, y, z) int32 array, number of voxels + min_bound: (x, y, z) float32 array, lower bound on local map + max_bound: (x, y, z) float32 array, upper bound on local map + filter_size: int, dimension of the kernel on each axis (must be odd) + num_classes: int, number of classes + prior: float32, value of prior in map + device: cpu or gpu + max_dist: size of the kernel ell parameter + kernel: kernel to choose + per_class: whether to learn a different kernel for each class + ''' + super().__init__() + self.min_bound = min_bound.view(-1, 3).to(device) + self.max_bound = max_bound.view(-1, 3).to(device) + self.grid_size = grid_size + self.dtype = datatype + + self.kernel = kernel + self.device = device + self.num_classes = num_classes + self.latent_dim = latent_dim + self.pseduo_discrete = pseduo_discrete + + self.voxel_sizes = (self.max_bound.view(-1) - self.min_bound.view(-1)) / self.grid_size.to(self.device) + + self.pi = torch.acos(torch.zeros(1)).item() * 2 + self.ell = torch.tensor(max_dist, dtype=self.dtype, device=self.device, requires_grad=False) + self.sigma = torch.tensor(1.0, device=self.device) # Kernel must map to 0 to 1 + self.filter_size = torch.tensor(filter_size, dtype=torch.long, requires_grad=False, device=self.device) + + self.initialize_kernel() + + [xs, ys, zs] = [(max_bound[i]-min_bound[i])/(2*grid_size[i]) + + torch.linspace(min_bound[i], max_bound[i], device=device, steps=grid_size[i]+1)[:-1] + for i in range(3)] + + self.centroids = torch.cartesian_prod(xs, ys, zs).to(device) + + def initialize_kernel(self): + # Initialize with sparse kernel + assert(self.filter_size % 2 == 1) + + # Distances + middle_ind = torch.floor(self.filter_size / 2) + self.kernel_dists = torch.zeros([1, 1, self.filter_size, self.filter_size, self.filter_size], + device=self.device) + for x_ind in range(self.filter_size): + for y_ind in range(self.filter_size): + for z_ind in range(self.filter_size): + x_dist = torch.abs(x_ind - middle_ind) * self.voxel_sizes[0] + y_dist = torch.abs(y_ind - middle_ind) * self.voxel_sizes[1] + z_dist = torch.abs(z_ind - middle_ind) * self.voxel_sizes[2] + total_dist = torch.sqrt(x_dist ** 2 + y_dist ** 2 + z_dist ** 2) + self.kernel_dists[0, 0, x_ind, y_ind, z_ind] = total_dist + + def sparse_kernel(self, d, ell, sigma): + kernel_val = sigma * ((1.0/3)*(2 + torch.cos(2 * self.pi * d/ell))*(1 - d/ell) + + 1.0/(2*self.pi) * torch.sin(2 * self.pi * d / ell)) + kernel_val[d >= ell] = 0 + return torch.clamp(kernel_val, min=0.0, max=1.0) + + def calculate_kernel_from_distance_vector(self, distance_vector): + ''' + Input: + distance_vector: (n, 3) + semantic_class: (n, 1) + Output: + kernel_val: (n, features_size) + ''' + kernel_val = None + ds = torch.norm(distance_vector, dim=-1, keepdim=True) + if self.kernel == "sparse": + kernel_val = self.sparse_kernel(ds, self.ell, self.sigma) + return kernel_val + + def initialize_grid(self): + mean_map = 
torch.zeros(self.grid_size[0], self.grid_size[1], self.grid_size[2], + self.latent_dim, device=self.device, requires_grad=False, + dtype=self.dtype) + variance_map = torch.ones(self.grid_size[0], self.grid_size[1], self.grid_size[2], + self.latent_dim, device=self.device, requires_grad=False, + dtype=self.dtype) + + confidence_map = torch.zeros(self.grid_size[0], self.grid_size[1], self.grid_size[2], + 1, device=self.device, requires_grad=False, + dtype=self.dtype) + + return (mean_map, variance_map, confidence_map) + + def grid_ind(self, input_pc, min_bound=None, max_bound=None): + ''' + Input: + input_xyz: N * (x, y, z, c) float32 array, point cloud + Output: + grid_inds: N' * (x, y, z, c) int32 array, point cloud mapped to voxels + ''' + if min_bound is None: + min_bound = self.min_bound + if max_bound is None: + max_bound = self.max_bound + input_xyz = input_pc[:, :3] + labels = input_pc[:, 3:] + + grid_inds = torch.floor((input_xyz - min_bound) / self.voxel_sizes) + return torch.hstack((grid_inds, labels)) + + def grid_to_continuous(self, grid_point, min_bound=None): + ''' + Input: + grid_inds: N' * (x, y, z, c) int32 array, point cloud mapped to voxels + Output: + grid_metric: N * (x, y, z, c) float32 array, grid point in metric space (same as point cloud) + ''' + if min_bound == None: + min_bound = self.min_bound + + grid_inds = grid_point[..., :3] + labels = grid_point[..., 3:] + + grid_xyz = grid_inds * self.voxel_sizes + min_bound # TODO: Check the math is correct + + return torch.cat((grid_xyz, labels),dim=-1) + + def calculate_kernel(self, i=0): + kernel_val = None + kernel_val = self.sparse_kernel(self.kernel_dists, self.ell, self.sigma) + return kernel_val + + def construct_neighbor_grid(self, grid_point, store_coresponding_index=False): + ''' + Input: + gird_point: (N, 3+feautre) int32 tensor, store the indicies of the grids + Output: + neighbor_grid_point: (N, filter_size^3, 3+feature+index), first elements cosresponding to all nigherbors of the first elemnts in the grid_points that recive the feature update + ''' + # construct offsets + offsets_range = torch.tensor(range(int(-(self.filter_size-1)/2), int((self.filter_size+1)/2)), device=self.device) + offsets = torch.cartesian_prod(offsets_range, offsets_range, offsets_range) # TODO: redundent memory + + # append the coresponding index at the end of each grid point, so after boradcasting, the neigbors will all contain the indices of the original grid point that they are centered on + if store_coresponding_index: + indicies = torch.arange(0, grid_point.shape[0], dtype=grid_point.dtype, device=self.device).reshape(-1,1) + grid_point = torch.hstack((grid_point, indicies)) + + # make the offsets the same shape as grid_point + pad_offsets = torch.zeros(offsets.shape[0], grid_point.shape[-1], device=self.device) + pad_offsets[:,:3] = offsets + + # construct neighbor grids + neighbor_grid_point = grid_point.reshape(-1,1,grid_point.shape[-1]) + neighbor_grid_point = neighbor_grid_point + pad_offsets # (number of points, number of neighbors, grid_indcies & feature) + + return neighbor_grid_point + + def latent_map_update(self, current_map, point_cloud, max_bound=None, min_bound=None): + # mean_map, mean_confidence_map, variance_map, variance_confidence_map = current_map[0], current_map[1], current_map[2], current_map[3] + mean_map, variance_map, confidence_map = current_map[0], current_map[1], current_map[2] + + if min_bound is None: + min_bound = self.min_bound + if max_bound is None: + max_bound = self.max_bound + + grid_pc = 
self.grid_ind(point_cloud, min_bound=min_bound, max_bound=max_bound) + + # construct neighbor grids + neighbor_grid_pc = self.construct_neighbor_grid(grid_pc) # (N, f^3, 3+feature) + + # construct valid mask + valid_input_mask = torch.all((neighbor_grid_pc[:,:,:3] < self.grid_size) & (neighbor_grid_pc[:,:,:3] >= torch.tensor([0,0,0], device=self.device)), axis=-1) + + # turn index into position in metric space + neighbor_grid_metric = self.grid_to_continuous(neighbor_grid_pc, min_bound) + + # construct distance vector, the last dimension now contain a vector from a point to its neighbor grid centroid + if self.pseduo_discrete: + # NOTE: TEST substitute point_cloud with grid_pc + neighbor_grid_metric[:,:,:3] = neighbor_grid_metric[:,:,:3] - self.grid_to_continuous(grid_pc, min_bound).unsqueeze(1)[:,:,:3] + else: + neighbor_grid_metric[:,:,:3] = neighbor_grid_metric[:,:,:3] - point_cloud.unsqueeze(1)[:,:,:3] # (n, f^3, xyz + feature) + + # select the valid grids + neighbor_grid_pc = neighbor_grid_pc[valid_input_mask] + valid_neighbor_grid_indices = [*neighbor_grid_pc[:,:3].T.to(torch.long)] # (3+feature, m) list contain indices in each coordinate + neighbor_grid_metric = neighbor_grid_metric[valid_input_mask] # (m, xyz+feature) + + # calculate kernel value + kernel_vals = self.calculate_kernel_from_distance_vector(neighbor_grid_metric[:,:3]) # (m, 1) + + # compute sum of weight for each voxel in this frame + k_bar_map = torch.zeros_like(confidence_map, dtype=self.dtype, device=self.device).index_put_(valid_neighbor_grid_indices[:3], kernel_vals, accumulate=True) + + # compute contribution of observed sample in this frame + y_bar_map = torch.zeros_like(mean_map, dtype=self.dtype, device=self.device).index_put_(valid_neighbor_grid_indices[:3], neighbor_grid_metric[:,3:] * kernel_vals, accumulate=True) / (k_bar_map + 1e-6) # TODO ways to remove this 1e-6 + + # update variance map + unique_grid_index = [*torch.unique(neighbor_grid_pc[:,:3], dim=0).T.to(torch.long)] # list (3, K), let number of updated grid be K + delta_update_perpoint = neighbor_grid_metric[:,3:] - y_bar_map[valid_neighbor_grid_indices[:3]] # (m, 512) + delta_mean = y_bar_map[unique_grid_index] - mean_map[unique_grid_index] # (K, C) y_bar - miu_0 for all the grids that have been updated, + + # sum up the values that belongs to the same grid + E_bar = delta_mean * delta_mean # (K, C), element-wise product + S_bar_map = torch.zeros_like(mean_map, dtype=self.dtype, device=self.device).index_put_(valid_neighbor_grid_indices[:3], kernel_vals * delta_update_perpoint * delta_update_perpoint, accumulate=True) + S_bar = S_bar_map[unique_grid_index] # (K, C) + + updated_lambda = confidence_map[unique_grid_index] + updated_k = k_bar_map[unique_grid_index] + scaling_factor = (updated_lambda * updated_k) / (updated_lambda + updated_k + 1e-6) # TODO: prevent division by zero + variance_map[unique_grid_index] += S_bar + scaling_factor * E_bar + + # update mean map + mean_map = (confidence_map * mean_map + k_bar_map * y_bar_map) / (confidence_map + k_bar_map + 1e-6) # TODO: prevent division by zero + + # update confidence map + confidence_map += k_bar_map + + return (mean_map, variance_map, confidence_map) + + def forward(self, current_map, point_cloud, iterative = False, min_bound = None, max_bound = None): + if iterative: + # trade speed for memory + batch_size = 100000 # 64 + # batch_size = 1000 # 512 + start = 0 + end = start + batch_size + N = point_cloud.shape[0] + while end < N: + batch_point_cloud = point_cloud[start:end] + current_map 
= self.latent_map_update(current_map, batch_point_cloud, min_bound=min_bound, max_bound=max_bound) + start = end + end = min(start+batch_size, N) + # process last part + batch_point_cloud = point_cloud[start:end] + current_map = self.latent_map_update(current_map, batch_point_cloud, min_bound=min_bound, max_bound=max_bound) + return current_map + else: + # take too much memory + return self.latent_map_update(current_map, point_cloud, min_bound=min_bound, max_bound=max_bound) + + + + + \ No newline at end of file diff --git a/Models/Lseg/Lseg_module.py b/Models/Lseg/Lseg_module.py new file mode 100644 index 0000000..4e8d34a --- /dev/null +++ b/Models/Lseg/Lseg_module.py @@ -0,0 +1,84 @@ +import torch +import clip +import pickle as pk +import os + +from Models.Lseg.lseg_utils import get_lseg_feat, init_lseg +from PCAonGPU.gpu_pca.pca_module import IncrementalPCAonGPU + +class Lseg_module(): + def __init__(self, pca_path = None, device=("cuda" if torch.cuda.is_available() else "cpu")): + self.device = device + self.clip_model, _ = clip.load("ViT-B/32", device=self.device) + (self.lseg_model, self.lseg_transform, + self.crop_size, self.base_size, self.norm_mean, + self.norm_std, self.clip_feat_dim) = init_lseg(self.device) + self.pca = None + if pca_path is not None: + if os.path.basename(pca_path).split('.')[-1] == 'pkl': + self.pca = pk.load(open(pca_path,'rb')) + elif os.path.basename(pca_path).split('.')[-1] == 'pt': + self.pca = IncrementalPCAonGPU(device=self.device) + self.pca.load_vars(pca_path) + + + def encoding_feature(self, rgb : torch.Tensor) -> torch.Tensor: + ''' + Input: + rgb: image torch tensor + Output: + features: per pixel features in the same shape + ''' + labels = ["example"] + pix_feats = get_lseg_feat( + self.lseg_model, rgb.to('cpu').numpy(), + labels, self.lseg_transform, self.device, + self.crop_size, rgb.shape[1], self.norm_mean, + self.norm_std, vis=False + ) + pix_feats = torch.tensor(pix_feats).to(self.device) + return pix_feats + + def decoding_feature(self, features : torch.Tensor, category_features : torch.Tensor) -> torch.Tensor: + ''' + Input: + features: (N, C), features of N elements + Category_features: (M, C), M is the number of categories you have + Output: + semantic_probs: (N, C), category probability for each element + ''' + similarity_matrix = (features / features.norm(dim=-1,keepdim=True)) @ (category_features / (category_features.norm(dim=-1,keepdim=True))).T + # similarity_matrix = features @ category_features.T + # similarity_matrix = torch.nn.functional.cosine_similarity(features.unsqueeze(1), category_features,) + # print(similarity_matrix.shape) + # semantic_probs = similarity_matrix.softmax(dim=-1) # convert to probability + return similarity_matrix # TODO: return category instead? 
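    # Illustrative usage only, mirroring Data/generate_64_pred.py: decode per-pixel LSeg
    # features against CLIP text embeddings of the category names (names here taken from
    # Config/mp3d.yaml; the variable names are hypothetical):
    #   module = Lseg_module(pca_path=None)
    #   cat_feats = module.words_to_clip(["wall", "floor", "chair"])    # (M, 512), L2-normalized
    #   pix_feats = module.encoding_feature(rgb)                        # (1, 512, H, W)
    #   feats = pix_feats.squeeze(0).permute(1, 2, 0).reshape(-1, 512)  # (H*W, 512)
    #   sim = module.decoding_feature(feats, cat_feats)                 # (H*W, M) cosine similarities
    #   labels = sim.softmax(dim=-1).argmax(dim=-1)                     # per-pixel category index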
+ + def words_to_clip(self, word_list) -> torch.Tensor: + text = clip.tokenize(word_list).to(self.device) + with torch.no_grad(): + text_features = self.clip_model.encode_text(text) + text_features /= text_features.norm(dim=1, keepdim=True) + return text_features + + def down_sampling(self, features) -> torch.Tensor: + ''' + Input: + features: (N, C), features of N elements with C dimensions of the feature vector + pca: predefined IncrementalPCAonGPU(n_components=D) object + Output: + tf_features: (N, D), features of N elements with D dimensions of the feature vector + ''' + return self.pca.transform(features) + + def backproject_to_clip(self, features) -> torch.Tensor: + ''' + Input: + features: (N, D), features of N elements with D dimensions of the feature vector + pca: predefined IncrementalPCAonGPU(n_components=D) object + Output: + tf_features: (N, C), features of N elements with C dimensions of the feature vector + ''' + return self.pca.inverse_transform(features) + + \ No newline at end of file diff --git a/Models/Lseg/lseg_utils.py b/Models/Lseg/lseg_utils.py new file mode 100644 index 0000000..19690b2 --- /dev/null +++ b/Models/Lseg/lseg_utils.py @@ -0,0 +1,237 @@ +""" +This code is adapted from the open-source project [Project Name] +Original source: [URL to the original code repository] +Author(s): [Original Author(s) Names] +License: [License Type, e.g., MIT, GPL] +""" +import math +import os +from pathlib import Path +import gdown + +import numpy as np +import cv2 +import torch +import torch.nn.functional as F +import torchvision.transforms as transforms + +from matplotlib import pyplot as plt +import matplotlib.patches as mpatches + +from PIL import Image + +from .models.lseg_net import LSegEncNet + +def init_lseg(device): + crop_size = 480 # 480 + base_size = 1080 # 520 + lseg_model = LSegEncNet("", arch_option=0, block_depth=0, activation="lrelu", crop_size=crop_size) + model_state_dict = lseg_model.state_dict() + checkpoint_dir = Path(__file__).resolve().parents[0] / "lseg" / "checkpoints" + checkpoint_path = checkpoint_dir / "demo_e200.ckpt" + os.makedirs(checkpoint_dir, exist_ok=True) + if not checkpoint_path.exists(): + print("Downloading LSeg checkpoint...") + # the checkpoint is from official LSeg github repo + # https://github.com/isl-org/lang-seg + checkpoint_url = "https://drive.google.com/u/0/uc?id=1ayk6NXURI_vIPlym16f_RG3ffxBWHxvb" + gdown.download(checkpoint_url, output=str(checkpoint_path)) + + pretrained_state_dict = torch.load(checkpoint_path, map_location=device) + pretrained_state_dict = {k.lstrip("net."): v for k, v in pretrained_state_dict["state_dict"].items()} + model_state_dict.update(pretrained_state_dict) + lseg_model.load_state_dict(pretrained_state_dict) + + lseg_model.eval() + lseg_model = lseg_model.to(device) + + norm_mean = [0.5, 0.5, 0.5] + norm_std = [0.5, 0.5, 0.5] + lseg_transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]), + ] + ) + clip_feat_dim = lseg_model.out_c + return lseg_model, lseg_transform, crop_size, base_size, norm_mean, norm_std, clip_feat_dim + +def resize_image(img, h, w, **up_kwargs): + return F.interpolate(img, (h, w), **up_kwargs) + +def pad_image(img, mean, std, crop_size): + b,c,h,w = img.shape #.size() + assert(c==3) + padh = crop_size - h if h < crop_size else 0 + padw = crop_size - w if w < crop_size else 0 + pad_values = -np.array(mean) / np.array(std) + img_pad = img.new().resize_(b,c,h+padh,w+padw) + for i in range(c): + # note that pytorch pad 
params is in reversed orders + img_pad[:,i,:,:] = F.pad(img[:,i,:,:], (0, padw, 0, padh), value=pad_values[i]) + assert(img_pad.size(2)>=crop_size and img_pad.size(3)>=crop_size) + return img_pad + +def crop_image(img, h0, h1, w0, w1): + return img[:,:,h0:h1,w0:w1] + +def get_new_pallete(num_cls): + n = num_cls + pallete = [0] * (n * 3) + # hsv_step = int(179 / n) + # for j in range(0, n): + # hsv = np.array([hsv_step * j, 255, 255], dtype=np.uint8).reshape((1,1,3)) + # rgb = cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB) + # rgb = rgb.reshape(-1) + # pallete[j * 3 + 0] = rgb[0] + # pallete[j * 3 + 1] = rgb[1] + # pallete[j * 3 + 2] = rgb[2] + + for j in range(0, n): + lab = j + pallete[j * 3 + 0] = 0 + pallete[j * 3 + 1] = 0 + pallete[j * 3 + 2] = 0 + i = 0 + while lab > 0: + pallete[j * 3 + 0] |= ((lab >> 0) & 1) << (7 - i) + pallete[j * 3 + 1] |= ((lab >> 1) & 1) << (7 - i) + pallete[j * 3 + 2] |= ((lab >> 2) & 1) << (7 - i) + i = i + 1 + lab >>= 3 + return pallete + + +def get_new_mask_pallete(npimg, new_palette, out_label_flag=False, labels=None, ignore_ids_list=[]): + """Get image color pallete for visualizing masks""" + # put colormap + out_img = Image.fromarray(npimg.squeeze().astype("uint8")) + out_img.putpalette(new_palette) + + if out_label_flag: + assert labels is not None + u_index = np.unique(npimg) + patches = [] + for i, index in enumerate(u_index): + if index in ignore_ids_list: + continue + label = labels[index] + cur_color = [ + new_palette[index * 3] / 255.0, + new_palette[index * 3 + 1] / 255.0, + new_palette[index * 3 + 2] / 255.0, + ] + red_patch = mpatches.Patch(color=cur_color, label=label) + patches.append(red_patch) + return out_img, patches + + + +def get_lseg_feat( + model: LSegEncNet, + image: np.array, + labels, + transform, + device, + crop_size=480, + base_size=1080, + norm_mean=[0.5, 0.5, 0.5], + norm_std=[0.5, 0.5, 0.5], + vis=False, +): + if vis: + vis_image = image.clone().numpy() #.copy() + image = transform(image).unsqueeze(0).to(device) + # print("image shape: ", image.shape) + img = image[0].permute(1, 2, 0) + img = img * 0.5 + 0.5 + + batch, _, h, w = image.size() + # print("h: ", h) + # print("w: ", w) + stride_rate = 2.0 / 3.0 + stride = int(crop_size * stride_rate) + # print("stride: ", stride) + + # long_size = int(math.ceil(base_size * scale)) + long_size = base_size + if h > w: + height = long_size + width = int(1.0 * w * long_size / h + 0.5) + short_size = width + else: + width = long_size + height = int(1.0 * h * long_size / w + 0.5) + short_size = height + + cur_img = resize_image(image, height, width, **{"mode": "bilinear", "align_corners": True}) + # print("cur_img size", np.shape(cur_img)) + + if long_size <= crop_size: + pad_img = pad_image(cur_img, norm_mean, norm_std, crop_size) + with torch.no_grad(): + # outputs = model(pad_img) + outputs, logits = model(pad_img, labels) + outputs = crop_image(outputs, 0, height, 0, width) + else: + if short_size < crop_size: + # pad if needed + pad_img = pad_image(cur_img, norm_mean, norm_std, crop_size) + else: + pad_img = cur_img + # print("pad_img shape", pad_img.shape) + _, _, ph, pw = pad_img.shape # .size() + assert ph >= height and pw >= width + h_grids = int(math.ceil(1.0 * (ph - crop_size) / stride)) + 1 + w_grids = int(math.ceil(1.0 * (pw - crop_size) / stride)) + 1 + with torch.cuda.device_of(image): + with torch.no_grad(): + outputs = image.new().resize_(batch, model.out_c, ph, pw).zero_().to(device) + logits_outputs = image.new().resize_(batch, len(labels), ph, pw).zero_().to(device) + 
count_norm = image.new().resize_(batch, 1, ph, pw).zero_().to(device) + # grid evaluation + for idh in range(h_grids): + for idw in range(w_grids): + h0 = idh * stride + w0 = idw * stride + h1 = min(h0 + crop_size, ph) + w1 = min(w0 + crop_size, pw) + crop_img = crop_image(pad_img, h0, h1, w0, w1) + # pad if needed + pad_crop_img = pad_image(crop_img, norm_mean, norm_std, crop_size) + with torch.no_grad(): + # output = model(pad_crop_img) + output, logits = model(pad_crop_img, labels) + # print("pad_crop_img.shape", pad_crop_img.shape) + cropped = crop_image(output, 0, h1 - h0, 0, w1 - w0) + cropped_logits = crop_image(logits, 0, h1 - h0, 0, w1 - w0) + outputs[:, :, h0:h1, w0:w1] += cropped + logits_outputs[:, :, h0:h1, w0:w1] += cropped_logits + count_norm[:, :, h0:h1, w0:w1] += 1 + assert (count_norm == 0).sum() == 0 + outputs = outputs / count_norm + logits_outputs = logits_outputs / count_norm + outputs = outputs[:, :, :height, :width] + logits_outputs = logits_outputs[:, :, :height, :width] + # outputs = resize_image(outputs, h, w, **{'mode': 'bilinear', 'align_corners': True}) + # outputs = resize_image(outputs, image.shape[0], image.shape[1], **{'mode': 'bilinear', 'align_corners': True}) + outputs = outputs.cpu() + outputs = outputs.numpy() # B, D, H, W + predicts = [torch.max(logit, 0)[1].cpu().numpy() for logit in logits_outputs] + pred = predicts[0] + #print("pred.shape: ", pred.shape) + if vis: + new_palette = get_new_pallete(len(labels)) + mask, patches = get_new_mask_pallete(pred, new_palette, out_label_flag=True, labels=labels) + seg = mask.convert("RGBA") + cv2.imshow("image", vis_image[:, :, [2, 1, 0]]) + cv2.waitKey() + fig = plt.figure() + plt.imshow(seg) + plt.legend(handles=patches, loc="upper left", bbox_to_anchor=(1.0, 1), prop={"size": 20}) + plt.axis("off") + + plt.tight_layout() + plt.show() + #print("outputs.shape", outputs.shape) + return outputs diff --git a/Models/Lseg/models/lseg_blocks.py b/Models/Lseg/models/lseg_blocks.py new file mode 100644 index 0000000..8641422 --- /dev/null +++ b/Models/Lseg/models/lseg_blocks.py @@ -0,0 +1,366 @@ +""" +This code is adapted from the open-source project [Project Name] +Original source: [URL to the original code repository] +Author(s): [Original Author(s) Names] +License: [License Type, e.g., MIT, GPL] +""" +import torch +import torch.nn as nn + +from .lseg_vit import ( + _make_pretrained_clip_vitl16_384, + _make_pretrained_clip_vitb32_384, + _make_pretrained_clipRN50x16_vitl16_384, + forward_vit, +) + + +def _make_encoder( + backbone, + features, + use_pretrained=True, + groups=1, + expand=False, + exportable=True, + hooks=None, + use_vit_only=False, + use_readout="ignore", + enable_attention_hooks=False, +): + if backbone == "clip_vitl16_384": + clip_pretrained, pretrained = _make_pretrained_clip_vitl16_384( + use_pretrained, + hooks=hooks, + use_readout=use_readout, + enable_attention_hooks=enable_attention_hooks, + ) + scratch = _make_scratch( + [256, 512, 1024, 1024], features, groups=groups, expand=expand + ) + elif backbone == "clipRN50x16_vitl16_384": + clip_pretrained, pretrained = _make_pretrained_clipRN50x16_vitl16_384( + use_pretrained, + hooks=hooks, + use_readout=use_readout, + enable_attention_hooks=enable_attention_hooks, + ) + scratch = _make_scratch( + [256, 512, 1024, 1024], features, groups=groups, expand=expand + ) + elif backbone == "clip_vitb32_384": + clip_pretrained, pretrained = _make_pretrained_clip_vitb32_384( + use_pretrained, + hooks=hooks, + use_readout=use_readout, + ) + scratch = 
_make_scratch( + [96, 192, 384, 768], features, groups=groups, expand=expand + ) + else: + print(f"Backbone '{backbone}' not implemented") + assert False + + return clip_pretrained, pretrained, scratch + + +def _make_scratch(in_shape, out_shape, groups=1, expand=False): + scratch = nn.Module() + + out_shape1 = out_shape + out_shape2 = out_shape + out_shape3 = out_shape + out_shape4 = out_shape + if expand == True: + out_shape1 = out_shape + out_shape2 = out_shape * 2 + out_shape3 = out_shape * 4 + out_shape4 = out_shape * 8 + + scratch.layer1_rn = nn.Conv2d( + in_shape[0], + out_shape1, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + scratch.layer2_rn = nn.Conv2d( + in_shape[1], + out_shape2, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + scratch.layer3_rn = nn.Conv2d( + in_shape[2], + out_shape3, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + scratch.layer4_rn = nn.Conv2d( + in_shape[3], + out_shape4, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + + return scratch + + +class Interpolate(nn.Module): + """Interpolation module.""" + + def __init__(self, scale_factor, mode, align_corners=False): + """Init. + + Args: + scale_factor (float): scaling + mode (str): interpolation mode + """ + super(Interpolate, self).__init__() + + self.interp = nn.functional.interpolate + self.scale_factor = scale_factor + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: interpolated data + """ + + x = self.interp( + x, + scale_factor=self.scale_factor, + mode=self.mode, + align_corners=self.align_corners, + ) + + return x + + +class ResidualConvUnit(nn.Module): + """Residual convolution module.""" + + def __init__(self, features): + """Init. + + Args: + features (int): number of features + """ + super().__init__() + + self.conv1 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True + ) + + self.conv2 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True + ) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + out = self.relu(x) + out = self.conv1(out) + out = self.relu(out) + out = self.conv2(out) + + return out + x + + +class FeatureFusionBlock(nn.Module): + """Feature fusion block.""" + + def __init__(self, features): + """Init. + + Args: + features (int): number of features + """ + super(FeatureFusionBlock, self).__init__() + + self.resConfUnit1 = ResidualConvUnit(features) + self.resConfUnit2 = ResidualConvUnit(features) + + def forward(self, *xs): + """Forward pass. + + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + output += self.resConfUnit1(xs[1]) + + output = self.resConfUnit2(output) + + output = nn.functional.interpolate( + output, scale_factor=2, mode="bilinear", align_corners=True + ) + + return output + + +class ResidualConvUnit_custom(nn.Module): + """Residual convolution module.""" + + def __init__(self, features, activation, bn): + """Init. 
+ + Args: + features (int): number of features + """ + super().__init__() + + self.bn = bn + + self.groups = 1 + + self.conv1 = nn.Conv2d( + features, + features, + kernel_size=3, + stride=1, + padding=1, + bias=not self.bn, + groups=self.groups, + ) + + self.conv2 = nn.Conv2d( + features, + features, + kernel_size=3, + stride=1, + padding=1, + bias=not self.bn, + groups=self.groups, + ) + + if self.bn == True: + self.bn1 = nn.BatchNorm2d(features) + self.bn2 = nn.BatchNorm2d(features) + + self.activation = activation + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + + out = self.activation(x) + out = self.conv1(out) + if self.bn == True: + out = self.bn1(out) + + out = self.activation(out) + out = self.conv2(out) + if self.bn == True: + out = self.bn2(out) + + if self.groups > 1: + out = self.conv_merge(out) + + return self.skip_add.add(out, x) + + # return out + x + + +class FeatureFusionBlock_custom(nn.Module): + """Feature fusion block.""" + + def __init__( + self, + features, + activation, + deconv=False, + bn=False, + expand=False, + align_corners=True, + ): + """Init. + + Args: + features (int): number of features + """ + super(FeatureFusionBlock_custom, self).__init__() + + self.deconv = deconv + self.align_corners = align_corners + + self.groups = 1 + + self.expand = expand + out_features = features + if self.expand == True: + out_features = features // 2 + + self.out_conv = nn.Conv2d( + features, + out_features, + kernel_size=1, + stride=1, + padding=0, + bias=True, + groups=1, + ) + + self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn) + self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn) + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, *xs): + """Forward pass. 
+ + Returns: + tensor: output + """ + # print("xs.shape: ", xs.shape) + output = xs[0] + + if len(xs) == 2: + res = self.resConfUnit1(xs[1]) + output = self.skip_add.add(output, res) + # output += res + + output = self.resConfUnit2(output) + + output = nn.functional.interpolate( + output, scale_factor=2, mode="bilinear", align_corners=self.align_corners + ) + + output = self.out_conv(output) + + return output + diff --git a/Models/Lseg/models/lseg_net.py b/Models/Lseg/models/lseg_net.py new file mode 100644 index 0000000..ee60194 --- /dev/null +++ b/Models/Lseg/models/lseg_net.py @@ -0,0 +1,364 @@ +""" +This code is adapted from the open-source project [Project Name] +Original source: [URL to the original code repository] +Author(s): [Original Author(s) Names] +License: [License Type, e.g., MIT, GPL] +""" +import math +import types + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .lseg_blocks import FeatureFusionBlock, Interpolate, _make_encoder, FeatureFusionBlock_custom, forward_vit +import clip +import numpy as np +import pandas as pd +import os + +class depthwise_clipseg_conv(nn.Module): + def __init__(self): + super(depthwise_clipseg_conv, self).__init__() + self.depthwise = nn.Conv2d(1, 1, kernel_size=3, padding=1) + + def depthwise_clipseg(self, x, channels): + x = torch.cat([self.depthwise(x[:, i].unsqueeze(1)) for i in range(channels)], dim=1) + return x + + def forward(self, x): + channels = x.shape[1] + out = self.depthwise_clipseg(x, channels) + return out + + +class depthwise_conv(nn.Module): + def __init__(self, kernel_size=3, stride=1, padding=1): + super(depthwise_conv, self).__init__() + self.depthwise = nn.Conv2d(1, 1, kernel_size=kernel_size, stride=stride, padding=padding) + + def forward(self, x): + # support for 4D tensor with NCHW + C, H, W = x.shape[1:] + x = x.reshape(-1, 1, H, W) + x = self.depthwise(x) + x = x.view(-1, C, H, W) + return x + + +class depthwise_block(nn.Module): + def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'): + super(depthwise_block, self).__init__() + self.depthwise = depthwise_conv(kernel_size=3, stride=1, padding=1) + if activation == 'relu': + self.activation = nn.ReLU() + elif activation == 'lrelu': + self.activation = nn.LeakyReLU() + elif activation == 'tanh': + self.activation = nn.Tanh() + + def forward(self, x, act=True): + x = self.depthwise(x) + if act: + x = self.activation(x) + return x + + +class bottleneck_block(nn.Module): + def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'): + super(bottleneck_block, self).__init__() + self.depthwise = depthwise_conv(kernel_size=3, stride=1, padding=1) + if activation == 'relu': + self.activation = nn.ReLU() + elif activation == 'lrelu': + self.activation = nn.LeakyReLU() + elif activation == 'tanh': + self.activation = nn.Tanh() + + + def forward(self, x, act=True): + sum_layer = x.max(dim=1, keepdim=True)[0] + x = self.depthwise(x) + x = x + sum_layer + if act: + x = self.activation(x) + return x + +class BaseModel(torch.nn.Module): + def load(self, path): + """Load model from file. 
+ Args: + path (str): file path + """ + parameters = torch.load(path, map_location=torch.device("cpu")) + + if "optimizer" in parameters: + parameters = parameters["model"] + + self.load_state_dict(parameters) + +def _make_fusion_block(features, use_bn): + return FeatureFusionBlock_custom( + features, + activation=nn.ReLU(False), + deconv=False, + bn=use_bn, + expand=False, + align_corners=True, + ) + +class LSeg(BaseModel): + def __init__( + self, + head, + features=256, + backbone="clip_vitl16_384", + readout="project", + channels_last=False, + use_bn=False, + **kwargs, + ): + super(LSeg, self).__init__() + + self.channels_last = channels_last + + hooks = { + "clip_vitl16_384": [5, 11, 17, 23], + "clipRN50x16_vitl16_384": [5, 11, 17, 23], + "clip_vitb32_384": [2, 5, 8, 11], + } + + # Instantiate backbone and reassemble blocks + self.clip_pretrained, self.pretrained, self.scratch = _make_encoder( + backbone, + features, + groups=1, + expand=False, + exportable=False, + hooks=hooks[backbone], + use_readout=readout, + ) + + self.scratch.refinenet1 = _make_fusion_block(features, use_bn) + self.scratch.refinenet2 = _make_fusion_block(features, use_bn) + self.scratch.refinenet3 = _make_fusion_block(features, use_bn) + self.scratch.refinenet4 = _make_fusion_block(features, use_bn) + + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)).exp() + if backbone in ["clipRN50x16_vitl16_384"]: + self.out_c = 768 + else: + self.out_c = 512 + self.scratch.head1 = nn.Conv2d(features, self.out_c, kernel_size=1) + + self.arch_option = kwargs["arch_option"] + if self.arch_option == 1: + self.scratch.head_block = bottleneck_block(activation=kwargs["activation"]) + self.block_depth = kwargs['block_depth'] + elif self.arch_option == 2: + self.scratch.head_block = depthwise_block(activation=kwargs["activation"]) + self.block_depth = kwargs['block_depth'] + + self.scratch.output_conv = head + + self.text = clip.tokenize(self.labels) + + def forward(self, x, labelset=''): + if labelset == '': + text = self.text + else: + text = clip.tokenize(labelset) + + if self.channels_last == True: + x.contiguous(memory_format=torch.channels_last) + + layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x) + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + layer_4_rn = self.scratch.layer4_rn(layer_4) + + path_4 = self.scratch.refinenet4(layer_4_rn) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + text = text.to(x.device) + self.logit_scale = self.logit_scale.to(x.device) + text_features = self.clip_pretrained.encode_text(text) + + image_features = self.scratch.head1(path_1) + + imshape = image_features.shape + image_features = image_features.permute(0,2,3,1).reshape(-1, self.out_c) + + # normalized features + image_features = image_features / image_features.norm(dim=-1, keepdim=True) + text_features = text_features / text_features.norm(dim=-1, keepdim=True) + + # pixel_encoding = self.logit_scale * image_features.half() + pixel_encoding = self.logit_scale * image_features + logits_per_image = pixel_encoding @ text_features.t() + + + out = logits_per_image.float().view(imshape[0], imshape[2], imshape[3], -1).permute(0,3,1,2) + + if self.arch_option in [1, 2]: + for _ in range(self.block_depth - 1): + out = self.scratch.head_block(out) + out = self.scratch.head_block(out, False) + + out = 
self.scratch.output_conv(out) + + return out + + +class LSegNet(LSeg): + """Network for semantic segmentation.""" + def __init__(self, labels, path=None, scale_factor=0.5, crop_size=480, **kwargs): + + features = kwargs["features"] if "features" in kwargs else 256 + kwargs["use_bn"] = True + + self.crop_size = crop_size + self.scale_factor = scale_factor + self.labels = labels + + head = nn.Sequential( + Interpolate(scale_factor=2, mode="bilinear", align_corners=True), + ) + + super().__init__(head, **kwargs) + + if path is not None: + self.load(path) + +class LSegEnc(BaseModel): + def __init__( + self, + head, + features=256, + backbone="clip_vitl16_384", + readout="project", + channels_last=False, + use_bn=False, + **kwargs, + ): + super(LSegEnc, self).__init__() + + self.channels_last = channels_last + + hooks = { + "clip_vitl16_384": [5, 11, 17, 23], + "clipRN50x16_vitl16_384": [5, 11, 17, 23], + "clip_vitb32_384": [2, 5, 8, 11], + } + + # Instantiate backbone and reassemble blocks + self.clip_pretrained, self.pretrained, self.scratch = _make_encoder( + backbone, + features, + groups=1, + expand=False, + exportable=False, + hooks=hooks[backbone], + use_readout=readout, + ) + + self.scratch.refinenet1 = _make_fusion_block(features, use_bn) + self.scratch.refinenet2 = _make_fusion_block(features, use_bn) + self.scratch.refinenet3 = _make_fusion_block(features, use_bn) + self.scratch.refinenet4 = _make_fusion_block(features, use_bn) + + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)).exp() + if backbone in ["clipRN50x16_vitl16_384"]: + self.out_c = 768 + else: + self.out_c = 512 + self.scratch.head1 = nn.Conv2d(features, self.out_c, kernel_size=1) + + self.arch_option = kwargs["arch_option"] + if self.arch_option == 1: + self.scratch.head_block = bottleneck_block(activation=kwargs["activation"]) + self.block_depth = kwargs['block_depth'] + elif self.arch_option == 2: + self.scratch.head_block = depthwise_block(activation=kwargs["activation"]) + self.block_depth = kwargs['block_depth'] + + self.scratch.output_conv = head + + self.text = clip.tokenize(self.labels) + + def forward(self, x, labelset): + if labelset == '': + text = self.text + else: + text = clip.tokenize(labelset) + + if self.channels_last == True: + x.contiguous(memory_format=torch.channels_last) + + layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x) + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + layer_4_rn = self.scratch.layer4_rn(layer_4) + + path_4 = self.scratch.refinenet4(layer_4_rn) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + text = text.to(x.device) + self.logit_scale = self.logit_scale.to(x.device) + text_features = self.clip_pretrained.encode_text(text) + + image_features = self.scratch.head1(path_1) + + imshape = image_features.shape + image_features = image_features.permute(0,2,3,1).reshape(-1, self.out_c) + + # normalized features + image_features = image_features / image_features.norm(dim=-1, keepdim=True) + text_features = text_features / text_features.norm(dim=-1, keepdim=True) + + # pixel_encoding = self.logit_scale * image_features.half() + pixel_encoding = self.logit_scale * image_features.float() + + logits_per_image = pixel_encoding @ text_features.t().float() + pixel_encoding = pixel_encoding.float().view(imshape[0], imshape[2], imshape[3], 
-1).permute(0,3,1,2) + + out = logits_per_image.float().view(imshape[0], imshape[2], imshape[3], -1).permute(0,3,1,2) + + # if self.arch_option in [1, 2]: + # for _ in range(self.block_depth - 1): + # out = self.scratch.head_block(out) + # out = self.scratch.head_block(out, False) + + pixel_encoding = self.scratch.output_conv(pixel_encoding) + out = self.scratch.output_conv(out) + + return pixel_encoding, out + + +class LSegEncNet(LSegEnc): + """Network for semantic segmentation.""" + def __init__(self, labels, path=None, scale_factor=0.5, crop_size=480, **kwargs): + + features = kwargs["features"] if "features" in kwargs else 256 + kwargs["use_bn"] = True + + self.crop_size = crop_size + self.scale_factor = scale_factor + self.labels = labels + + head = nn.Sequential( + Interpolate(scale_factor=2, mode="bilinear", align_corners=True), + ) + + super().__init__(head, **kwargs) + + if path is not None: + self.load(path) diff --git a/Models/Lseg/models/lseg_vit.py b/Models/Lseg/models/lseg_vit.py new file mode 100644 index 0000000..3445575 --- /dev/null +++ b/Models/Lseg/models/lseg_vit.py @@ -0,0 +1,559 @@ +""" +This code is adapted from the open-source project [Project Name] +Original source: [URL to the original code repository] +Author(s): [Original Author(s) Names] +License: [License Type, e.g., MIT, GPL] +""" +import torch +import torch.nn as nn +import timm +import types +import math +import torch.nn.functional as F +import clip + +activations = {} + + +def get_activation(name): + def hook(model, input, output): + activations[name] = output + + return hook + + +attention = {} + + +def get_attention(name): + def hook(module, input, output): + x = input[0] + B, N, C = x.shape + qkv = ( + module.qkv(x) + .reshape(B, N, 3, module.num_heads, C // module.num_heads) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = ( + qkv[0], + qkv[1], + qkv[2], + ) # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * module.scale + + attn = attn.softmax(dim=-1) # [:,:,1,1:] + attention[name] = attn + + return hook + + +def get_mean_attention_map(attn, token, shape): + attn = attn[:, :, token, 1:] + attn = attn.unflatten(2, torch.Size([shape[2] // 16, shape[3] // 16])).float() + attn = torch.nn.functional.interpolate( + attn, size=shape[2:], mode="bicubic", align_corners=False + ).squeeze(0) + + all_attn = torch.mean(attn, 0) + + return all_attn + + +class Slice(nn.Module): + def __init__(self, start_index=1): + super(Slice, self).__init__() + self.start_index = start_index + + def forward(self, x): + return x[:, self.start_index :] + + +class AddReadout(nn.Module): + def __init__(self, start_index=1): + super(AddReadout, self).__init__() + self.start_index = start_index + + def forward(self, x): + if self.start_index == 2: + readout = (x[:, 0] + x[:, 1]) / 2 + else: + readout = x[:, 0] + return x[:, self.start_index :] + readout.unsqueeze(1) + + +class ProjectReadout(nn.Module): + def __init__(self, in_features, start_index=1): + super(ProjectReadout, self).__init__() + self.start_index = start_index + + self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU()) + + def forward(self, x): + readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :]) + features = torch.cat((x[:, self.start_index :], readout), -1) + + return self.project(features) + + +class Transpose(nn.Module): + def __init__(self, dim0, dim1): + super(Transpose, self).__init__() + self.dim0 = dim0 + self.dim1 = dim1 + + def forward(self, x): + x = x.transpose(self.dim0, self.dim1) 
+ return x + + +def forward_vit(pretrained, x): + b, c, h, w = x.shape + + # encoder + glob = pretrained.model.forward_flex(x) + + layer_1 = pretrained.activations["1"] + layer_2 = pretrained.activations["2"] + layer_3 = pretrained.activations["3"] + layer_4 = pretrained.activations["4"] + + layer_1 = pretrained.act_postprocess1[0:2](layer_1) + layer_2 = pretrained.act_postprocess2[0:2](layer_2) + layer_3 = pretrained.act_postprocess3[0:2](layer_3) + layer_4 = pretrained.act_postprocess4[0:2](layer_4) + + unflatten = nn.Sequential( + nn.Unflatten( + 2, + torch.Size( + [ + h // pretrained.model.patch_size[1], + w // pretrained.model.patch_size[0], + ] + ), + ) + ) + + if layer_1.ndim == 3: + layer_1 = unflatten(layer_1) + if layer_2.ndim == 3: + layer_2 = unflatten(layer_2) + if layer_3.ndim == 3: + layer_3 = unflatten(layer_3) + if layer_4.ndim == 3: + layer_4 = unflatten(layer_4) + + layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1) + layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2) + layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3) + layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4) + + return layer_1, layer_2, layer_3, layer_4 + + +def _resize_pos_embed(self, posemb, gs_h, gs_w): + posemb_tok, posemb_grid = ( + posemb[:, : self.start_index], + posemb[0, self.start_index :], + ) + + gs_old = int(math.sqrt(len(posemb_grid))) + + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) + posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear") + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1) + + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) + + return posemb + + +def forward_flex(self, x): + b, c, h, w = x.shape + + pos_embed = self._resize_pos_embed( + self.pos_embed, h // self.patch_size[1], w // self.patch_size[0] + ) + + B = x.shape[0] + + if hasattr(self.patch_embed, "backbone"): + x = self.patch_embed.backbone(x) + if isinstance(x, (list, tuple)): + x = x[-1] # last feature if backbone outputs list/tuple of features + x = self.patch_embed.proj(x).flatten(2).transpose(1, 2) + + if getattr(self, "dist_token", None) is not None: + cls_tokens = self.cls_token.expand( + B, -1, -1 + ) # stole cls_tokens impl from Phil Wang, thanks + dist_token = self.dist_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, dist_token, x), dim=1) + else: + cls_tokens = self.cls_token.expand( + B, -1, -1 + ) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + + x = x + pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + + return x + + +def get_readout_oper(vit_features, features, use_readout, start_index=1): + if use_readout == "ignore": + readout_oper = [Slice(start_index)] * len(features) + elif use_readout == "add": + readout_oper = [AddReadout(start_index)] * len(features) + elif use_readout == "project": + readout_oper = [ + ProjectReadout(vit_features, start_index) for out_feat in features + ] + else: + assert ( + False + ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'" + + return readout_oper + + +def _make_pretrained_clip_vitl16_384( + pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False +): + if torch.cuda.is_available(): + device = "cuda" + elif torch.backends.mps.is_available(): + device = "mps" + else: + device = "cpu" + clip_pretrained, _ = 
clip.load("ViT-B/32", device=device, jit=False) + model = timm.create_model("vit_large_patch16_384", pretrained=pretrained) + + hooks = [5, 11, 17, 23] if hooks == None else hooks + + pretrained = _make_vit_b16_backbone( + model, + features=[256, 512, 1024, 1024], + hooks=hooks, + vit_features=1024, + use_readout=use_readout, + enable_attention_hooks=enable_attention_hooks, + ) + return clip_pretrained, pretrained + + +def _make_pretrained_clipRN50x16_vitl16_384( + pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False +): + if torch.cuda.is_available(): + device = "cuda" + elif torch.backends.mps.is_available(): + device = "mps" + else: + device = "cpu" + clip_pretrained, _ = clip.load("RN50x16", device=device, jit=False) + model = timm.create_model("vit_large_patch16_384", pretrained=pretrained) + + hooks = [5, 11, 17, 23] if hooks == None else hooks + + pretrained = _make_vit_b16_backbone( + model, + features=[256, 512, 1024, 1024], + hooks=hooks, + vit_features=1024, + use_readout=use_readout, + enable_attention_hooks=enable_attention_hooks, + ) + return clip_pretrained, pretrained + + +def _make_pretrained_clip_vitb32_384(pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False): + if torch.cuda.is_available(): + device = "cuda" + elif torch.backends.mps.is_available(): + device = "mps" + else: + device = "cpu" + clip_pretrained, _ = clip.load("ViT-B/32", device=device, jit=False) + model = timm.create_model("vit_base_patch32_384", pretrained=pretrained) + + hooks = [2, 5, 8, 11] if hooks == None else hooks + + pretrained = _make_vit_b32_backbone( + model, + features=[96, 192, 384, 768], + hooks=hooks, + use_readout=use_readout, + enable_attention_hooks=False, + ) + return clip_pretrained, pretrained + + +def _make_vit_b32_backbone( + model, + features=[96, 192, 384, 768], + size=[384, 384], + hooks=[2, 5, 8, 11], + vit_features=768, + use_readout="ignore", + start_index=1, + enable_attention_hooks=False, +): + pretrained = nn.Module() + + pretrained.model = model + pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) + pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) + pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) + pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4")) + + pretrained.activations = activations + + pretrained.model.patch_size = [32, 32] + pretrained.model.start_index = start_index + + if enable_attention_hooks: + pretrained.model.blocks[hooks[0]].attn.register_forward_hook( + get_attention("attn_1") + ) + pretrained.model.blocks[hooks[1]].attn.register_forward_hook( + get_attention("attn_2") + ) + pretrained.model.blocks[hooks[2]].attn.register_forward_hook( + get_attention("attn_3") + ) + pretrained.model.blocks[hooks[3]].attn.register_forward_hook( + get_attention("attn_4") + ) + pretrained.attention = attention + + readout_oper = get_readout_oper(vit_features, features, use_readout, start_index) + + pretrained.act_postprocess1 = nn.Sequential( + readout_oper[0], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // pretrained.model.patch_size[1], size[1] // pretrained.model.patch_size[0]])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[0], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[0], + out_channels=features[0], + kernel_size=8, + stride=8, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess2 = nn.Sequential( + 
readout_oper[1], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // pretrained.model.patch_size[1], size[1] // pretrained.model.patch_size[0]])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[1], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[1], + out_channels=features[1], + kernel_size=4, + stride=4, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess3 = nn.Sequential( + readout_oper[2], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // pretrained.model.patch_size[1], size[1] // pretrained.model.patch_size[0]])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[2], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[2], + out_channels=features[2], + kernel_size=2, + stride=2, + padding=0, + # output_padding=output_padding, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess4 = nn.Sequential( + readout_oper[3], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // pretrained.model.patch_size[1], size[1] // pretrained.model.patch_size[0]])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[3], + kernel_size=1, + stride=1, + padding=0, + ), + ) + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. + pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model) + pretrained.model._resize_pos_embed = types.MethodType( + _resize_pos_embed, pretrained.model + ) + + return pretrained + + +def _make_vit_b16_backbone( + model, + features=[96, 192, 384, 768], + size=[384, 384], + hooks=[2, 5, 8, 11], + vit_features=768, + use_readout="ignore", + start_index=1, + enable_attention_hooks=False, +): + pretrained = nn.Module() + + pretrained.model = model + pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) + pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) + pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) + pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4")) + + pretrained.activations = activations + + if enable_attention_hooks: + pretrained.model.blocks[hooks[0]].attn.register_forward_hook( + get_attention("attn_1") + ) + pretrained.model.blocks[hooks[1]].attn.register_forward_hook( + get_attention("attn_2") + ) + pretrained.model.blocks[hooks[2]].attn.register_forward_hook( + get_attention("attn_3") + ) + pretrained.model.blocks[hooks[3]].attn.register_forward_hook( + get_attention("attn_4") + ) + pretrained.attention = attention + + readout_oper = get_readout_oper(vit_features, features, use_readout, start_index) + + # 32, 48, 136, 384 + pretrained.act_postprocess1 = nn.Sequential( + readout_oper[0], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[0], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[0], + out_channels=features[0], + kernel_size=4, + stride=4, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess2 = nn.Sequential( + readout_oper[1], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[1], + kernel_size=1, + stride=1, + padding=0, + ), + 
nn.ConvTranspose2d( + in_channels=features[1], + out_channels=features[1], + kernel_size=2, + stride=2, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess3 = nn.Sequential( + readout_oper[2], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[2], + kernel_size=1, + stride=1, + padding=0, + ), + ) + + pretrained.act_postprocess4 = nn.Sequential( + readout_oper[3], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[3], + kernel_size=1, + stride=1, + padding=0, + ), + nn.Conv2d( + in_channels=features[3], + out_channels=features[3], + kernel_size=3, + stride=2, + padding=1, + ), + ) + + pretrained.model.start_index = start_index + pretrained.model.patch_size = [16, 16] + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. + pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model) + pretrained.model._resize_pos_embed = types.MethodType( + _resize_pos_embed, pretrained.model + ) + + return pretrained \ No newline at end of file diff --git a/Models/SPVCNN/SPVCNN_module.py b/Models/SPVCNN/SPVCNN_module.py new file mode 100644 index 0000000..1f24b48 --- /dev/null +++ b/Models/SPVCNN/SPVCNN_module.py @@ -0,0 +1,44 @@ +import torch +import importlib +import numpy as np + +from Data.KITTI_SPVCNN import KITTI_SPVCNN_config + +class SPVCNN_Module(): + def __init__(self, device=("cuda" if torch.cuda.is_available() else "cpu")): + self.device = device + self.config = KITTI_SPVCNN_config() + self.config = self.config.config + + model_file = importlib.import_module('TwoDPASS.network.' + self.config['model_params']['model_architecture']) + my_model = model_file.get_model(self.config) ######## get model ############ + my_model = my_model.load_from_checkpoint(self.config.checkpoint, config=self.config, strict=(not self.config.pretrain2d)) + my_model = my_model.eval() + + self.my_model = my_model.to(self.device) + + + def encoding_feature(self, data_dict) -> torch.Tensor: + ''' + Input: + rgb: image torch tensor + Output: + features: per pixel features in the same shape + ''' + with torch.no_grad(): + features = self.my_model.encode_points(data_dict, self.device) + return features + + def decoding_feature(self, features, category_features = None) -> torch.Tensor: + ''' + Input: + features: (N, C), features of N elements + Category_features: (M, C), M is the number of categories you have + Output: + semantic_probs: (N, C), category logits for each element + ''' + with torch.no_grad(): + logits = self.my_model.decode_points(features, self.device) + return logits # TODO: return category instead? 
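+    # Usage sketch (illustrative only; `data_dict` is a batch from the 2DPASS SemanticKITTI dataloader):
+    #   module = SPVCNN_Module()
+    #   feats = module.encoding_feature(data_dict)   # (N, C) latent features, one per point
+    #   logits = module.decoding_feature(feats)      # (N, num_classes) per-point class logits
+    #   preds = logits.argmax(dim=-1)                # per-point semantic labels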
+ + \ No newline at end of file diff --git a/Models/mapping_utils.py b/Models/mapping_utils.py new file mode 100644 index 0000000..f2ee4e4 --- /dev/null +++ b/Models/mapping_utils.py @@ -0,0 +1,227 @@ +# This file contains classes for local and global offline mapping (not running semantic prediction) +import torch +import numpy as np +from tqdm import tqdm +from Models.LatentBKI import LatentBKI +from pytorch3d.ops import knn_points + +# Save grid in CPU memory, load to GPU when needed for update step +class GlobalMapContinuous(LatentBKI): + def __init__(self, grid_size, min_bound, max_bound, filter_size, ell, category_feature, + decode = None, pca_upsample = None, num_classes=42, latent_dim=512, device="cpu", datatype=torch.float32, delete_time=10, use_relative_pose=True, pseduo_discrete=True): + super().__init__(grid_size, min_bound, max_bound, max_dist=ell, filter_size=filter_size, + num_classes=num_classes, latent_dim=latent_dim, device=device, datatype=datatype, pseduo_discrete=pseduo_discrete) + self.reset_grid() + self.delete_time = delete_time + self.category_feature = category_feature + self.decode = decode + self.pca_upsample = pca_upsample + self.use_relative_pose = use_relative_pose + self.sum_void_voxel = 0 + + def reset_grid(self): + self.global_map = None + self.map_times = None + self.initial_pose = None + self.translation_discretized = np.zeros(3) + self.points_rotation = torch.eye(3, dtype=self.dtype, device=self.device) + self.points_translation = torch.zeros(3, dtype=self.dtype, device=self.device) + + def inside_mask(self, min_bounds, max_bounds): + inside = torch.all((self.global_map[:, :3] >= min_bounds) & (self.global_map[:, :3] < max_bounds), axis=1) + return inside + + def get_local_map(self, min_bound=None, max_bound=None): + # Fetch local map from CPU (anything not seen is prior) + local_map = self.initialize_grid() # NOTE: N * (x, y, z, c) + inside_mask = None + if min_bound is None: + min_bound = self.min_bound + if max_bound is None: + max_bound = self.max_bound + local_min_bound = min_bound + torch.from_numpy(self.voxel_translation).to(self.device) + local_max_bound = max_bound + torch.from_numpy(self.voxel_translation).to(self.device) + if self.global_map is not None: + inside_mask = self.inside_mask(local_min_bound.detach().cpu(), local_max_bound.detach().cpu()) # NOTE: mask the select existing global grid within the local map boundry + allocated_map = self.global_map[inside_mask].clone().to(device=self.device, dtype=self.dtype) # NOTE: grid thats already exsists in the global map + grid_map = self.grid_ind(allocated_map, min_bound=local_min_bound, max_bound=local_max_bound) # TODO: why this step? 
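+            # grid_ind converts the allocated global-map centroids into integer voxel indices of the
+            # dense local grid, so their stored mean/variance/confidence can be scattered into local_map below.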
+ grid_indices = grid_map[:, :3].to(torch.long) # NOTE: store grid int(x, y, z) + local_map[0][grid_indices[:, 0], grid_indices[:, 1], grid_indices[:, 2], :] = allocated_map[:, 3:self.latent_dim+3] # mean + local_map[1][grid_indices[:, 0], grid_indices[:, 1], grid_indices[:, 2], :] = allocated_map[:, self.latent_dim+3:2*self.latent_dim+3] # variance + local_map[2][grid_indices[:, 0], grid_indices[:, 1], grid_indices[:, 2], :] = allocated_map[:, -1].view(-1,1) # confidence + return local_map, local_min_bound, local_max_bound, inside_mask + + def camera_to_global(self, points): + global_pose = torch.from_numpy(self.global_pose).to(self.device) + return torch.matmul(global_pose[:3, :3], points.T).T + global_pose[:3, 3] # NOTE: global (x,y,z) + + def discretize_to_centroid(self, points): + grid_inds = torch.floor(points / self.voxel_sizes) # assume globle voxel, min starts at 0 + grid_centroids = grid_inds * self.voxel_sizes + self.voxel_sizes/2 + return grid_centroids + + # Propagate map given a transformation matrix + def propagate(self, pose): + # Was just initialized + if self.initial_pose is None: + self.initial_pose = pose + self.inv_rotation = np.linalg.inv(self.initial_pose[:3, :3]) + + # we set the first frame as initial pose, assuem at origin + # the global pose will be the relative pose to the origin + # global_translation = self.inv_rotation @ (pose[:3, 3] - self.initial_pose[:3, 3]) + if self.use_relative_pose: + global_translation = pose[:3, 3] - self.initial_pose[:3, 3] + global_rotation = pose[:3, :3] @ self.inv_rotation + else: + global_translation = pose[:3, 3] + global_rotation = pose[:3, :3] + + self.global_pose = np.zeros((4,4), dtype=np.float32) + self.global_pose[:3,3] = global_translation + self.global_pose[:3,:3] = global_rotation + self.global_pose[3,3] = 1 + + # Relative transformation between origin and current point + relative_translation = self.global_pose[:3, 3] + # To select voxels from memory, find the nearest voxel + voxel_sizes = self.voxel_sizes.detach().cpu().numpy() + self.voxel_translation = np.round(relative_translation / voxel_sizes) * voxel_sizes + if self.use_relative_pose: + self.nearest_voxel = self.initial_pose[:3, 3] + self.voxel_translation + else: + self.nearest_voxel = self.voxel_translation + + # Uses saved weights instead of generating a filter + def update_map(self, semantic_preds): + semantic_preds = semantic_preds.to(self.dtype) # NOTE: (n, 3+1) pose + semantic class, imagine number of semantic class of cube (xyz) stack on top of each other + local_map, local_min_bound, local_max_bound, inside_mask = self.get_local_map() # NOTE: local map contain exisiting global boundary (501, 501, 27, 20) + + # Rotate the point cloud and translate to global frame + global_pose = torch.from_numpy(self.global_pose).to(self.device) + semantic_preds[:, :3] = torch.matmul(global_pose[:3, :3], semantic_preds[:, :3].T).T + global_pose[:3, 3] # NOTE: global (x,y,z) + + # Update local map + with torch.no_grad(): + local_map = self.forward(local_map, semantic_preds, min_bound=local_min_bound, max_bound=local_max_bound, iterative=True) + + # Find updated cells + effective_cells = (local_map[2] > 0).reshape(-1,) # all the cells that has been updated in the local region (including cells updated in previous frame but not this frame) + updated_cells = effective_cells + + updated_centroids = self.centroids[updated_cells, :] + torch.from_numpy(self.voxel_translation).to(self.device) # NOTE: change from local to global, (n, 3) + updated_mean = 
local_map[0].view(-1,self.latent_dim)[updated_cells] + updated_variance = local_map[1].view(-1,self.latent_dim)[updated_cells] + updated_confidence = local_map[2].view(-1, 1)[updated_cells] + new_cells = torch.cat((updated_centroids, updated_mean, updated_variance, updated_confidence), dim=1) # NOTE: only contain updated cells position and feature, (n, 3 + features) + visited_times = torch.zeros(new_cells.shape[0], 1).detach().cpu().numpy() + + # If empty + if self.global_map is None: + self.global_map = new_cells.detach().cpu() + self.map_times = visited_times + else: + # Replace local cells + outside_mask = ~ inside_mask + # Add new cells + self.global_map = torch.vstack((self.global_map[outside_mask, :], new_cells.detach().cpu())) + self.map_times = np.vstack((self.map_times[outside_mask, :], visited_times)) + + # Garbage Collection + # self.garbage_collection() # remove old global map depends on timestamp + return self.global_map + + def garbage_collection(self): + self.map_times += 1 + # Remove cells with T > self.delete_time + recent_mask = self.map_times < self.delete_time + recent_mask = np.squeeze(recent_mask) + self.map_times = self.map_times[recent_mask, :] + self.global_map = self.global_map[recent_mask, :] + + def label_points_iterative(self, points, batch_size = 100000, with_variance = True): + # Preprocessing map + if with_variance: + predictions = torch.empty(0, self.global_map.shape[1]-3+1 ,dtype=torch.float) # features + variance + category (N, features_size) + else: + predictions = torch.empty(0, 1,dtype=torch.float) # category (N,) + ### NOTE: change order of computation, for mean, compute the logits first ### + map_feature = self.global_map[:,3:3+self.latent_dim] + if self.pca_upsample: + map_feature = self.pca_upsample(map_feature) + self.global_map = torch.hstack((self.global_map[:,:3].to(self.device), self.decode(map_feature.to(self.device), self.category_feature.to(self.device)))) # (N, 3 + num_classes) + + self.sum_void_voxel = 0 + N = points.shape[0] + for start in tqdm(range(0, N, batch_size)): + end = min(start+batch_size, N) + batch_points = points[start:end] + batch_pred = self.label_points(batch_points, with_variance) + + predictions = torch.vstack((predictions, batch_pred.cpu())) + + print("Point that does not fall in voxel:", self.sum_void_voxel/N, f"[{self.sum_void_voxel}/{N}]") + return predictions + + def label_points(self, points, with_variance = False): + ''' + Input: + points: (N, 3) unlabeld points in global frame + Output: + + ''' + K = 1 + N = points.shape[0] + + ### NOTE: change order of computation, for mean, compute the logits first ### + points = self.discretize_to_centroid(points) + + # find the neighbors for each of the points, (N, K, 3) + nn_results = knn_points(points.unsqueeze(0), self.global_map[:,:3].to(self.device).unsqueeze(0), K=K, return_sorted=False) + idx = nn_results.idx[0].detach().cpu() + dists = torch.sqrt(nn_results.dists[0].detach().cpu()) # knn returns squared distance + + # points that not fall inside voxel + far_dist_mask = (dists > self.voxel_sizes[0].item()/2).squeeze(1) + self.sum_void_voxel += far_dist_mask.sum().item() + + if with_variance: + nearest_labeled_pc = self.global_map[idx, :].to(self.device) # try label to be feature + variance + confidence + # transform to postiror predictive distribution + confidence = nearest_labeled_pc[:,:,-1].reshape(N,K,1) + nearest_labeled_pc[:, :, 3+self.latent_dim:3+2*self.latent_dim] = (confidence + 1) / (confidence * confidence) * nearest_labeled_pc[:, :, 
3+self.latent_dim:3+2*self.latent_dim] + nearest_labeled_pc[far_dist_mask, :, 3:3+2*self.latent_dim] = 0 # points not fall in voxel has 0 features and variance and confidence + else: + nearest_labeled_pc = self.global_map[idx, :].to(self.device) # return, (N, 3+40), which is categoricall logits + + pred_features_variance = nearest_labeled_pc[:,:,3:] + pred_features_variance = pred_features_variance.squeeze(1) + pred_features_variance[far_dist_mask, 3:] = 0 + + # return pred_features, mask + if with_variance: + pred_features = pred_features_variance[:,:self.latent_dim] + else: + # NOTE: if not with_variance, here is (N, 40) logits + pred_features = pred_features_variance + + # backproject to clip if sample is small + if self.pca_upsample and with_variance: + pred_features = self.pca_upsample(pred_features) + + # decode features into probability for each class + if with_variance: + labels = self.decode(pred_features, self.category_feature) + labels = labels.softmax(dim=-1) + else: + labels = pred_features.softmax(dim=-1) + + predictions = torch.argmax(labels, dim=-1) #(N, ) + predictions[far_dist_mask] = -1 # points that falls in the void + + if with_variance: + predictions = torch.hstack((pred_features_variance, predictions.reshape(-1,1).to(torch.float))) # feature, variance, confidence, category + else: + predictions = predictions.reshape(-1,1) + + return predictions \ No newline at end of file diff --git a/PCAonGPU/PCA_instance/.gitignore b/PCAonGPU/PCA_instance/.gitignore new file mode 100644 index 0000000..86d0cb2 --- /dev/null +++ b/PCAonGPU/PCA_instance/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore \ No newline at end of file diff --git a/PCAonGPU/gpu_pca/__init__.py b/PCAonGPU/gpu_pca/__init__.py new file mode 100644 index 0000000..6679f29 --- /dev/null +++ b/PCAonGPU/gpu_pca/__init__.py @@ -0,0 +1 @@ +from PCAonGPU.gpu_pca.pca_module import IncrementalPCAonGPU \ No newline at end of file diff --git a/PCAonGPU/gpu_pca/pca_module.py b/PCAonGPU/gpu_pca/pca_module.py new file mode 100644 index 0000000..a8ae965 --- /dev/null +++ b/PCAonGPU/gpu_pca/pca_module.py @@ -0,0 +1,268 @@ +"""This module provides an implementation of Incremental Principal +Components Analysis (IPCA) using PyTorch for GPU acceleration. +IPCA is useful for datasets too large to fit into memory, as it +processes data in smaller chunks or batches. +""" + +import torch + +class IncrementalPCAonGPU(): + """ + An implementation of Incremental Principal Components Analysis (IPCA) that leverages PyTorch for GPU acceleration. + + This class provides methods to fit the model on data incrementally in batches, and to transform new data + based on the principal components learned during the fitting process. + + Attributes: + n_components (int, optional): Number of components to keep. If `None`, it's set to the minimum of the + number of samples and features. Defaults to None. + whiten (bool): When True, the `components_` vectors are divided to ensure uncorrelated outputs with + unit component-wise variances. Defaults to False. + copy (bool): If False, input data will be overwritten. Defaults to True. + batch_size (int, optional): The number of samples to use for each batch. If `None`, it's inferred from + the data and set to `5 * n_features`. Defaults to None. 
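+        Example (illustrative sketch; a 512-dimensional feature matrix is assumed):
+            >>> pca = IncrementalPCAonGPU(n_components=64)
+            >>> pca.fit(features)                          # features: (N, 512) tensor
+            >>> reduced = pca.transform(features)          # (N, 64)
+            >>> restored = pca.inverse_transform(reduced)  # (N, 512) approximation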
+ """ + + def __init__(self, device = (torch.device("cuda" if torch.cuda.is_available() else "cpu")), n_components=None, *, whiten=False, copy=True, batch_size=None): + self.n_components = n_components + self.whiten = whiten + self.copy = copy + self.batch_size = batch_size + self.device = device + + # Set n_components_ based on n_components if provided + if n_components: + self.n_components_ = n_components + + # Initialize attributes to avoid errors during the first call to partial_fit + self.mean_ = None # Will be initialized properly in partial_fit based on data dimensions + self.var_ = None # Will be initialized properly in partial_fit based on data dimensions + self.n_samples_seen_ = 0 + + def _validate_data(self, X, dtype=torch.float32, copy=True): + """ + Validates and converts the input data `X` to the appropriate tensor format. + + This method ensures that the input data is in the form of a PyTorch tensor and resides on the correct device (CPU or GPU). + It also provides an option to create a copy of the tensor, which is useful when the input data should not be overwritten. + + Args: + X (Union[np.ndarray, torch.Tensor]): Input data which can be a numpy array or a PyTorch tensor. + dtype (torch.dtype, optional): Desired data type for the tensor. Defaults to torch.float32. + copy (bool, optional): Whether to clone the tensor. If True, a new tensor is returned; otherwise, the original tensor + (or its device-transferred version) is returned. Defaults to True. + + Returns: + torch.Tensor: Validated and possibly copied tensor residing on the specified device. + """ + if not isinstance(X, torch.Tensor): + X = torch.tensor(X, dtype=dtype).to(self.device) + elif X.device != self.device: + X = X.to(self.device) + if copy: + X = X.clone() + return X + + @staticmethod + def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count): + """ + Computes the incremental mean and variance for the data `X`. + + Args: + X (torch.Tensor): The batch input data tensor with shape (n_samples, n_features). + last_mean (torch.Tensor): The previous mean tensor with shape (n_features,). + last_variance (torch.Tensor): The previous variance tensor with shape (n_features,). + last_sample_count (torch.Tensor): The count tensor of samples processed before the current batch. + + Returns: + Tuple[torch.Tensor, torch.Tensor, int]: Updated mean, variance tensors, and total sample count. + """ + if X.shape[0] == 0: + return last_mean, last_variance, last_sample_count + + # If last_mean or last_variance is None, initialize them with zeros + if last_mean is None: + last_mean = torch.zeros(X.shape[1], device=X.device) + if last_variance is None: + last_variance = torch.zeros(X.shape[1], device=X.device) + + new_sample_count = X.shape[0] + new_mean = torch.mean(X, dim=0) + new_sum_square = torch.sum((X - new_mean) ** 2, dim=0) + + updated_sample_count = last_sample_count + new_sample_count + + updated_mean = (last_sample_count * last_mean + new_sample_count * new_mean) / updated_sample_count + updated_variance = (last_variance * (last_sample_count + new_sample_count * last_mean ** 2) + new_sum_square + new_sample_count * new_mean ** 2) / updated_sample_count - updated_mean ** 2 + + return updated_mean, updated_variance, updated_sample_count + + @staticmethod + def _svd_flip(u, v, u_based_decision=True): + """ + Adjusts the signs of the singular vectors from the SVD decomposition for deterministic output. + + This method ensures that the output remains consistent across different runs. 
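+        Each singular vector is flipped, when needed, so that its entry of largest absolute value is positive.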
+ + Args: + u (torch.Tensor): Left singular vectors tensor. + v (torch.Tensor): Right singular vectors tensor. + u_based_decision (bool, optional): If True, uses the left singular vectors to determine the sign flipping. Defaults to True. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Adjusted left and right singular vectors tensors. + """ + if u_based_decision: + max_abs_cols = torch.argmax(torch.abs(u), dim=0) + signs = torch.sign(u[max_abs_cols, range(u.shape[1])]) + else: + max_abs_rows = torch.argmax(torch.abs(v), dim=1) + signs = torch.sign(v[range(v.shape[0]), max_abs_rows]) + u *= signs + v *= signs[:, None] + return u, v + + def fit(self, X, check_input=True): + """ + Fits the model with data `X` using minibatches of size `batch_size`. + + Args: + X (torch.Tensor): The input data tensor with shape (n_samples, n_features). + + Returns: + IncrementalPCAGPU: The fitted IPCA model. + """ + if check_input: + X = self._validate_data(X) + n_samples, n_features = X.shape + if self.batch_size is None: + self.batch_size_ = 5 * n_features + else: + self.batch_size_ = self.batch_size + + for start in range(0, n_samples, self.batch_size_): + end = min(start + self.batch_size_, n_samples) + X_batch = X[start:end] + self.partial_fit(X_batch, check_input=False) + + return self + + def partial_fit(self, X, check_input=True): + """ + Incrementally fits the model with batch data `X`. + + Args: + X (torch.Tensor): The batch input data tensor with shape (n_samples, n_features). + check_input (bool, optional): If True, validates the input. Defaults to True. + + Returns: + IncrementalPCAGPU: The updated IPCA model after processing the batch. + """ + first_pass = not hasattr(self, "components_") + + if check_input: + X = self._validate_data(X) + n_samples, n_features = X.shape + + if first_pass: + self.components_ = None + if self.n_components is None: + self.n_components_ = min(n_samples, n_features) + + col_mean, col_var, n_total_samples = self._incremental_mean_and_var( + X, self.mean_, self.var_, torch.tensor([self.n_samples_seen_], device=self.device) + ) + + # Whitening + if self.n_samples_seen_ == 0: + X -= col_mean + else: + col_batch_mean = torch.mean(X, dim=0) + X -= col_batch_mean + mean_correction_factor = torch.sqrt( + torch.tensor((self.n_samples_seen_ / n_total_samples.item()) * n_samples, device=self.device) + ) + mean_correction = mean_correction_factor * (self.mean_ - col_batch_mean) + + if self.singular_values_ is not None and self.components_ is not None: + X = torch.vstack( + ( + self.singular_values_.view((-1, 1)) * self.components_, + X, + mean_correction, + ) + ) + + U, S, Vt = torch.linalg.svd(X, full_matrices=False) + U, Vt = self._svd_flip(U, Vt, u_based_decision=False) + explained_variance = S**2 / (n_total_samples.item() - 1) + explained_variance_ratio = S**2 / torch.sum(col_var * n_total_samples.item()) + + self.n_samples_seen_ = n_total_samples.item() + self.components_ = Vt[: self.n_components_] + self.singular_values_ = S[: self.n_components_] + self.mean_ = col_mean + self.var_ = col_var + self.explained_variance_ = explained_variance[: self.n_components_] + self.explained_variance_ratio_ = explained_variance_ratio[: self.n_components_] + if self.n_components_ not in (n_samples, n_features): + self.noise_variance_ = explained_variance[self.n_components_ :].mean().item() + else: + self.noise_variance_ = 0.0 + return self + + def transform(self, X): + """ + Applies dimensionality reduction to `X`. 
+
+        The input data `X` is projected onto the first principal components previously extracted from a training set.
+
+        Args:
+            X (torch.Tensor): New data tensor with shape (n_samples, n_features) to be transformed.
+
+        Returns:
+            torch.Tensor: Transformed data tensor with shape (n_samples, n_components).
+        """
+        X = X.to(self.device)
+        return torch.mm(X - self.mean_, self.components_.T)
+
+    def inverse_transform(self, X):
+        """
+        Transform data back to its original space.
+
+        In other words, return an input `X_original` whose transform would be `X`.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_components)
+            New data, where `n_samples` is the number of samples
+            and `n_components` is the number of components.
+
+        Returns
+        -------
+        X_original : array-like of shape (n_samples, n_features)
+            Original data, where `n_samples` is the number of samples
+            and `n_features` is the number of features.
+        """
+        X = X.to(self.device)
+        return X @ self.components_ + self.mean_
+
+    def save_vars(self, save_path):
+        '''
+        Move all tensors to CPU and save all attributes except 'device'.
+        '''
+        state_dict = vars(self).copy()
+        for key, value in state_dict.items():
+            if type(value) is torch.Tensor:
+                state_dict[key] = value.detach().cpu()
+        state_dict.pop('device')
+        torch.save(state_dict, save_path)
+
+    def load_vars(self, load_path):
+        state_dict = torch.load(load_path)
+        for key, value in state_dict.items():
+            vars(self)[key] = value.to(self.device) if type(value) is torch.Tensor else value
+
+    def get_vars(self):
+        return vars(self)
\ No newline at end of file
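A minimal usage sketch of the `IncrementalPCAGPU` class above; the import path and the 512-dimensional dummy features are assumptions and should be adjusted to the repository's actual module layout and feature size:

```python
import torch

# Assumed import location; adjust to where this class actually lives in the repo.
from PCAonGPU.gpu_pca import IncrementalPCAGPU

# Dummy stand-in for high-dimensional per-point features (N x D).
X = torch.randn(10_000, 512)

pca = IncrementalPCAGPU(n_components=64, batch_size=4096)
pca.fit(X)                            # streams X through partial_fit in minibatches
Z = pca.transform(X)                  # (N, 64) compressed features
X_rec = pca.inverse_transform(Z)      # (N, 512) reconstruction from the 64-D space

pca.save_vars("pca_64.pkl")           # tensors are moved to CPU before saving
pca_restored = IncrementalPCAGPU(n_components=64)
pca_restored.load_vars("pca_64.pkl")  # attributes are moved back onto pca_restored.device
```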
diff --git a/README.md b/README.md
index 491ea46..683cdb8 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,146 @@
 # LatentBKI
-Repository for latent Bayesian Kernel Inference
+This repository is the implementation of [Latent BKI](), aiming to reproduce the experimental results reported in the paper.
+
+## Installation (tested with Python 3.7 + torch 1.11 + CUDA 11.3)
+
+```
+conda env create -f environment.yml
+conda activate latentbki_env
+```
+
+Some packages need to be installed manually:
+
+1. CLIP: follow the [repo](https://github.com/openai/CLIP).
+2. [torchsparse](https://github.com/mit-han-lab/torchsparse) (sudo apt-get install libsparsehash-dev, pip install --upgrade git+https://github.com/mit-han-lab/torchsparse.git@v1.4.0)
+3. [pytorch3d](https://github.com/facebookresearch/pytorch3d/blob/main/INSTALL.md#building--installing-from-source)
+
+(Optional) Install [ROS Melodic](https://wiki.ros.org/melodic/Installation/Ubuntu) to visualize the map in Rviz.
+
+> Note: Also check out the [dependent codebases](#acknowledgements) if you run into any issues.
+
+## Data preparation
+
+### Matterport 3D (MP3D)
+Follow VLMaps' [Generate dataset](https://github.com/vlmaps/vlmaps?tab=readme-ov-file#generate-dataset) section to get the MP3D sequences. The generated ground-truth semantics are partly incorrect, so we provide a modification to obtain correct ground-truth data [here]().
+
+### Semantic KITTI
+Follow 2DPASS' [Data Preparation](https://github.com/yanx27/2DPASS?tab=readme-ov-file#data-preparation) section to obtain the Semantic KITTI dataset under the `Dataset` folder.
+
+Download the SPVCNN model checkpoint from [here](https://drive.google.com/drive/folders/1VpY2MCp5i654pXjizFMw0mrmuxC_XboW) and put it under `./TwoDPASS/pretrained/SPVCNN`.
+
+### Real World data with iPhone/iPad
+1. Download the [Record3D](https://record3d.app/) app on an iPhone/iPad, record a video, and export it as a `.r3d` file.
+2. Extract the `.r3d` file the same way you would a zip archive; you will get a folder named `rgbd` and a `metadata` file.
+3. Run `Data/select_r3d_frames.py` with customized parameters to create the following real-world dataset folder structure:
+```
+/[your dataset path]
+├──
+├── ...
+└── real_world/
+    ├──[sequence name]
+        ├── conf/
+        |   ├── 000000.npy
+        |   ├── 000001.npy
+        |   └── ...
+        └── depth/
+        |   ├── 000000.npy
+        |   ├── 000001.npy
+        |   └── ...
+        └── rgb/
+        |   ├── 000000.jpg
+        |   ├── 000001.jpg
+        |   └── ...
+        └── intrinsics.txt
+        └── poses.txt
+```
+You can download an already processed Record3D data example, `my_house_long.zip`, [here](https://drive.google.com/drive/folders/1dWJXcvHyBimh8KMvA7e3zwApXju6tXTZ?usp=drive_link).
+
+### Download PCA downsampler
+
+Download `mp3d_pca_64.pkl` [here](https://drive.google.com/drive/folders/1dWJXcvHyBimh8KMvA7e3zwApXju6tXTZ?usp=drive_link) to `./PCAonGPU/PCA_instance`.
+
+## Usage
+
+### MP3D data
+Provide paths for the following parameters in `./Config/mp3d.yaml`:
+
+```
+- data_dir: "/path/to/mp3d/dataset/folder"
+- pca_path: "/path/to/trained/pca/.pkl/file"
+```
+
+Other parameters are optional if you only want to reproduce the results.
+
+### Real-world data
+
+Modify `realworld.yaml` under `./Config`:
+
+```
+Required parameters:
+- num_classes: [number of classes desired]
+- data_dir: "/path/to/realworld/dataset/folder"
+- pca_path: "/path/to/trained/pca/.pkl/file"
+- intrinsic: [matrix from the intrinsics.txt]
+- sequences: [
+    [your_sequences_name]
+]
+- category: [
+    [list of words you want to decode]
+]
+
+Optional parameters:
+- feature_size: [PCA downsampled size, default 64]
+- grid_mask: [ignore points outside the local grid, default True]
+- down_sample_feature: [default True]
+- raw_data: [set to True only if features are saved to disk]
+- subsample_points: [how many pixel features to use; default 1 uses all features]
+- feature_dir: [set it only if you save latent features to disk]
+```
+
+### KITTI
+
+NOTE: `semantic_kitti.yaml` is used to provide additional parameters, such as the feature size. We use the dataloader from 2DPASS. Change the following parameters in `TwoDPASS/config/SPVCNN-semantickitti.yaml`:
+
+```
+train_data_loader:
+  data_path: "/path/to/kitti/dataset/sequences"
+
+val_data_loader:
+  data_path: "/path/to/kitti/dataset/sequences"
+```
+
+### Run mapping algorithm
+
+In `./generate_results.py`, set `MODEL_NAME` to one of the following:
+
+- "LatentBKI_default": latent mapping on MP3D
+- "LatentBKI_kitti": latent mapping on Semantic KITTI
+- "LatentBKI_vlmap": includes the VLMaps heuristic for the comparison experiment
+- "LatentBKI_realworld": maps a real-world environment captured with Record3D
+
+The generated latent map and the evaluation result for each sequence will be placed under the `Results` folder.
+
+### Evaluate map
+In `./inference.py`, provide the following parameters:
+```
+- RESULT_SAVE: the folder that contains the map you want to evaluate
+- MODEL_NAME: the model you used to create the above map
+- scenes: the sequences you want to evaluate
+```
+
+The evaluation result will be written to the folder you provided in `RESULT_SAVE` as a `results.txt` file.
+
+### Visualize map (ROS required)
+1. Run `./publish_map.py` with `latent_map` and `category_map` set to the map you want to visualize.
+2. Open Rviz and subscribe to the topic `visualization_marker_array`.
+
+### Open-Dictionary Query Demo (ROS required)
+1. Run `./publish_map.py` with customized `MODEL_NAME` and `latent_map_path` parameters.
+2.
Open Rviz and subscribe to topic `Open_Query/Heatmap` and `Open_Query/Uncertainty` +3. In terminal follow the prompt to query arbitrary word. + +## Acknowledgements + +code is built based on [ConvBKI](https://github.com/UMich-CURLY/NeuralBKI), [VLMaps](https://github.com/vlmaps/vlmaps), [2DPASS](https://github.com/yanx27/2DPASS/blob/main/README.md?plain=1), [Record3D](https://github.com/marek-simonik/record3d) \ No newline at end of file diff --git a/Results/.gitignore b/Results/.gitignore new file mode 100644 index 0000000..86d0cb2 --- /dev/null +++ b/Results/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore \ No newline at end of file diff --git a/TwoDPASS/.gitattributes b/TwoDPASS/.gitattributes new file mode 100644 index 0000000..dfe0770 --- /dev/null +++ b/TwoDPASS/.gitattributes @@ -0,0 +1,2 @@ +# Auto detect text files and perform LF normalization +* text=auto diff --git a/TwoDPASS/.gitignore b/TwoDPASS/.gitignore new file mode 100644 index 0000000..d9005f2 --- /dev/null +++ b/TwoDPASS/.gitignore @@ -0,0 +1,152 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintainted in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/TwoDPASS/LICENSE b/TwoDPASS/LICENSE new file mode 100644 index 0000000..c10952a --- /dev/null +++ b/TwoDPASS/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Benny + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/TwoDPASS/README.md b/TwoDPASS/README.md new file mode 100644 index 0000000..32a6932 --- /dev/null +++ b/TwoDPASS/README.md @@ -0,0 +1,209 @@ + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/2dpass-2d-priors-assisted-semantic/3d-semantic-segmentation-on-semantickitti)](https://paperswithcode.com/sota/3d-semantic-segmentation-on-semantickitti?p=2dpass-2d-priors-assisted-semantic)[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/2dpass-2d-priors-assisted-semantic/lidar-semantic-segmentation-on-nuscenes)](https://paperswithcode.com/sota/lidar-semantic-segmentation-on-nuscenes?p=2dpass-2d-priors-assisted-semantic) + +# 2DPASS + +[![arXiv](https://img.shields.io/badge/arXiv-2203.09065-b31b1b.svg)](https://arxiv.org/pdf/2207.04397.pdf) +[![GitHub Stars](https://img.shields.io/github/stars/yanx27/2DPASS?style=social)](https://github.com/yanx27/2DPASS) +![visitors](https://visitor-badge.glitch.me/badge?page_id=https://github.com/yanx27/2DPASS) + + + +This repository is for **2DPASS** introduced in the following paper + +[Xu Yan*](https://yanx27.github.io/), [Jiantao Gao*](https://github.com/Gao-JT), [Chaoda Zheng*](https://github.com/Ghostish), Chao Zheng, Ruimao Zhang, Shuguang Cui, [Zhen Li*](https://mypage.cuhk.edu.cn/academics/lizhen/), "*2DPASS: 2D Priors Assisted Semantic Segmentation on LiDAR Point Clouds*", ECCV 2022 [[arxiv]](https://arxiv.org/pdf/2207.04397.pdf). + ![image](figures/2DPASS.gif) + +If you find our work useful in your research, please consider citing: +```latex +@inproceedings{yan20222dpass, + title={2dpass: 2d priors assisted semantic segmentation on lidar point clouds}, + author={Yan, Xu and Gao, Jiantao and Zheng, Chaoda and Zheng, Chao and Zhang, Ruimao and Cui, Shuguang and Li, Zhen}, + booktitle={European Conference on Computer Vision}, + pages={677--695}, + year={2022}, + organization={Springer} +} + +@InProceedings{yan2022let, + title={Let Images Give You More: Point Cloud Cross-Modal Training for Shape Analysis}, + author={Xu Yan and Heshen Zhan and Chaoda Zheng and Jiantao Gao and Ruimao Zhang and Shuguang Cui and Zhen Li}, + year={2022}, + booktitle={NeurIPS} +} + +@article{yan2023benchmarking, + title={Benchmarking the Robustness of LiDAR Semantic Segmentation Models}, + author={Yan, Xu and Zheng, Chaoda and Li, Zhen and Cui, Shuguang and Dai, Dengxin}, + journal={arXiv preprint arXiv:2301.00970}, + year={2023} +} +``` +## News +* **2023-04-01** We merge MinkowskiNet and official SPVCNN models from [SPVNAS](https://github.com/mit-han-lab/spvnas) in our codebase. You can check these models in `config/`. We rename our baseline model from `spvcnn.py` to `baseline.py`. +* **2023-03-31** We provide codes for the robustness evaluation on SemanticKITTI-C. +* **2023-03-27** We release a model with higher performance on SemanticKITTI and codes for naive instance augmentation. +* **2023-02-25** We release a new robustness benchmark for LiDAR semantic segmentation at [SemanticKITTI-C](https://yanx27.github.io/RobustLidarSeg/). Welcome to test your models! +

+ +

+ + +* **2022-10-11** Our new work for cross-modal knowledge distillation is accepted at NeurIPS 2022:smiley: [paper](https://arxiv.org/pdf/2210.04208.pdf) / [code](https://github.com/ZhanHeshen/PointCMT). +* **2022-09-20** We release codes for SemanticKITTI single-scan and NuScenes :rocket:! +* **2022-07-03** 2DPASS is accepted at **ECCV 2022** :fire:! +* **2022-03-08** We achieve **1st** place in both single and multi-scans of [SemanticKITTI](http://semantic-kitti.org/index.html) and **3rd** place on [NuScenes-lidarseg](https://www.nuscenes.org/) :fire:! +

+ +

+

+ +

+

+ +

+ +## Installation + +### Requirements +- pytorch >= 1.8 +- yaml +- easydict +- pyquaternion +- [lightning](https://github.com/Lightning-AI/lightning) (tested with pytorch_lightning==1.3.8 and torchmetrics==0.5) +- [torch-scatter](https://github.com/rusty1s/pytorch_scatter) (pip install torch-scatter -f https://data.pyg.org/whl/torch-1.9.0+${CUDA}.html) +- [nuScenes-devkit](https://github.com/nutonomy/nuscenes-devkit) (optional for nuScenes) +- [spconv](https://github.com/traveller59/spconv) (tested with spconv==2.1.16 and cuda==11.1, pip install spconv-cu111==2.1.16) +- [torchsparse](https://github.com/mit-han-lab/torchsparse) (optional for MinkowskiNet and SPVCNN. sudo apt-get install libsparsehash-dev, pip install --upgrade git+https://github.com/mit-han-lab/torchsparse.git@v1.4.0) + +## Data Preparation + +### SemanticKITTI +Please download the files from the [SemanticKITTI website](http://semantic-kitti.org/dataset.html) and additionally the [color data](http://www.cvlibs.net/download.php?file=data_odometry_color.zip) from the [Kitti Odometry website](http://www.cvlibs.net/datasets/kitti/eval_odometry.php). Extract everything into the same folder. +``` +./dataset/ +├── +├── ... +└── SemanticKitti/ + ├──sequences + ├── 00/ + │ ├── velodyne/ + | | ├── 000000.bin + | | ├── 000001.bin + | | └── ... + │ └── labels/ + | | ├── 000000.label + | | ├── 000001.label + | | └── ... + | └── image_2/ + | | ├── 000000.png + | | ├── 000001.png + | | └── ... + | calib.txt + ├── 08/ # for validation + ├── 11/ # 11-21 for testing + └── 21/ + └── ... +``` + +### NuScenes +Please download the Full dataset (v1.0) from the [NuScenes website](https://www.nuscenes.org/) with lidarseg and extract it. +``` +./dataset/ +├── +├── ... +└── nuscenes/ + ├──v1.0-trainval + ├──v1.0-test + ├──samples + ├──sweeps + ├──maps + ├──lidarseg +``` + +## Training +### SemanticKITTI +You can run the training with +```shell script +cd +python main.py --log_dir 2DPASS_semkitti --config config/2DPASS-semantickitti.yaml --gpu 0 +``` +The output will be written to `logs/SemanticKITTI/2DPASS_semkitti` by default. +### NuScenes +```shell script +cd +python main.py --log_dir 2DPASS_nusc --config config/2DPASS-nuscenese.yaml --gpu 0 1 2 3 +``` + +### Vanilla Training without 2DPASS +We take SemanticKITTI as an example. +```shell script +cd +python main.py --log_dir baseline_semkitti --config config/2DPASS-semantickitti.yaml --gpu 0 --baseline_only +``` + +## Testing +You can run the testing with +```shell script +cd +python main.py --config config/2DPASS-semantickitti.yaml --gpu 0 --test --num_vote 12 --checkpoint +``` +Here, `num_vote` is the number of views for the test-time-augmentation (TTA). We set this value to 12 as default (on a Tesla-V100 GPU), and if you use other GPUs with smaller memory, you can choose a smaller value. `num_vote=1` denotes there is no TTA used, and will cause about ~2\% performance drop. + +## Robustness Evaluation +Please download all subsets of [SemanticKITTI-C](https://arxiv.org/pdf/2301.00970.pdf) from [this link](https://cuhko365-my.sharepoint.com/personal/218012048_link_cuhk_edu_cn/_layouts/15/onedrive.aspx?id=%2Fpersonal%2F218012048%5Flink%5Fcuhk%5Fedu%5Fcn%2FDocuments%2FSemanticKITTIC&ga=1) and extract them. +``` +./dataset/ +├── +├── ... +└── SemanticKitti/ + ├──sequences + ├──SemanticKITTI-C + ├── clean_data/ + ├── dense_16beam/ + │ ├── velodyne/ + | | ├── 000000.bin + | | ├── 000001.bin + | | └── ... + │ └── labels/ + | | ├── 000000.label + | | ├── 000001.label + | | └── ... + ... 
+``` +You can run the robustness evaluation with +```shell script +cd +python robust_test.py --config config/2DPASS-semantickitti.yaml --gpu 0 --num_vote 12 --checkpoint +``` + +## Model Zoo +You can download the models with the scores below from [this Google drive folder](https://drive.google.com/drive/folders/1Xy6p_h827lv8J-2iZU8T6SLFkxfoXPBE?usp=sharing). +### SemanticKITTI +|Model (validation)|mIoU (vanilla)|mIoU (TTA)|Parameters| +|:---:|:---:|:---:|:---:| +|MinkowskiNet|65.1%|67.1%|21.7M| +|SPVCNN|65.9%|67.8%|21.8M| +|2DPASS (4scale-64dimension)|68.7%|70.0%|1.9M| +|2DPASS (6scale-256dimension)|70.7%|72.0%|45.6M| + +Here, we fine-tune 2DPASS models on SemanticKITTI with more epochs and thus gain the higher mIoU. If you train with 64 epochs, it should be gained about 66%/69% for vanilla and 69%/71% after TTA. + +### NuScenes +|Model (validation)|mIoU (vanilla)|mIoU (TTA)|Parameters| +|:---:|:---:|:---:|:---:| +|MinkowskiNet|74.3%|76.0%|21.7M| +|SPVCNN|74.9%|76.9%|21.8M| +|2DPASS (6scale-128dimension)|76.7%|79.6%|11.5M| +|2DPASS (6scale-256dimension)|78.0%|80.5%|45.6M| + +**Note that the results on benchmarks are gained by training with additional validation set and using instance-level augmentation.** + +## Acknowledgements +Code is built based on [SPVNAS](https://github.com/mit-han-lab/spvnas), [Cylinder3D](https://github.com/xinge008/Cylinder3D), [xMUDA](https://github.com/valeoai/xmuda) and [SPCONV](https://github.com/traveller59/spconv). + +## License +This repository is released under MIT License (see LICENSE file for details). + + + diff --git a/TwoDPASS/config/2DPASS-large-semantickitti.yaml b/TwoDPASS/config/2DPASS-large-semantickitti.yaml new file mode 100644 index 0000000..06edf0f --- /dev/null +++ b/TwoDPASS/config/2DPASS-large-semantickitti.yaml @@ -0,0 +1,117 @@ +# Config format schema number +format_version: 1 + +################### +## Model options +model_params: + model_architecture: "arch_2dpass" + + input_dims: 4 + spatial_shape: + - 1000 + - 1000 + - 60 + scale_list: + - 2 + - 4 + - 8 + - 16 + - 16 + - 16 + + hiden_size: 256 + num_classes: 20 + backbone_2d: resnet34 + pretrained2d: False + +################### +## Dataset options +dataset_params: + training_size: 19132 + dataset_type: "point_image_dataset_mix_semkitti" + pc_dataset_type: "SemanticKITTI" + collate_type: "collate_fn_default" + ignore_label: 0 + label_mapping: "./config/label_mapping/semantic-kitti.yaml" + + # 2D augmentation + bottom_crop: + - 480 + - 320 + color_jitter: + - 0.4 + - 0.4 + - 0.4 + flip2d: 0.5 + image_normalizer: + - [0.485, 0.456, 0.406] + - [0.229, 0.224, 0.225] + max_volume_space: + - 50 + - 50 + - 2 + min_volume_space: + - -50 + - -50 + - -4 + seg_labelweights: + - 0 + - 55437630 + - 320797 + - 541736 + - 2578735 + - 3274484 + - 552662 + - 184064 + - 78858 + - 240942562 + - 17294618 + - 170599734 + - 6369672 + - 230413074 + - 101130274 + - 476491114 + - 9833174 + - 129609852 + - 4506626 + - 1168181 + + train_data_loader: + data_path: "./dataset/SemanticKitti/dataset/sequences/" + batch_size: 4 + shuffle: True + num_workers: 8 + rotate_aug: True + flip_aug: True + scale_aug: True + transform_aug: True + dropout_aug: True + + val_data_loader: + data_path: "./dataset/SemanticKitti/dataset/sequences/" + shuffle: False + num_workers: 8 + + # normal test + batch_size: 4 + rotate_aug: False + flip_aug: False + scale_aug: False + transform_aug: False + dropout_aug: False + + +################### +## Train params +train_params: + max_num_epochs: 64 + learning_rate: 0.24 + optimizer: SGD # 
[SGD, Adam] + lr_scheduler: CosineAnnealingLR # [StepLR, ReduceLROnPlateau, CosineAnnealingLR, CosineAnnealingWarmRestarts] + momentum: 0.9 + nesterov: True + weight_decay: 1.0e-4 + + lambda_seg2d: 1 + lambda_xm: 0.05 + lambda_lovasz: 1 diff --git a/TwoDPASS/config/2DPASS-nuscenes.yaml b/TwoDPASS/config/2DPASS-nuscenes.yaml new file mode 100644 index 0000000..7ad4625 --- /dev/null +++ b/TwoDPASS/config/2DPASS-nuscenes.yaml @@ -0,0 +1,95 @@ +# Config format schema number +format_version: 1 + +################### +## Model options +model_params: + model_architecture: "arch_2dpass" + + input_dims: 4 + spatial_shape: + - 1000 + - 1000 + - 70 + scale_list: + - 2 + - 4 + - 8 + - 16 + - 16 + - 16 + + hiden_size: 256 + num_classes: 17 + backbone_2d: resnet34 + pretrained2d: False + +################### +## Dataset options +dataset_params: + training_size: 28130 + dataset_type: "point_image_dataset_nus" + pc_dataset_type: "nuScenes" + collate_type: "collate_fn_default" + ignore_label: 0 + label_mapping: "./config/label_mapping/nuscenes.yaml" + + # 2D augmentation + resize: + - 400 + - 240 + color_jitter: + - 0.4 + - 0.4 + - 0.4 + flip2d: 0.5 + image_normalizer: + - [0.485, 0.456, 0.406] + - [0.229, 0.224, 0.225] + max_volume_space: + - 50 + - 50 + - 3 + min_volume_space: + - -50 + - -50 + - -4 + + train_data_loader: + data_path: "./dataset/nuscenes/" + batch_size: 8 + shuffle: True + num_workers: 8 + rotate_aug: True + flip_aug: True + scale_aug: True + transform_aug: True + dropout_aug: True + + val_data_loader: + data_path: "./dataset/nuscenes/" + shuffle: False + num_workers: 8 + + # normal test + batch_size: 8 + rotate_aug: False + flip_aug: False + scale_aug: False + transform_aug: False + dropout_aug: False + +################### +## Train params +train_params: + max_num_epochs: 80 + learning_rate: 0.24 + optimizer: SGD # [SGD, Adam] + lr_scheduler: CosineAnnealingLR # [StepLR, ReduceLROnPlateau, CosineAnnealingLR, CosineAnnealingWarmRestarts] + momentum: 0.9 + nesterov: True + weight_decay: 1.0e-4 + + lambda_seg2d: 1 + lambda_xm: 0.05 + lambda_lovasz: 1 diff --git a/TwoDPASS/config/2DPASS-semantickitti.yaml b/TwoDPASS/config/2DPASS-semantickitti.yaml new file mode 100644 index 0000000..4e0eebd --- /dev/null +++ b/TwoDPASS/config/2DPASS-semantickitti.yaml @@ -0,0 +1,114 @@ +# Config format schema number +format_version: 1 + +################### +## Model options +model_params: + model_architecture: "arch_2dpass" + + input_dims: 4 + spatial_shape: + - 1000 + - 1000 + - 60 + scale_list: + - 2 + - 4 + - 8 + - 16 + + hiden_size: 64 + num_classes: 20 + backbone_2d: resnet34 + pretrained2d: False + +################### +## Dataset options +dataset_params: + training_size: 19132 + dataset_type: "point_image_dataset_semkitti" + pc_dataset_type: "SemanticKITTI" + collate_type: "collate_fn_default" + ignore_label: 0 + label_mapping: "./config/label_mapping/semantic-kitti.yaml" + + # 2D augmentation + bottom_crop: + - 480 + - 320 + color_jitter: + - 0.4 + - 0.4 + - 0.4 + flip2d: 0.5 + image_normalizer: + - [0.485, 0.456, 0.406] + - [0.229, 0.224, 0.225] + max_volume_space: + - 50 + - 50 + - 2 + min_volume_space: + - -50 + - -50 + - -4 + seg_labelweights: + - 0 + - 55437630 + - 320797 + - 541736 + - 2578735 + - 3274484 + - 552662 + - 184064 + - 78858 + - 240942562 + - 17294618 + - 170599734 + - 6369672 + - 230413074 + - 101130274 + - 476491114 + - 9833174 + - 129609852 + - 4506626 + - 1168181 + + train_data_loader: + data_path: "./dataset/SemanticKitti/dataset/sequences/" + batch_size: 8 + shuffle: True 
+ num_workers: 8 + rotate_aug: True + flip_aug: True + scale_aug: True + transform_aug: True + dropout_aug: True + + val_data_loader: + data_path: "./dataset/SemanticKitti/dataset/sequences/" + shuffle: False + num_workers: 8 + + # normal test + batch_size: 8 + rotate_aug: False + flip_aug: False + scale_aug: False + transform_aug: False + dropout_aug: False + + +################### +## Train params +train_params: + max_num_epochs: 64 + learning_rate: 0.24 + optimizer: SGD # [SGD, Adam] + lr_scheduler: CosineAnnealingWarmRestarts # [StepLR, ReduceLROnPlateau, CosineAnnealingLR, CosineAnnealingWarmRestarts] + momentum: 0.9 + nesterov: True + weight_decay: 1.0e-4 + + lambda_seg2d: 1 + lambda_xm: 0.05 diff --git a/TwoDPASS/config/MinkowskiNet-nuscenes.yaml b/TwoDPASS/config/MinkowskiNet-nuscenes.yaml new file mode 100644 index 0000000..67950cb --- /dev/null +++ b/TwoDPASS/config/MinkowskiNet-nuscenes.yaml @@ -0,0 +1,68 @@ +# Config format schema number +format_version: 1 + + +################### +## Model options +model_params: + model_architecture: "minkowskinet" + + input_dims: 4 + voxel_size: 0.05 + cr: 1 # enlarge factor of layer_num + layer_num: + - 32 + - 32 + - 64 + - 128 + - 256 + - 256 + - 128 + - 96 + - 96 + + num_class: 17 + + +################### +## Dataset options +dataset_params: + dataset_type: "voxel_dataset" + pc_dataset_type: "nuScenes" + collate_type: "collate_fn_voxel" + ignore_label: 0 + label_mapping: "./config/label_mapping/nuscenes.yaml" + + train_data_loader: + data_path: "./dataset/nuscenes/" + batch_size: 32 + shuffle: True + num_workers: 8 + rotate_aug: True + flip_aug: True + scale_aug: True + transform_aug: True + dropout_aug: True + + val_data_loader: + data_path: "./dataset/nuscenes/" + shuffle: False + num_workers: 8 + batch_size: 32 + rotate_aug: False + flip_aug: False + scale_aug: False + transform_aug: False + dropout_aug: False + + +################### +## Train params +train_params: + max_num_epochs: 80 + learning_rate: 2.4e-1 + optimizer: SGD # [SGD, Adam] + lr_scheduler: CosineAnnealingLR # [StepLR, ReduceLROnPlateau, CosineAnnealingLR, CosineAnnealingWarmRestarts] + momentum: 0.9 + nesterov: True + weight_decay: 1.0e-4 \ No newline at end of file diff --git a/TwoDPASS/config/MinkowskiNet-semantickitti.yaml b/TwoDPASS/config/MinkowskiNet-semantickitti.yaml new file mode 100644 index 0000000..1b13dab --- /dev/null +++ b/TwoDPASS/config/MinkowskiNet-semantickitti.yaml @@ -0,0 +1,91 @@ +# Config format schema number +format_version: 2 + + +################### +## Model options +model_params: + model_architecture: "minkowskinet" + + input_dims: 4 + voxel_size: 0.05 + cr: 1 # enlarge factor of layer_num + layer_num: + - 32 + - 32 + - 64 + - 128 + - 256 + - 256 + - 128 + - 96 + - 96 + + num_class: 20 + + +################### +## Dataset options +dataset_params: + training_size: 19132 + dataset_type: "voxel_dataset" + pc_dataset_type: "SemanticKITTI" + collate_type: "collate_fn_voxel" + ignore_label: 0 + label_mapping: "./config/label_mapping/semantic-kitti.yaml" + + seg_labelweights: + - 0 + - 55437630 + - 320797 + - 541736 + - 2578735 + - 3274484 + - 552662 + - 184064 + - 78858 + - 240942562 + - 17294618 + - 170599734 + - 6369672 + - 230413074 + - 101130274 + - 476491114 + - 9833174 + - 129609852 + - 4506626 + - 1168181 + + train_data_loader: + data_path: "./dataset/SemanticKitti/dataset/sequences/" + batch_size: 8 + shuffle: True + num_workers: 8 + rotate_aug: True + flip_aug: True + scale_aug: True + transform_aug: True + dropout_aug: True + + 
val_data_loader: + data_path: "./dataset/SemanticKitti/dataset/sequences/" + shuffle: False + num_workers: 8 + batch_size: 8 + rotate_aug: False + flip_aug: False + scale_aug: False + transform_aug: False + dropout_aug: False + + +################### +## Train params +train_params: + max_num_epochs: 64 + learning_rate: 2.4e-1 + optimizer: SGD # [SGD, Adam] + lr_scheduler: CosineAnnealingWarmRestarts # [StepLR, ReduceLROnPlateau, CosineAnnealingLR, CosineAnnealingWarmRestarts] + momentum: 0.9 + nesterov: True + weight_decay: 1.0e-4 \ No newline at end of file diff --git a/TwoDPASS/config/SPVCNN-nuscenes.yaml b/TwoDPASS/config/SPVCNN-nuscenes.yaml new file mode 100644 index 0000000..5e2b622 --- /dev/null +++ b/TwoDPASS/config/SPVCNN-nuscenes.yaml @@ -0,0 +1,68 @@ +# Config format schema number +format_version: 1 + + +################### +## Model options +model_params: + model_architecture: "spvcnn" + + input_dims: 4 + voxel_size: 0.05 + cr: 1 # enlarge factor of layer_num + layer_num: + - 32 + - 32 + - 64 + - 128 + - 256 + - 256 + - 128 + - 96 + - 96 + + num_class: 17 + + +################### +## Dataset options +dataset_params: + dataset_type: "voxel_dataset" + pc_dataset_type: "nuScenes" + collate_type: "collate_fn_voxel" + ignore_label: 0 + label_mapping: "./config/label_mapping/nuscenes.yaml" + + train_data_loader: + data_path: "./dataset/nuscenes/" + batch_size: 32 + shuffle: True + num_workers: 8 + rotate_aug: True + flip_aug: True + scale_aug: True + transform_aug: True + dropout_aug: True + + val_data_loader: + data_path: "./dataset/nuscenes/" + shuffle: False + num_workers: 8 + batch_size: 32 + rotate_aug: False + flip_aug: False + scale_aug: False + transform_aug: False + dropout_aug: False + + +################### +## Train params +train_params: + max_num_epochs: 80 + learning_rate: 2.4e-1 + optimizer: SGD # [SGD, Adam] + lr_scheduler: CosineAnnealingLR # [StepLR, ReduceLROnPlateau, CosineAnnealingLR, CosineAnnealingWarmRestarts] + momentum: 0.9 + nesterov: True + weight_decay: 1.0e-4 \ No newline at end of file diff --git a/TwoDPASS/config/SPVCNN-semantickitti.yaml b/TwoDPASS/config/SPVCNN-semantickitti.yaml new file mode 100644 index 0000000..1ee54f5 --- /dev/null +++ b/TwoDPASS/config/SPVCNN-semantickitti.yaml @@ -0,0 +1,91 @@ +# Config format schema number +format_version: 2 + + +################### +## Model options +model_params: + model_architecture: "spvcnn" + + input_dims: 4 + voxel_size: 0.05 + cr: 1 # enlarge factor of layer_num + layer_num: + - 32 + - 32 + - 64 + - 128 + - 256 + - 256 + - 128 + - 96 + - 96 + + num_class: 20 + + +################### +## Dataset options +dataset_params: + training_size: 19132 + dataset_type: "voxel_dataset" + pc_dataset_type: "SemanticKITTI" + collate_type: "collate_fn_voxel" + ignore_label: 0 + label_mapping: "TwoDPASS/config/label_mapping/semantic-kitti.yaml" + + seg_labelweights: + - 0 + - 55437630 + - 320797 + - 541736 + - 2578735 + - 3274484 + - 552662 + - 184064 + - 78858 + - 240942562 + - 17294618 + - 170599734 + - 6369672 + - 230413074 + - 101130274 + - 476491114 + - 9833174 + - 129609852 + - 4506626 + - 1168181 + + train_data_loader: + data_path: "/KITTI/dataset/sequences/" + batch_size: 1 + shuffle: True + num_workers: 1 + rotate_aug: True + flip_aug: True + scale_aug: True + transform_aug: True + dropout_aug: True + + val_data_loader: + data_path: "/KITTI/dataset/sequences/" + shuffle: False + num_workers: 1 + batch_size: 1 + rotate_aug: False + flip_aug: False + scale_aug: False + transform_aug: False + dropout_aug: False + 
+ +################### +## Train params +train_params: + max_num_epochs: 64 + learning_rate: 2.4e-1 + optimizer: SGD # [SGD, Adam] + lr_scheduler: CosineAnnealingWarmRestarts # [StepLR, ReduceLROnPlateau, CosineAnnealingLR, CosineAnnealingWarmRestarts] + momentum: 0.9 + nesterov: True + weight_decay: 1.0e-4 \ No newline at end of file diff --git a/TwoDPASS/config/corruption/semantickittic.yaml b/TwoDPASS/config/corruption/semantickittic.yaml new file mode 100644 index 0000000..f329f13 --- /dev/null +++ b/TwoDPASS/config/corruption/semantickittic.yaml @@ -0,0 +1,21 @@ +corruption_name: + - clean_data + - fog_light + - fog_moderate + - fog_heavy + - snowfall_light + - snowfall_moderate + - snowfall_heavy + - local_light + - local_moderate + - local_heavy + - global_light + - global_moderate + - global_heavy + - dense_32beam + - sparse_32beam + - dense_16beam + - sparse_16beam + + + diff --git a/TwoDPASS/config/label_mapping/nuscenes.yaml b/TwoDPASS/config/label_mapping/nuscenes.yaml new file mode 100644 index 0000000..e7e4276 --- /dev/null +++ b/TwoDPASS/config/label_mapping/nuscenes.yaml @@ -0,0 +1,84 @@ +labels: + 0: 'noise' + 1: 'animal' + 2: 'human.pedestrian.adult' + 3: 'human.pedestrian.child' + 4: 'human.pedestrian.construction_worker' + 5: 'human.pedestrian.personal_mobility' + 6: 'human.pedestrian.police_officer' + 7: 'human.pedestrian.stroller' + 8: 'human.pedestrian.wheelchair' + 9: 'movable_object.barrier' + 10: 'movable_object.debris' + 11: 'movable_object.pushable_pullable' + 12: 'movable_object.trafficcone' + 13: 'static_object.bicycle_rack' + 14: 'vehicle.bicycle' + 15: 'vehicle.bus.bendy' + 16: 'vehicle.bus.rigid' + 17: 'vehicle.car' + 18: 'vehicle.construction' + 19: 'vehicle.emergency.ambulance' + 20: 'vehicle.emergency.police' + 21: 'vehicle.motorcycle' + 22: 'vehicle.trailer' + 23: 'vehicle.truck' + 24: 'flat.driveable_surface' + 25: 'flat.other' + 26: 'flat.sidewalk' + 27: 'flat.terrain' + 28: 'static.manmade' + 29: 'static.other' + 30: 'static.vegetation' + 31: 'vehicle.ego' +labels_16: + 0: 'noise' + 1: 'barrier' + 2: 'bicycle' + 3: 'bus' + 4: 'car' + 5: 'construction_vehicle' + 6: 'motorcycle' + 7: 'pedestrian' + 8: 'traffic_cone' + 9: 'trailer' + 10: 'truck' + 11: 'driveable_surface' + 12: 'other_flat' + 13: 'sidewalk' + 14: 'terrain' + 15: 'manmade' + 16: 'vegetation' +learning_map: + 1: 0 + 5: 0 + 7: 0 + 8: 0 + 10: 0 + 11: 0 + 13: 0 + 19: 0 + 20: 0 + 0: 0 + 29: 0 + 31: 0 + 9: 1 + 14: 2 + 15: 3 + 16: 3 + 17: 4 + 18: 5 + 21: 6 + 2: 7 + 3: 7 + 4: 7 + 6: 7 + 12: 8 + 22: 9 + 23: 10 + 24: 11 + 25: 12 + 26: 13 + 27: 14 + 28: 15 + 30: 16 \ No newline at end of file diff --git a/TwoDPASS/config/label_mapping/semantic-kitti.yaml b/TwoDPASS/config/label_mapping/semantic-kitti.yaml new file mode 100644 index 0000000..6281065 --- /dev/null +++ b/TwoDPASS/config/label_mapping/semantic-kitti.yaml @@ -0,0 +1,211 @@ +# This file is covered by the LICENSE file in the root of this project. 
+labels: + 0 : "unlabeled" + 1 : "outlier" + 10: "car" + 11: "bicycle" + 13: "bus" + 15: "motorcycle" + 16: "on-rails" + 18: "truck" + 20: "other-vehicle" + 30: "person" + 31: "bicyclist" + 32: "motorcyclist" + 40: "road" + 44: "parking" + 48: "sidewalk" + 49: "other-ground" + 50: "building" + 51: "fence" + 52: "other-structure" + 60: "lane-marking" + 70: "vegetation" + 71: "trunk" + 72: "terrain" + 80: "pole" + 81: "traffic-sign" + 99: "other-object" + 252: "moving-car" + 253: "moving-bicyclist" + 254: "moving-person" + 255: "moving-motorcyclist" + 256: "moving-on-rails" + 257: "moving-bus" + 258: "moving-truck" + 259: "moving-other-vehicle" +color_map: # bgr + 0 : [0, 0, 0] + 1 : [0, 0, 255] + 10: [245, 150, 100] + 11: [245, 230, 100] + 13: [250, 80, 100] + 15: [150, 60, 30] + 16: [255, 0, 0] + 18: [180, 30, 80] + 20: [255, 0, 0] + 30: [30, 30, 255] + 31: [200, 40, 255] + 32: [90, 30, 150] + 40: [255, 0, 255] + 44: [255, 150, 255] + 48: [75, 0, 75] + 49: [75, 0, 175] + 50: [0, 200, 255] + 51: [50, 120, 255] + 52: [0, 150, 255] + 60: [170, 255, 150] + 70: [0, 175, 0] + 71: [0, 60, 135] + 72: [80, 240, 150] + 80: [150, 240, 255] + 81: [0, 0, 255] + 99: [255, 255, 50] + 252: [245, 150, 100] + 256: [255, 0, 0] + 253: [200, 40, 255] + 254: [30, 30, 255] + 255: [90, 30, 150] + 257: [250, 80, 100] + 258: [180, 30, 80] + 259: [255, 0, 0] +content: # as a ratio with the total number of points + 0: 0.018889854628292943 + 1: 0.0002937197336781505 + 10: 0.040818519255974316 + 11: 0.00016609538710764618 + 13: 2.7879693665067774e-05 + 15: 0.00039838616015114444 + 16: 0.0 + 18: 0.0020633612104619787 + 20: 0.0016218197275284021 + 30: 0.00017698551338515307 + 31: 1.1065903904919655e-08 + 32: 5.532951952459828e-09 + 40: 0.1987493871255525 + 44: 0.014717169549888214 + 48: 0.14392298360372 + 49: 0.0039048553037472045 + 50: 0.1326861944777486 + 51: 0.0723592229456223 + 52: 0.002395131480328884 + 60: 4.7084144280367186e-05 + 70: 0.26681502148037506 + 71: 0.006035012012626033 + 72: 0.07814222006271769 + 80: 0.002855498193863172 + 81: 0.0006155958086189918 + 99: 0.009923127583046915 + 252: 0.001789309418528068 + 253: 0.00012709999297008662 + 254: 0.00016059776092534436 + 255: 3.745553104802113e-05 + 256: 0.0 + 257: 0.00011351574470342043 + 258: 0.00010157861367183268 + 259: 4.3840131989471124e-05 +# classes that are indistinguishable from single scan or inconsistent in +# ground truth are mapped to their closest equivalent +learning_map: + 0 : 0 # "unlabeled" + 1 : 0 # "outlier" mapped to "unlabeled" --------------------------mapped + 10: 1 # "car" + 11: 2 # "bicycle" + 13: 5 # "bus" mapped to "other-vehicle" --------------------------mapped + 15: 3 # "motorcycle" + 16: 5 # "on-rails" mapped to "other-vehicle" ---------------------mapped + 18: 4 # "truck" + 20: 5 # "other-vehicle" + 30: 6 # "person" + 31: 7 # "bicyclist" + 32: 8 # "motorcyclist" + 40: 9 # "road" + 44: 10 # "parking" + 48: 11 # "sidewalk" + 49: 12 # "other-ground" + 50: 13 # "building" + 51: 14 # "fence" + 52: 0 # "other-structure" mapped to "unlabeled" ------------------mapped + 60: 9 # "lane-marking" to "road" ---------------------------------mapped + 70: 15 # "vegetation" + 71: 16 # "trunk" + 72: 17 # "terrain" + 80: 18 # "pole" + 81: 19 # "traffic-sign" + 99: 0 # "other-object" to "unlabeled" ----------------------------mapped + 252: 1 # "moving-car" to "car" ------------------------------------mapped + 253: 7 # "moving-bicyclist" to "bicyclist" ------------------------mapped + 254: 6 # "moving-person" to "person" 
------------------------------mapped + 255: 8 # "moving-motorcyclist" to "motorcyclist" ------------------mapped + 256: 5 # "moving-on-rails" mapped to "other-vehicle" --------------mapped + 257: 5 # "moving-bus" mapped to "other-vehicle" -------------------mapped + 258: 4 # "moving-truck" to "truck" --------------------------------mapped + 259: 5 # "moving-other"-vehicle to "other-vehicle" ----------------mapped +learning_map_inv: # inverse of previous map + 0: 0 # "unlabeled", and others ignored + 1: 10 # "car" + 2: 11 # "bicycle" + 3: 15 # "motorcycle" + 4: 18 # "truck" + 5: 20 # "other-vehicle" + 6: 30 # "person" + 7: 31 # "bicyclist" + 8: 32 # "motorcyclist" + 9: 40 # "road" + 10: 44 # "parking" + 11: 48 # "sidewalk" + 12: 49 # "other-ground" + 13: 50 # "building" + 14: 51 # "fence" + 15: 70 # "vegetation" + 16: 71 # "trunk" + 17: 72 # "terrain" + 18: 80 # "pole" + 19: 81 # "traffic-sign" +learning_ignore: # Ignore classes + 0: True # "unlabeled", and others ignored + 1: False # "car" + 2: False # "bicycle" + 3: False # "motorcycle" + 4: False # "truck" + 5: False # "other-vehicle" + 6: False # "person" + 7: False # "bicyclist" + 8: False # "motorcyclist" + 9: False # "road" + 10: False # "parking" + 11: False # "sidewalk" + 12: False # "other-ground" + 13: False # "building" + 14: False # "fence" + 15: False # "vegetation" + 16: False # "trunk" + 17: False # "terrain" + 18: False # "pole" + 19: False # "traffic-sign" +split: # sequence numbers + train: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 9 + - 10 + valid: + - 8 + test: + - 11 + - 12 + - 13 + - 14 + - 15 + - 16 + - 17 + - 18 + - 19 + - 20 + - 21 diff --git a/TwoDPASS/dataloader/corruption_dataset.py b/TwoDPASS/dataloader/corruption_dataset.py new file mode 100644 index 0000000..d25928e --- /dev/null +++ b/TwoDPASS/dataloader/corruption_dataset.py @@ -0,0 +1,57 @@ +import os +import yaml +import numpy as np + +from torch.utils import data + +def absoluteFilePaths(directory, num_vote): + for dirpath, _, filenames in os.walk(directory): + filenames.sort() + for f in filenames: + for _ in range(num_vote): + yield os.path.abspath(os.path.join(dirpath, f)) + + +class SemanticKITTIC(data.Dataset): + def __init__(self, config, data_path, corruption, num_vote=1): + with open(config['dataset_params']['label_mapping'], 'r') as stream: + semkittiyaml = yaml.safe_load(stream) + + self.config = config + self.corruption = corruption + self.imageset = 'val' + self.num_vote = num_vote + self.learning_map = semkittiyaml['learning_map'] + self.im_idx = [] + self.im_idx += absoluteFilePaths('/'.join([data_path.replace('sequences', 'SemanticKITTI-C'), corruption, 'velodyne']), num_vote) + self.im_idx = sorted(self.im_idx) + + def __len__(self): + 'Denotes the total number of samples' + return len(self.im_idx) + + def __getitem__(self, index): + raw_data = np.fromfile(self.im_idx[index], dtype=np.float32).reshape((-1, 4)) + + origin_len = len(raw_data) + raw_data = raw_data[:, :4] + points = raw_data[:, :3] + + annotated_data = np.fromfile(self.im_idx[index].replace('velodyne', 'labels')[:-3] + 'label', + dtype=np.uint32).reshape((-1, 1)) + instance_label = annotated_data >> 16 + annotated_data = annotated_data & 0xFFFF # delete high 16 digits binary + annotated_data = np.vectorize(self.learning_map.__getitem__)(annotated_data) + + if self.config['dataset_params']['ignore_label'] != 0: + annotated_data -= 1 + annotated_data[annotated_data == -1] = self.config['dataset_params']['ignore_label'] + + data_dict = {} + data_dict['xyz'] = points + 
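+        # 'labels' holds remapped train IDs: the upper 16 bits of the raw label were split off as instance_label and the lower 16 bits mapped through learning_map above.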
data_dict['labels'] = annotated_data.astype(np.uint8) + data_dict['instance_label'] = instance_label + data_dict['signal'] = raw_data[:, 3:4] + data_dict['origin_len'] = origin_len + + return data_dict, self.im_idx[index] \ No newline at end of file diff --git a/TwoDPASS/dataloader/dataset.py b/TwoDPASS/dataloader/dataset.py new file mode 100644 index 0000000..7bdebb6 --- /dev/null +++ b/TwoDPASS/dataloader/dataset.py @@ -0,0 +1,855 @@ +""" +Task-specific Datasets +""" +import random +import torch +import numpy as np + +from PIL import Image +from torch.utils import data +from torchvision import transforms as T +from pyquaternion import Quaternion +from nuscenes.utils.geometry_utils import view_points + +REGISTERED_DATASET_CLASSES = {} +REGISTERED_COLATE_CLASSES = {} + +try: + from torchsparse import SparseTensor + from torchsparse.utils.collate import sparse_collate_fn + from torchsparse.utils.quantize import sparse_quantize +except: + print('please install torchsparse if you want to run spvcnn/minkowskinet!') + + +def register_dataset(cls, name=None): + global REGISTERED_DATASET_CLASSES + if name is None: + name = cls.__name__ + assert name not in REGISTERED_DATASET_CLASSES, f"exist class: {REGISTERED_DATASET_CLASSES}" + REGISTERED_DATASET_CLASSES[name] = cls + return cls + + +def register_collate_fn(cls, name=None): + global REGISTERED_COLATE_CLASSES + if name is None: + name = cls.__name__ + assert name not in REGISTERED_COLATE_CLASSES, f"exist class: {REGISTERED_COLATE_CLASSES}" + REGISTERED_COLATE_CLASSES[name] = cls + return cls + + +def get_model_class(name): + global REGISTERED_DATASET_CLASSES + assert name in REGISTERED_DATASET_CLASSES, f"available class: {REGISTERED_DATASET_CLASSES}" + return REGISTERED_DATASET_CLASSES[name] + + +def get_collate_class(name): + global REGISTERED_COLATE_CLASSES + assert name in REGISTERED_COLATE_CLASSES, f"available class: {REGISTERED_COLATE_CLASSES}" + return REGISTERED_COLATE_CLASSES[name] + + +@register_dataset +class point_image_dataset_semkitti(data.Dataset): + def __init__(self, in_dataset, config, loader_config, num_vote=1, trans_std=[0.1, 0.1, 0.1], max_dropout_ratio=0.2): + 'Initialization' + self.point_cloud_dataset = in_dataset + self.config = config + self.ignore_label = config['dataset_params']['ignore_label'] + self.rotate_aug = loader_config['rotate_aug'] + self.flip_aug = loader_config['flip_aug'] + self.transform = loader_config['transform_aug'] + self.scale_aug = loader_config['scale_aug'] + self.dropout = loader_config['dropout_aug'] + self.instance_aug = loader_config.get('instance_aug', False) + self.max_volume_space = config['dataset_params']['max_volume_space'] + self.min_volume_space = config['dataset_params']['min_volume_space'] + self.num_vote = num_vote + self.trans_std = trans_std + self.max_dropout_ratio = max_dropout_ratio + self.debug = config['debug'] + + self.bottom_crop = config['dataset_params']['bottom_crop'] + color_jitter = config['dataset_params']['color_jitter'] + self.color_jitter = T.ColorJitter(*color_jitter) if color_jitter else None + self.flip2d = config['dataset_params']['flip2d'] + self.image_normalizer = config['dataset_params']['image_normalizer'] + + def __len__(self): + 'Denotes the total number of samples' + if self.debug: + return 100 * self.num_vote + else: + return len(self.point_cloud_dataset) + + @staticmethod + def select_points_in_frustum(points_2d, x1, y1, x2, y2): + """ + Select points in a 2D frustum parametrized by x1, y1, x2, y2 in image coordinates + :param points_2d: point cloud 
projected into 2D + :param points_3d: point cloud + :param x1: left bound + :param y1: upper bound + :param x2: right bound + :param y2: lower bound + :return: points (2D and 3D) that are in the frustum + """ + keep_ind = (points_2d[:, 0] > x1) * \ + (points_2d[:, 1] > y1) * \ + (points_2d[:, 0] < x2) * \ + (points_2d[:, 1] < y2) + + return keep_ind + + def __getitem__(self, index): + 'Generates one sample of data' + data, root = self.point_cloud_dataset[index] + + xyz = data['xyz'] + labels = data['labels'] + instance_label = data['instance_label'].reshape(-1) + sig = data['signal'] + origin_len = data['origin_len'] + + ref_pc = xyz.copy() + ref_labels = labels.copy() + ref_index = np.arange(len(ref_pc)) + + mask_x = np.logical_and(xyz[:, 0] > self.min_volume_space[0], xyz[:, 0] < self.max_volume_space[0]) + mask_y = np.logical_and(xyz[:, 1] > self.min_volume_space[1], xyz[:, 1] < self.max_volume_space[1]) + mask_z = np.logical_and(xyz[:, 2] > self.min_volume_space[2], xyz[:, 2] < self.max_volume_space[2]) + mask = np.logical_and(mask_x, np.logical_and(mask_y, mask_z)) + + xyz = xyz[mask] + ref_pc = ref_pc[mask] + labels = labels[mask] + instance_label = instance_label[mask] + ref_index = ref_index[mask] + sig = sig[mask] + point_num = len(xyz) + + if self.dropout and self.point_cloud_dataset.imageset == 'train': + dropout_ratio = np.random.random() * self.max_dropout_ratio + drop_idx = np.where(np.random.random((xyz.shape[0])) <= dropout_ratio)[0] + + if len(drop_idx) > 0: + xyz[drop_idx, :] = xyz[0, :] + labels[drop_idx, :] = labels[0, :] + sig[drop_idx, :] = sig[0, :] + instance_label[drop_idx] = instance_label[0] + ref_index[drop_idx] = ref_index[0] + + # load 2D data + image = data['img'] + proj_matrix = data['proj_matrix'] + + # project points into image + keep_idx = xyz[:, 0] > 0 # only keep point in front of the vehicle + points_hcoords = np.concatenate([xyz[keep_idx], np.ones([keep_idx.sum(), 1], dtype=np.float32)], axis=1) + img_points = (proj_matrix @ points_hcoords.T).T + img_points = img_points[:, :2] / np.expand_dims(img_points[:, 2], axis=1) # scale 2D points + keep_idx_img_pts = self.select_points_in_frustum(img_points, 0, 0, *image.size) + keep_idx[keep_idx] = keep_idx_img_pts + + # fliplr so that indexing is row, col and not col, row + img_points = np.fliplr(img_points) + points_img = img_points[keep_idx_img_pts] + + ### 3D Augmentation ### + # random data augmentation by rotation + if self.rotate_aug: + rotate_rad = np.deg2rad(np.random.random() * 360) + c, s = np.cos(rotate_rad), np.sin(rotate_rad) + j = np.matrix([[c, s], [-s, c]]) + xyz[:, :2] = np.dot(xyz[:, :2], j) + + # random data augmentation by flip x , y or x+y + if self.flip_aug: + flip_type = np.random.choice(4, 1) + if flip_type == 1: + xyz[:, 0] = -xyz[:, 0] + elif flip_type == 2: + xyz[:, 1] = -xyz[:, 1] + elif flip_type == 3: + xyz[:, :2] = -xyz[:, :2] + + if self.scale_aug: + noise_scale = np.random.uniform(0.95, 1.05) + xyz[:, 0] = noise_scale * xyz[:, 0] + xyz[:, 1] = noise_scale * xyz[:, 1] + + if self.transform: + noise_translate = np.array([np.random.normal(0, self.trans_std[0], 1), + np.random.normal(0, self.trans_std[1], 1), + np.random.normal(0, self.trans_std[2], 1)]).T + + xyz[:, 0:3] += noise_translate + + img_label = labels[keep_idx] + point2img_index = np.arange(len(labels))[keep_idx] + feat = np.concatenate((xyz, sig), axis=1) + + ### 2D Augmentation ### + if self.bottom_crop: + # self.bottom_crop is a tuple (crop_width, crop_height) + left = int(np.random.rand() * (image.size[0] + 1 - 
self.bottom_crop[0])) + right = left + self.bottom_crop[0] + top = image.size[1] - self.bottom_crop[1] + bottom = image.size[1] + + # update image points + keep_idx = points_img[:, 0] >= top + keep_idx = np.logical_and(keep_idx, points_img[:, 0] < bottom) + keep_idx = np.logical_and(keep_idx, points_img[:, 1] >= left) + keep_idx = np.logical_and(keep_idx, points_img[:, 1] < right) + + # crop image + image = image.crop((left, top, right, bottom)) + points_img = points_img[keep_idx] + points_img[:, 0] -= top + points_img[:, 1] -= left + + img_label = img_label[keep_idx] + point2img_index = point2img_index[keep_idx] + + img_indices = points_img.astype(np.int64) + + # 2D augmentation + if self.color_jitter is not None: + image = self.color_jitter(image) + + # PIL to numpy + image = np.array(image, dtype=np.float32, copy=False) / 255. + + # 2D augmentation + if np.random.rand() < self.flip2d: + image = np.ascontiguousarray(np.fliplr(image)) + img_indices[:, 1] = image.shape[1] - 1 - img_indices[:, 1] + + # normalize image + if self.image_normalizer: + mean, std = self.image_normalizer + mean = np.asarray(mean, dtype=np.float32) + std = np.asarray(std, dtype=np.float32) + image = (image - mean) / std + + data_dict = {} + data_dict['point_feat'] = feat + data_dict['point_label'] = labels + data_dict['ref_xyz'] = ref_pc + data_dict['ref_label'] = ref_labels + data_dict['ref_index'] = ref_index + data_dict['mask'] = mask + data_dict['point_num'] = point_num + data_dict['origin_len'] = origin_len + data_dict['root'] = root + + data_dict['img'] = image + data_dict['img_indices'] = img_indices + data_dict['img_label'] = img_label + data_dict['point2img_index'] = point2img_index + + return data_dict + + +@register_dataset +class point_image_dataset_mix_semkitti(data.Dataset): + def __init__(self, in_dataset, config, loader_config, num_vote=1, trans_std=[0.1, 0.1, 0.1], max_dropout_ratio=0.2): + 'Initialization' + self.point_cloud_dataset = in_dataset + self.config = config + self.ignore_label = config['dataset_params']['ignore_label'] + self.rotate_aug = loader_config['rotate_aug'] + self.flip_aug = loader_config['flip_aug'] + self.transform = loader_config['transform_aug'] + self.scale_aug = loader_config['scale_aug'] + self.dropout = loader_config['dropout_aug'] + self.instance_aug = loader_config.get('instance_aug', False) + self.max_volume_space = config['dataset_params']['max_volume_space'] + self.min_volume_space = config['dataset_params']['min_volume_space'] + self.num_vote = num_vote + self.trans_std = trans_std + self.max_dropout_ratio = max_dropout_ratio + self.debug = config['debug'] + + self.bottom_crop = config['dataset_params']['bottom_crop'] + color_jitter = config['dataset_params']['color_jitter'] + self.color_jitter = T.ColorJitter(*color_jitter) if color_jitter else None + self.flip2d = config['dataset_params']['flip2d'] + self.image_normalizer = config['dataset_params']['image_normalizer'] + + def __len__(self): + 'Denotes the total number of samples' + if self.debug: + return 100 * self.num_vote + else: + return len(self.point_cloud_dataset) + + @staticmethod + def select_points_in_frustum(points_2d, x1, y1, x2, y2): + """ + Select points in a 2D frustum parametrized by x1, y1, x2, y2 in image coordinates + :param points_2d: point cloud projected into 2D + :param points_3d: point cloud + :param x1: left bound + :param y1: upper bound + :param x2: right bound + :param y2: lower bound + :return: points (2D and 3D) that are in the frustum + """ + keep_ind = (points_2d[:, 0] > x1) * \ + 
(points_2d[:, 1] > y1) * \ + (points_2d[:, 0] < x2) * \ + (points_2d[:, 1] < y2) + + return keep_ind + + def get_augment_scene(self, index, cut_scene=False): + 'Generates one sample of data' + data, root = self.point_cloud_dataset[index] + + xyz = data['xyz'] + labels = data['labels'] + instance_label = data['instance_label'].reshape(-1) + sig = data['signal'] + origin_len = data['origin_len'] + + ref_pc = xyz.copy() + ref_labels = labels.copy() + ref_index = np.arange(len(ref_pc)) + + mask_x = np.logical_and(xyz[:, 0] > self.min_volume_space[0], xyz[:, 0] < self.max_volume_space[0]) + mask_y = np.logical_and(xyz[:, 1] > self.min_volume_space[1], xyz[:, 1] < self.max_volume_space[1]) + mask_z = np.logical_and(xyz[:, 2] > self.min_volume_space[2], xyz[:, 2] < self.max_volume_space[2]) + mask = np.logical_and(mask_x, np.logical_and(mask_y, mask_z)) + + if cut_scene: + mask *= instance_label != 0 + + xyz = xyz[mask] + ref_pc = ref_pc[mask] + labels = labels[mask] + instance_label = instance_label[mask] + ref_index = ref_index[mask] + sig = sig[mask] + point_num = len(xyz) + + if self.dropout and self.point_cloud_dataset.imageset == 'train': + dropout_ratio = np.random.random() * self.max_dropout_ratio + drop_idx = np.where(np.random.random((xyz.shape[0])) <= dropout_ratio)[0] + + if len(drop_idx) > 0: + xyz[drop_idx, :] = xyz[0, :] + labels[drop_idx, :] = labels[0, :] + sig[drop_idx, :] = sig[0, :] + instance_label[drop_idx] = instance_label[0] + ref_index[drop_idx] = ref_index[0] + + # load 2D data + image = data['img'] + proj_matrix = data['proj_matrix'] + + # project points into image + keep_idx = xyz[:, 0] > 0 # only keep point in front of the vehicle + points_hcoords = np.concatenate([xyz[keep_idx], np.ones([keep_idx.sum(), 1], dtype=np.float32)], axis=1) + img_points = (proj_matrix @ points_hcoords.T).T + img_points = img_points[:, :2] / np.expand_dims(img_points[:, 2], axis=1) # scale 2D points + keep_idx_img_pts = self.select_points_in_frustum(img_points, 0, 0, *image.size) + keep_idx[keep_idx] = keep_idx_img_pts + + # fliplr so that indexing is row, col and not col, row + img_points = np.fliplr(img_points) + points_img = img_points[keep_idx_img_pts] + + ### 3D Augmentation ### + # random data augmentation by rotation + if self.rotate_aug: + rotate_rad = np.deg2rad(np.random.random() * 360) + c, s = np.cos(rotate_rad), np.sin(rotate_rad) + j = np.matrix([[c, s], [-s, c]]) + xyz[:, :2] = np.dot(xyz[:, :2], j) + + # random data augmentation by flip x , y or x+y + if self.flip_aug: + flip_type = np.random.choice(4, 1) + if flip_type == 1: + xyz[:, 0] = -xyz[:, 0] + elif flip_type == 2: + xyz[:, 1] = -xyz[:, 1] + elif flip_type == 3: + xyz[:, :2] = -xyz[:, :2] + + if self.scale_aug: + noise_scale = np.random.uniform(0.95, 1.05) + xyz[:, 0] = noise_scale * xyz[:, 0] + xyz[:, 1] = noise_scale * xyz[:, 1] + + if self.transform: + noise_translate = np.array([np.random.normal(0, self.trans_std[0], 1), + np.random.normal(0, self.trans_std[1], 1), + np.random.normal(0, self.trans_std[2], 1)]).T + + xyz[:, 0:3] += noise_translate + + img_label = labels[keep_idx] + point2img_index = np.arange(len(labels))[keep_idx] + feat = np.concatenate((xyz, sig), axis=1) + + ### 2D Augmentation ### + if self.bottom_crop: + # self.bottom_crop is a tuple (crop_width, crop_height) + left = int(np.random.rand() * (image.size[0] + 1 - self.bottom_crop[0])) + right = left + self.bottom_crop[0] + top = image.size[1] - self.bottom_crop[1] + bottom = image.size[1] + + # update image points + keep_idx = points_img[:, 0] 
>= top + keep_idx = np.logical_and(keep_idx, points_img[:, 0] < bottom) + keep_idx = np.logical_and(keep_idx, points_img[:, 1] >= left) + keep_idx = np.logical_and(keep_idx, points_img[:, 1] < right) + + # crop image + image = image.crop((left, top, right, bottom)) + points_img = points_img[keep_idx] + points_img[:, 0] -= top + points_img[:, 1] -= left + + img_label = img_label[keep_idx] + point2img_index = point2img_index[keep_idx] + + img_indices = points_img.astype(np.int64) + + # 2D augmentation + if self.color_jitter is not None: + image = self.color_jitter(image) + + # PIL to numpy + image = np.array(image, dtype=np.float32, copy=False) / 255. + + # 2D augmentation + if np.random.rand() < self.flip2d: + image = np.ascontiguousarray(np.fliplr(image)) + img_indices[:, 1] = image.shape[1] - 1 - img_indices[:, 1] + + # normalize image + if self.image_normalizer: + mean, std = self.image_normalizer + mean = np.asarray(mean, dtype=np.float32) + std = np.asarray(std, dtype=np.float32) + image = (image - mean) / std + + data_dict = {} + data_dict['point_feat'] = feat + data_dict['point_label'] = labels + data_dict['ref_xyz'] = ref_pc + data_dict['ref_label'] = ref_labels + data_dict['ref_index'] = ref_index + data_dict['mask'] = mask + data_dict['point_num'] = point_num + data_dict['origin_len'] = origin_len + data_dict['root'] = root + + data_dict['img'] = image + data_dict['img_indices'] = img_indices + data_dict['img_label'] = img_label + data_dict['point2img_index'] = point2img_index + + return data_dict + + def __getitem__(self, index): + data_dict = self.get_augment_scene(index) + + if self.point_cloud_dataset.imageset == 'train': + cut_index = random.randint(0, self.__len__() - 1) + + while cut_index == index: + cut_index = random.randint(0, self.__len__() - 1) + + cut_dict = self.get_augment_scene(cut_index, cut_scene=True) + cutmix_data_dict = {} + for keys in data_dict.keys(): + if keys == 'point_num' or keys == 'origin_len': + cutmix_data_dict[keys] = data_dict[keys] + cut_dict[keys] + elif keys == 'ref_index': + cut_dict[keys] = cut_dict[keys] + data_dict['origin_len'] + cutmix_data_dict[keys] = np.append(data_dict[keys], cut_dict[keys]) + elif keys == 'mask': + cutmix_data_dict[keys] = np.append(data_dict[keys], cut_dict[keys]) + elif keys not in ['img', 'img_indices', 'img_label', 'point2img_index']: + cutmix_data_dict[keys] = np.vstack((data_dict[keys], cut_dict[keys])) + else: + cutmix_data_dict[keys] = data_dict[keys] + + else: + cutmix_data_dict = data_dict + + return cutmix_data_dict + + +@register_dataset +class point_image_dataset_nus(data.Dataset): + def __init__(self, in_dataset, config, loader_config, num_vote=1, trans_std=[0.1, 0.1, 0.1], max_dropout_ratio=0.2): + 'Initialization' + self.point_cloud_dataset = in_dataset + self.config = config + self.ignore_label = config['dataset_params']['ignore_label'] + self.rotate_aug = loader_config['rotate_aug'] + self.flip_aug = loader_config['flip_aug'] + self.transform = loader_config['transform_aug'] + self.scale_aug = loader_config['scale_aug'] + self.dropout = loader_config['dropout_aug'] + self.instance_aug = loader_config.get('instance_aug', False) + self.max_volume_space = config['dataset_params']['max_volume_space'] + self.min_volume_space = config['dataset_params']['min_volume_space'] + self.num_vote = num_vote + self.trans_std = trans_std + self.max_dropout_ratio = max_dropout_ratio + self.debug = config['debug'] + + self.resize = config['dataset_params'].get('resize', False) + color_jitter = 
config['dataset_params']['color_jitter'] + self.color_jitter = T.ColorJitter(*color_jitter) if color_jitter else None + self.flip2d = config['dataset_params']['flip2d'] + self.image_normalizer = config['dataset_params'].get('image_normalizer', False) + + def map_pointcloud_to_image(self, pc, im_shape, info): + """ + Maps the lidar point cloud to the image. + :param pc: (3, N) + :param im_shape: image to check size and debug + :param info: dict with calibration infos + :param im: image, only for visualization + :return: + """ + pc = pc.copy().T + + # Points live in the point sensor frame. So they need to be transformed via global to the image plane. + # First step: transform the point-cloud to the ego vehicle frame for the timestamp of the sweep. + pc = Quaternion(info['lidar2ego_rotation']).rotation_matrix @ pc + pc = pc + np.array(info['lidar2ego_translation'])[:, np.newaxis] + + # Second step: transform to the global frame. + pc = Quaternion(info['ego2global_rotation_lidar']).rotation_matrix @ pc + pc = pc + np.array(info['ego2global_translation_lidar'])[:, np.newaxis] + + # Third step: transform into the ego vehicle frame for the timestamp of the image. + pc = pc - np.array(info['ego2global_translation_cam'])[:, np.newaxis] + pc = Quaternion(info['ego2global_rotation_cam']).rotation_matrix.T @ pc + + # Fourth step: transform into the camera. + pc = pc - np.array(info['cam2ego_translation'])[:, np.newaxis] + pc = Quaternion(info['cam2ego_rotation']).rotation_matrix.T @ pc + + # Fifth step: actually take a "picture" of the point cloud. + # Grab the depths (camera frame z axis points away from the camera). + depths = pc[2, :] + + # Take the actual picture (matrix multiplication with camera-matrix + renormalization). + points = view_points(pc, np.array(info['cam_intrinsic']), normalize=True) + + # Cast to float32 to prevent later rounding errors + points = points.astype(np.float32) + + # Remove points that are either outside or behind the camera. 
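+ # The five numbered steps above are plain rigid-body transforms: going "up" a frame
+ # is R @ p + t, going back "down" is R.T @ (p - t). They could therefore be collapsed
+ # into a single lidar->camera extrinsic before projection. Minimal sketch of that
+ # equivalence (illustrative only; T(q, t) is an assumed helper building a 4x4
+ # homogeneous matrix from a quaternion q and translation t):
+ #   T_cam_lidar = inv(T(cam2ego)) @ inv(T(ego2global_cam)) @ T(ego2global_lidar) @ T(lidar2ego)
+ #   pc_cam = (T_cam_lidar @ pc_homogeneous)[:3]
+ # view_points then applies cam_intrinsic and, with normalize=True, divides by depth (z)
+ # to get pixel coordinates, which the mask below checks against the image bounds.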
+ mask = np.ones(depths.shape[0], dtype=bool) + mask = np.logical_and(mask, depths > 0) + mask = np.logical_and(mask, points[0, :] > 0) + mask = np.logical_and(mask, points[0, :] < im_shape[1]) + mask = np.logical_and(mask, points[1, :] > 0) + mask = np.logical_and(mask, points[1, :] < im_shape[0]) + + return mask, pc.T, points.T[:, :2] + + def __len__(self): + 'Denotes the total number of samples' + if self.debug: + return 100 * self.num_vote + else: + return len(self.point_cloud_dataset) + + def __getitem__(self, index): + 'Generates one sample of data' + data, root = self.point_cloud_dataset[index] + + xyz = data['xyz'] + labels = data['labels'] + sig = data['signal'] + origin_len = data['origin_len'] + + # load 2D data + image = data['img'] + calib_infos = data['calib_infos'] + + ref_pc = xyz.copy() + ref_labels = labels.copy() + ref_index = np.arange(len(ref_pc)) + + mask_x = np.logical_and(xyz[:, 0] > self.min_volume_space[0], xyz[:, 0] < self.max_volume_space[0]) + mask_y = np.logical_and(xyz[:, 1] > self.min_volume_space[1], xyz[:, 1] < self.max_volume_space[1]) + mask_z = np.logical_and(xyz[:, 2] > self.min_volume_space[2], xyz[:, 2] < self.max_volume_space[2]) + mask = np.logical_and(mask_x, np.logical_and(mask_y, mask_z)) + + xyz = xyz[mask] + ref_pc = ref_pc[mask] + labels = labels[mask] + ref_index = ref_index[mask] + sig = sig[mask] + point_num = len(xyz) + + # dropout points + if self.dropout and self.point_cloud_dataset.imageset == 'train': + dropout_ratio = np.random.random() * self.max_dropout_ratio + drop_idx = np.where(np.random.random((xyz.shape[0])) <= dropout_ratio)[0] + + if len(drop_idx) > 0: + xyz[drop_idx, :] = xyz[0, :] + labels[drop_idx, :] = labels[0, :] + sig[drop_idx, :] = sig[0, :] + ref_index[drop_idx] = ref_index[0] + + keep_idx, _, points_img = self.map_pointcloud_to_image( + xyz, (image.size[1], image.size[0]), calib_infos) + points_img = np.ascontiguousarray(np.fliplr(points_img)) + + # random data augmentation by rotation + if self.rotate_aug: + rotate_rad = np.deg2rad(np.random.random() * 360) + c, s = np.cos(rotate_rad), np.sin(rotate_rad) + j = np.matrix([[c, s], [-s, c]]) + xyz[:, :2] = np.dot(xyz[:, :2], j) + + # random data augmentation by flip x , y or x+y + if self.flip_aug: + flip_type = np.random.choice(4, 1) + if flip_type == 1: + xyz[:, 0] = -xyz[:, 0] + elif flip_type == 2: + xyz[:, 1] = -xyz[:, 1] + elif flip_type == 3: + xyz[:, :2] = -xyz[:, :2] + + if self.scale_aug: + noise_scale = np.random.uniform(0.95, 1.05) + xyz[:, 0] = noise_scale * xyz[:, 0] + xyz[:, 1] = noise_scale * xyz[:, 1] + + if self.transform: + noise_translate = np.array([np.random.normal(0, self.trans_std[0], 1), + np.random.normal(0, self.trans_std[1], 1), + np.random.normal(0, self.trans_std[2], 1)]).T + + xyz[:, 0:3] += noise_translate + + points_img = points_img[keep_idx] + img_label = labels[keep_idx] + point2img_index = np.arange(len(keep_idx))[keep_idx] + feat = np.concatenate((xyz, sig), axis=1) + + ### 2D Augmentation ### + if self.resize: + assert image.size[0] > self.resize[0] + + # scale image points + points_img[:, 0] = float(self.resize[1]) / image.size[1] * np.floor(points_img[:, 0]) + points_img[:, 1] = float(self.resize[0]) / image.size[0] * np.floor(points_img[:, 1]) + + # resize image + image = image.resize(self.resize, Image.BILINEAR) + + img_indices = points_img.astype(np.int64) + + # 2D augmentation + if self.color_jitter is not None: + image = self.color_jitter(image) + + image = np.array(image, dtype=np.float32, copy=False) / 255. 
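+ # The left-right flip below must keep the 2D-3D correspondence intact: img_indices
+ # stores (row, col), and np.fliplr only reverses the width axis, so only the column
+ # needs mirroring, col -> (W - 1) - col. Tiny worked example with an assumed width
+ # W = 4: columns [0, 1, 2, 3] become [3, 2, 1, 0]; rows are unchanged.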
+ + # 2D augmentation + if np.random.rand() < self.flip2d: + image = np.ascontiguousarray(np.fliplr(image)) + img_indices[:, 1] = image.shape[1] - 1 - img_indices[:, 1] + + # normalize image + if self.image_normalizer: + mean, std = self.image_normalizer + mean = np.asarray(mean, dtype=np.float32) + std = np.asarray(std, dtype=np.float32) + image = (image - mean) / std + + data_dict = {} + data_dict['point_feat'] = feat + data_dict['point_label'] = labels + data_dict['ref_xyz'] = ref_pc + data_dict['ref_label'] = ref_labels + data_dict['ref_index'] = ref_index + data_dict['mask'] = mask + data_dict['point_num'] = point_num + data_dict['origin_len'] = origin_len + data_dict['root'] = root + + data_dict['img'] = image + data_dict['img_indices'] = img_indices + data_dict['img_label'] = img_label + data_dict['point2img_index'] = point2img_index + + return data_dict + + +@register_dataset +class voxel_dataset(data.Dataset): + def __init__(self, in_dataset, config, loader_config, num_vote=1, trans_std=[0.1, 0.1, 0.1], max_dropout_ratio=0.2): + 'Initialization' + self.point_cloud_dataset = in_dataset + self.config = config + self.ignore_label = config['dataset_params']['ignore_label'] + self.rotate_aug = loader_config['rotate_aug'] + self.flip_aug = loader_config['flip_aug'] + self.transform = loader_config['transform_aug'] + self.scale_aug = loader_config['scale_aug'] + self.dropout = loader_config['dropout_aug'] + self.voxel_size = config['model_params']['voxel_size'] + self.num_vote = num_vote + self.trans_std = trans_std + self.max_dropout_ratio = max_dropout_ratio + self.debug = config['debug'] + + def __len__(self): + 'Denotes the total number of samples' + if self.debug: + return 100 * self.num_vote + else: + return len(self.point_cloud_dataset) + + + def __getitem__(self, index): + 'Generates one sample of data' + data, root = self.point_cloud_dataset[index] + + xyz = data['xyz'] + labels = data['labels'] + sig = data['signal'] + origin_len = data['origin_len'] + + # random data augmentation by rotation + if self.rotate_aug: + rotate_rad = np.deg2rad(np.random.random() * 360) + c, s = np.cos(rotate_rad), np.sin(rotate_rad) + j = np.matrix([[c, s], [-s, c]]) + xyz[:, :2] = np.dot(xyz[:, :2], j) + + # random data augmentation by flip x , y or x+y + if self.flip_aug: + flip_type = np.random.choice(4, 1) + if flip_type == 1: + xyz[:, 0] = -xyz[:, 0] + elif flip_type == 2: + xyz[:, 1] = -xyz[:, 1] + elif flip_type == 3: + xyz[:, :2] = -xyz[:, :2] + + if self.scale_aug: + noise_scale = np.random.uniform(0.95, 1.05) + xyz[:, 0] = noise_scale * xyz[:, 0] + xyz[:, 1] = noise_scale * xyz[:, 1] + + if self.transform: + noise_translate = np.array([np.random.normal(0, self.trans_std[0], 1), + np.random.normal(0, self.trans_std[1], 1), + np.random.normal(0, self.trans_std[2], 1)]).T + + xyz[:, 0:3] += noise_translate + + if self.dropout and self.point_cloud_dataset.imageset == 'train': + dropout_ratio = np.random.random() * self.max_dropout_ratio + drop_idx = np.where(np.random.random((xyz.shape[0])) <= dropout_ratio)[0] + + if len(drop_idx) > 0: + xyz[drop_idx, :] = xyz[0, :] + labels[drop_idx, :] = labels[0, :] + sig[drop_idx, :] = sig[0, :] + + ref_pc = xyz.copy() + ref_labels = labels.copy() + ref_index = np.arange(len(ref_pc)) + pc_ = np.round(xyz / self.voxel_size) + pc_ = pc_ - pc_.min(0, keepdims=1) + feat_ = np.concatenate((xyz, sig), axis=1) + + _, inds, inverse_map = sparse_quantize(pc_, 1, return_index=True, return_inverse=True) # remove duplicate points within voxel + + pc = pc_[inds] + 
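+ # Voxel hashing sketch (illustrative numbers, not from any dataset): with
+ # voxel_size = 0.05, x-coordinates [0.01, 0.02, 0.30] round to voxel indices
+ # [0, 0, 6], so the first two points share a voxel. sparse_quantize keeps one
+ # representative index per occupied voxel (inds, here of length 2) and records, for
+ # every original point, which voxel it fell into (inverse_map, here something like
+ # [0, 0, 1]). pc_[inds] is thus the deduplicated voxel grid the SparseTensors are
+ # built on, while inverse_map lets per-voxel outputs be scattered back to points.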
feat = feat_[inds] + labels = labels[inds] + num_voxel = len(inds) + points = SparseTensor(ref_pc, pc_) + ref_index = SparseTensor(ref_index, pc_) + map = SparseTensor(inds, pc) + lidar = SparseTensor(feat, pc) + labels = SparseTensor(labels, pc) + labels_mapped = SparseTensor(ref_labels, pc_) + inverse_map = SparseTensor(inverse_map, pc_) + + data_dict = {} + data_dict['lidar'] = lidar + data_dict['points'] = points + data_dict['targets'] = labels + data_dict['targets_mapped'] = labels_mapped + data_dict['ref_index'] = ref_index + data_dict['origin_len'] = origin_len + data_dict['root'] = root + data_dict['map'] = map + data_dict['num_voxel'] = num_voxel + data_dict['inverse_map'] = inverse_map + # added for latentbki + data_dict['global_pose'] = data['global_pose'] + data_dict['scene_id'] = data['scene_id'] + data_dict['frame_id'] = int(data['frame_id']) + + return data_dict + + +@register_collate_fn +def collate_fn_default(data): + point_num = [d['point_num'] for d in data] + batch_size = len(point_num) + ref_labels = data[0]['ref_label'] + origin_len = data[0]['origin_len'] + ref_indices = [torch.from_numpy(d['ref_index']) for d in data] + point2img_index = [torch.from_numpy(d['point2img_index']).long() for d in data] + path = [d['root'] for d in data] + + img = [torch.from_numpy(d['img']) for d in data] + img_indices = [d['img_indices'] for d in data] + img_label = [torch.from_numpy(d['img_label']) for d in data] + + b_idx = [] + for i in range(batch_size): + b_idx.append(torch.ones(point_num[i]) * i) + points = [torch.from_numpy(d['point_feat']) for d in data] + ref_xyz = [torch.from_numpy(d['ref_xyz']) for d in data] + labels = [torch.from_numpy(d['point_label']) for d in data] + + return { + 'points': torch.cat(points).float(), + 'ref_xyz': torch.cat(ref_xyz).float(), + 'batch_idx': torch.cat(b_idx).long(), + 'batch_size': batch_size, + 'labels': torch.cat(labels).long().squeeze(1), + 'raw_labels': torch.from_numpy(ref_labels).long(), + 'origin_len': origin_len, + 'indices': torch.cat(ref_indices).long(), + 'point2img_index': point2img_index, + 'img': torch.stack(img, 0).permute(0, 3, 1, 2), + 'img_indices': img_indices, + 'img_label': torch.cat(img_label, 0).squeeze(1).long(), + 'path': path, + } + + +@register_collate_fn +def collate_fn_voxel(inputs): + return sparse_collate_fn(inputs) \ No newline at end of file diff --git a/TwoDPASS/dataloader/pc_dataset.py b/TwoDPASS/dataloader/pc_dataset.py new file mode 100644 index 0000000..5b2f5eb --- /dev/null +++ b/TwoDPASS/dataloader/pc_dataset.py @@ -0,0 +1,334 @@ +import os +import yaml +import numpy as np + +from PIL import Image +from torch.utils import data +from pathlib import Path +from nuscenes.utils import splits + +REGISTERED_PC_DATASET_CLASSES = {} + + +def register_dataset(cls, name=None): + global REGISTERED_PC_DATASET_CLASSES + if name is None: + name = cls.__name__ + assert name not in REGISTERED_PC_DATASET_CLASSES, f"exist class: {REGISTERED_PC_DATASET_CLASSES}" + REGISTERED_PC_DATASET_CLASSES[name] = cls + return cls + + +def get_pc_model_class(name): + global REGISTERED_PC_DATASET_CLASSES + assert name in REGISTERED_PC_DATASET_CLASSES, f"available class: {REGISTERED_PC_DATASET_CLASSES}" + return REGISTERED_PC_DATASET_CLASSES[name] + + +def absoluteFilePaths(directory, num_vote): + for dirpath, _, filenames in os.walk(directory): + filenames.sort() + for f in filenames: + for _ in range(num_vote): + yield os.path.abspath(os.path.join(dirpath, f)) + + +@register_dataset +class SemanticKITTI(data.Dataset): + def 
__init__(self, config, data_path, imageset='train', num_vote=1): + with open(config['dataset_params']['label_mapping'], 'r') as stream: + semkittiyaml = yaml.safe_load(stream) + + self.config = config + self.num_vote = num_vote + self.learning_map = semkittiyaml['learning_map'] + self.imageset = imageset + + if imageset == 'train': + split = semkittiyaml['split']['train'] + if config['train_params'].get('trainval', False): + split += semkittiyaml['split']['valid'] + elif imageset == 'val': + split = semkittiyaml['split']['valid'] + elif imageset == 'test': + split = semkittiyaml['split']['test'] + else: + raise Exception('Split must be train/val/test') + + self.im_idx = [] + self.proj_matrix = {} + self.poses = np.empty((0,12)) + self.Tr = np.empty((0,16)) + self.scene_id = [] + self.frame_id = [] + + for i_folder in split: + self.im_idx += absoluteFilePaths('/'.join([data_path, str(i_folder).zfill(2), 'velodyne']), num_vote) + calib_path = os.path.join(data_path, str(i_folder).zfill(2), "calib.txt") + calib = self.read_calib(calib_path) + proj_matrix = np.matmul(calib["P2"], calib["Tr"]) + self.proj_matrix[i_folder] = proj_matrix + + # load pose assume num_vote = 1 + pose = np.loadtxt(os.path.join(data_path, str(i_folder).zfill(2), 'poses.txt')) + self.poses = np.vstack((self.poses, pose)) + Tr = calib["Tr"].reshape(-1,) + Tr = np.repeat(np.expand_dims(Tr, axis=1).T,pose.shape[0],axis=0) + self.Tr = np.vstack((self.Tr, Tr)) + self.scene_id = [str(i_folder).zfill(2)] * pose.shape[0] + frames = [str(i).zfill(6) for i in range(pose.shape[0])] + self.frame_id.extend(frames) + + + seg_num_per_class = config['dataset_params']['seg_labelweights'] + seg_labelweights = seg_num_per_class / np.sum(seg_num_per_class) + self.seg_labelweights = np.power(np.amax(seg_labelweights) / seg_labelweights, 1 / 3.0) + + def get_pose(self, frame_id): + pose = np.zeros((4, 4)) + pose[3, 3] = 1 + pose[:3, :4] = self.poses[frame_id,:].reshape(3, 4) + + # Tr = np.zeros((4, 4)) + # Tr[3, 3] = 1 + # Tr[:3, :4] = self.Tr[frame_id,:].reshape(3,4) + Tr = self.Tr[frame_id,:].reshape(4,4) + + Tr = Tr.astype(np.float32) + pose = pose.astype(np.float32) + global_pose = np.matmul(np.linalg.inv(Tr), np.matmul(pose, Tr)) + + return global_pose + + def __len__(self): + 'Denotes the total number of samples' + return len(self.im_idx) + + @staticmethod + def read_calib(calib_path): + """ + :param calib_path: Path to a calibration text file. + :return: dict with calibration matrices. 
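+         Keys in the returned dict (as reshaped below): 'P2' is the 3x4 projection
+         matrix of the left camera, 'Tr' is a 4x4 homogeneous lidar-to-camera
+         extrinsic (the 3x4 calibration entry padded with a final [0, 0, 0, 1] row).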
+ """ + calib_all = {} + with open(calib_path, 'r') as f: + for line in f.readlines(): + if line == '\n': + break + key, value = line.split(':', 1) + calib_all[key] = np.array([float(x) for x in value.split()]) + + # reshape matrices + calib_out = {} + calib_out['P2'] = calib_all['P2'].reshape(3, 4) # 3x4 projection matrix for left camera + calib_out['Tr'] = np.identity(4) # 4x4 matrix + calib_out['Tr'][:3, :4] = calib_all['Tr'].reshape(3, 4) + + return calib_out + + def __getitem__(self, index): + raw_data = np.fromfile(self.im_idx[index], dtype=np.float32).reshape((-1, 4)) + origin_len = len(raw_data) + points = raw_data[:, :3] + + if self.imageset == 'test': + annotated_data = np.expand_dims(np.zeros_like(raw_data[:, 0], dtype=int), axis=1) + instance_label = np.expand_dims(np.zeros_like(raw_data[:, 0], dtype=int), axis=1) + else: + annotated_data = np.fromfile(self.im_idx[index].replace('velodyne', 'labels')[:-3] + 'label', + dtype=np.uint32).reshape((-1, 1)) + instance_label = annotated_data >> 16 + annotated_data = annotated_data & 0xFFFF # delete high 16 digits binary + annotated_data = np.vectorize(self.learning_map.__getitem__)(annotated_data) + + if self.config['dataset_params']['ignore_label'] != 0: + annotated_data -= 1 + annotated_data[annotated_data == -1] = self.config['dataset_params']['ignore_label'] + + # image_file = self.im_idx[index].replace('velodyne', 'image_2').replace('.bin', '.png') + # image = Image.open(image_file) + proj_matrix = self.proj_matrix[int(self.im_idx[index][-22:-20])] + + data_dict = {} + data_dict['xyz'] = points + data_dict['labels'] = annotated_data.astype(np.uint8) + data_dict['instance_label'] = instance_label + data_dict['signal'] = raw_data[:, 3:4] + data_dict['origin_len'] = origin_len + # data_dict['img'] = image + data_dict['proj_matrix'] = proj_matrix + + global_pose = self.get_pose(index) + data_dict['global_pose'] = global_pose + data_dict['scene_id'] = self.scene_id[index] + data_dict['frame_id'] = self.frame_id[index] + + + return data_dict, self.im_idx[index] + + +@register_dataset +class nuScenes(data.Dataset): + def __init__(self, config, data_path, imageset='train', num_vote=1): + if config.debug: + version = 'v1.0-mini' + scenes = splits.mini_train + else: + if imageset != 'test': + version = 'v1.0-trainval' + if imageset == 'train': + scenes = splits.train + else: + scenes = splits.val + else: + version = 'v1.0-test' + scenes = splits.test + + self.split = imageset + with open(config['dataset_params']['label_mapping'], 'r') as stream: + nuscenesyaml = yaml.safe_load(stream) + self.learning_map = nuscenesyaml['learning_map'] + + self.num_vote = num_vote + self.data_path = data_path + self.imageset = imageset + self.img_view = ['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 'CAM_BACK', 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + + from nuscenes import NuScenes + self.nusc = NuScenes(version=version, dataroot=data_path, verbose=True) + + self.get_available_scenes() + available_scene_names = [s['name'] for s in self.available_scenes] + scenes = list(filter(lambda x: x in available_scene_names, scenes)) + scenes = set([self.available_scenes[available_scene_names.index(s)]['token'] for s in scenes]) + self.get_path_infos_cam_lidar(scenes) + + print('Total %d scenes in the %s split' % (len(self.token_list), imageset)) + + def __len__(self): + 'Denotes the total number of samples' + return len(self.token_list) + + def loadDataByIndex(self, index): + lidar_sample_token = self.token_list[index]['lidar_token'] + lidar_path = 
os.path.join(self.data_path, + self.nusc.get('sample_data', lidar_sample_token)['filename']) + raw_data = np.fromfile(lidar_path, dtype=np.float32).reshape((-1, 5)) + + if self.split == 'test': + self.lidarseg_path = None + annotated_data = np.expand_dims( + np.zeros_like(raw_data[:, 0], dtype=int), axis=1) + else: + lidarseg_path = os.path.join(self.data_path, + self.nusc.get('lidarseg', lidar_sample_token)['filename']) + annotated_data = np.fromfile( + lidarseg_path, dtype=np.uint8).reshape((-1, 1)) # label + + pointcloud = raw_data[:, :4] + sem_label = annotated_data + inst_label = np.zeros(pointcloud.shape[0], dtype=np.int32) + return pointcloud, sem_label, inst_label, lidar_sample_token + + def labelMapping(self, sem_label): + sem_label = np.vectorize(self.map_name_from_general_index_to_segmentation_index.__getitem__)( + sem_label) # n, 1 + assert sem_label.shape[-1] == 1 + sem_label = sem_label[:, 0] + return sem_label + + def loadImage(self, index, image_id): + cam_sample_token = self.token_list[index]['cam_token'][image_id] + cam = self.nusc.get('sample_data', cam_sample_token) + image = Image.open(os.path.join(self.nusc.dataroot, cam['filename'])) + return image, cam_sample_token + + def get_available_scenes(self): + # only for check if all the files are available + self.available_scenes = [] + for scene in self.nusc.scene: + scene_token = scene['token'] + scene_rec = self.nusc.get('scene', scene_token) + sample_rec = self.nusc.get('sample', scene_rec['first_sample_token']) + sd_rec = self.nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + has_more_frames = True + scene_not_exist = False + while has_more_frames: + lidar_path, _, _ = self.nusc.get_sample_data(sd_rec['token']) + if not Path(lidar_path).exists(): + scene_not_exist = True + break + else: + break + + if scene_not_exist: + continue + self.available_scenes.append(scene) + + def get_path_infos_cam_lidar(self, scenes): + self.token_list = [] + + for sample in self.nusc.sample: + scene_token = sample['scene_token'] + lidar_token = sample['data']['LIDAR_TOP'] # 360 lidar + + if scene_token in scenes: + for _ in range(self.num_vote): + cam_token = [] + for i in self.img_view: + cam_token.append(sample['data'][i]) + self.token_list.append( + {'lidar_token': lidar_token, + 'cam_token': cam_token} + ) + + def __getitem__(self, index): + pointcloud, sem_label, instance_label, lidar_sample_token = self.loadDataByIndex(index) + sem_label = np.vectorize(self.learning_map.__getitem__)(sem_label) + + # get image feature + image_id = np.random.randint(6) + image, cam_sample_token = self.loadImage(index, image_id) + + cam_path, boxes_front_cam, cam_intrinsic = self.nusc.get_sample_data(cam_sample_token) + pointsensor = self.nusc.get('sample_data', lidar_sample_token) + cs_record_lidar = self.nusc.get('calibrated_sensor', + pointsensor['calibrated_sensor_token']) + pose_record_lidar = self.nusc.get('ego_pose', pointsensor['ego_pose_token']) + cam = self.nusc.get('sample_data', cam_sample_token) + cs_record_cam = self.nusc.get('calibrated_sensor', + cam['calibrated_sensor_token']) + pose_record_cam = self.nusc.get('ego_pose', cam['ego_pose_token']) + + calib_infos = { + "lidar2ego_translation": cs_record_lidar['translation'], + "lidar2ego_rotation": cs_record_lidar['rotation'], + "ego2global_translation_lidar": pose_record_lidar['translation'], + "ego2global_rotation_lidar": pose_record_lidar['rotation'], + "ego2global_translation_cam": pose_record_cam['translation'], + "ego2global_rotation_cam": pose_record_cam['rotation'], + 
"cam2ego_translation": cs_record_cam['translation'], + "cam2ego_rotation": cs_record_cam['rotation'], + "cam_intrinsic": cam_intrinsic, + } + + data_dict = {} + data_dict['xyz'] = pointcloud[:, :3] + data_dict['img'] = image + data_dict['calib_infos'] = calib_infos + data_dict['labels'] = sem_label.astype(np.uint8) + data_dict['signal'] = pointcloud[:, 3:4] + data_dict['origin_len'] = len(pointcloud) + + return data_dict, lidar_sample_token + + +def get_SemKITTI_label_name(label_mapping): + with open(label_mapping, 'r') as stream: + semkittiyaml = yaml.safe_load(stream) + SemKITTI_label_name = dict() + for i in sorted(list(semkittiyaml['learning_map'].keys()))[::-1]: + SemKITTI_label_name[semkittiyaml['learning_map'][i]] = semkittiyaml['labels'][i] + + return SemKITTI_label_name diff --git a/TwoDPASS/figures/2DPASS.gif b/TwoDPASS/figures/2DPASS.gif new file mode 100644 index 0000000..eb5bf41 Binary files /dev/null and b/TwoDPASS/figures/2DPASS.gif differ diff --git a/TwoDPASS/figures/multiscan.jpg b/TwoDPASS/figures/multiscan.jpg new file mode 100644 index 0000000..9788ccc Binary files /dev/null and b/TwoDPASS/figures/multiscan.jpg differ diff --git a/TwoDPASS/figures/nuscene.png b/TwoDPASS/figures/nuscene.png new file mode 100644 index 0000000..49d1d29 Binary files /dev/null and b/TwoDPASS/figures/nuscene.png differ diff --git a/TwoDPASS/figures/semantickittic.png b/TwoDPASS/figures/semantickittic.png new file mode 100644 index 0000000..dae8ac7 Binary files /dev/null and b/TwoDPASS/figures/semantickittic.png differ diff --git a/TwoDPASS/figures/singlescan.jpg b/TwoDPASS/figures/singlescan.jpg new file mode 100644 index 0000000..342a9af Binary files /dev/null and b/TwoDPASS/figures/singlescan.jpg differ diff --git a/TwoDPASS/logs/SemanticKITTI/default/version_0/hparams.yaml b/TwoDPASS/logs/SemanticKITTI/default/version_0/hparams.yaml new file mode 100644 index 0000000..9556198 --- /dev/null +++ b/TwoDPASS/logs/SemanticKITTI/default/version_0/hparams.yaml @@ -0,0 +1,165 @@ +config: !!python/object/new:easydict.EasyDict + dictitems: + SWA: false + baseline_only: false + check_val_every_n_epoch: 1 + checkpoint: pretrained/SPVCNN/best_model.ckpt + config_path: config/SPVCNN-semantickitti.yaml + dataset_params: &id005 !!python/object/new:easydict.EasyDict + dictitems: + collate_type: collate_fn_voxel + dataset_type: voxel_dataset + ignore_label: 0 + label_mapping: ./config/label_mapping/semantic-kitti.yaml + pc_dataset_type: SemanticKITTI + seg_labelweights: &id001 + - 0 + - 55437630 + - 320797 + - 541736 + - 2578735 + - 3274484 + - 552662 + - 184064 + - 78858 + - 240942562 + - 17294618 + - 170599734 + - 6369672 + - 230413074 + - 101130274 + - 476491114 + - 9833174 + - 129609852 + - 4506626 + - 1168181 + train_data_loader: &id002 !!python/object/new:easydict.EasyDict + dictitems: + batch_size: 8 + data_path: /KITTI/dataset/sequences/ + dropout_aug: true + flip_aug: true + num_workers: 8 + rotate_aug: true + scale_aug: true + shuffle: true + transform_aug: true + state: + batch_size: 8 + data_path: /KITTI/dataset/sequences/ + dropout_aug: true + flip_aug: true + num_workers: 8 + rotate_aug: true + scale_aug: true + shuffle: true + transform_aug: true + training_size: 19132 + val_data_loader: &id003 !!python/object/new:easydict.EasyDict + dictitems: + batch_size: 1 + data_path: /KITTI/dataset/sequences/ + dropout_aug: false + flip_aug: false + num_workers: 8 + rotate_aug: false + scale_aug: false + shuffle: false + transform_aug: false + state: + batch_size: 1 + data_path: 
/KITTI/dataset/sequences/ + dropout_aug: false + flip_aug: false + num_workers: 8 + rotate_aug: false + scale_aug: false + shuffle: false + transform_aug: false + state: + collate_type: collate_fn_voxel + dataset_type: voxel_dataset + ignore_label: 0 + label_mapping: ./config/label_mapping/semantic-kitti.yaml + pc_dataset_type: SemanticKITTI + seg_labelweights: *id001 + train_data_loader: *id002 + training_size: 19132 + val_data_loader: *id003 + debug: false + fine_tune: false + format_version: 2 + gpu: &id006 + - 0 + log_dir: default + model_params: &id007 !!python/object/new:easydict.EasyDict + dictitems: + cr: 1 + input_dims: 4 + layer_num: &id004 + - 32 + - 32 + - 64 + - 128 + - 256 + - 256 + - 128 + - 96 + - 96 + model_architecture: spvcnn + num_class: 20 + voxel_size: 0.05 + state: + cr: 1 + input_dims: 4 + layer_num: *id004 + model_architecture: spvcnn + num_class: 20 + voxel_size: 0.05 + monitor: val/mIoU + num_vote: 1 + pretrain2d: false + save_top_k: 1 + seed: 0 + stop_patience: 50 + submit_to_server: false + test: true + train_params: &id008 !!python/object/new:easydict.EasyDict + dictitems: + learning_rate: 0.24 + lr_scheduler: CosineAnnealingWarmRestarts + max_num_epochs: 64 + momentum: 0.9 + nesterov: true + optimizer: SGD + weight_decay: 0.0001 + state: + learning_rate: 0.24 + lr_scheduler: CosineAnnealingWarmRestarts + max_num_epochs: 64 + momentum: 0.9 + nesterov: true + optimizer: SGD + weight_decay: 0.0001 + state: + SWA: false + baseline_only: false + check_val_every_n_epoch: 1 + checkpoint: pretrained/SPVCNN/best_model.ckpt + config_path: config/SPVCNN-semantickitti.yaml + dataset_params: *id005 + debug: false + fine_tune: false + format_version: 2 + gpu: *id006 + log_dir: default + model_params: *id007 + monitor: val/mIoU + num_vote: 1 + pretrain2d: false + save_top_k: 1 + seed: 0 + stop_patience: 50 + submit_to_server: false + test: true + train_params: *id008 diff --git a/TwoDPASS/main.py b/TwoDPASS/main.py new file mode 100644 index 0000000..ca7f693 --- /dev/null +++ b/TwoDPASS/main.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +@author: Xu Yan +@file: main.py +@time: 2021/12/7 22:21 +''' + +import os +import yaml +import torch +import datetime +import importlib +import numpy as np +import pytorch_lightning as pl + +from easydict import EasyDict +from argparse import ArgumentParser +from pytorch_lightning import loggers as pl_loggers +from pytorch_lightning.profiler import SimpleProfiler +from pytorch_lightning.callbacks import ModelCheckpoint, StochasticWeightAveraging +from pytorch_lightning.callbacks.early_stopping import EarlyStopping +from dataloader.dataset import get_model_class, get_collate_class +from dataloader.pc_dataset import get_pc_model_class +from pytorch_lightning.callbacks import LearningRateMonitor + +import warnings +warnings.filterwarnings("ignore") + + +def load_yaml(file_name): + with open(file_name, 'r') as f: + try: + config = yaml.load(f, Loader=yaml.FullLoader) + except: + config = yaml.load(f) + return config + + +def parse_config(): + parser = ArgumentParser() + # general + parser.add_argument('--gpu', type=int, nargs='+', default=(0,), help='specify gpu devices') + parser.add_argument("--seed", default=0, type=int) + parser.add_argument('--config_path', default='config/2DPASS-semantickitti.yaml') + # training + parser.add_argument('--log_dir', type=str, default='default', help='log location') + parser.add_argument('--monitor', type=str, default='val/mIoU', help='the maximum metric') + 
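+     # Illustrative invocations of this parser (paths and values are placeholders):
+     #   python main.py --config_path config/2DPASS-semantickitti.yaml --gpu 0 --log_dir my_run
+     #   python main.py --config_path config/2DPASS-semantickitti.yaml --gpu 0 --test \
+     #                  --checkpoint <path/to/best_model.ckpt> --num_vote 12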
parser.add_argument('--stop_patience', type=int, default=50, help='patience for stop training') + parser.add_argument('--save_top_k', type=int, default=1, help='save top k checkpoints, use -1 to checkpoint every epoch') + parser.add_argument('--check_val_every_n_epoch', type=int, default=1, help='check_val_every_n_epoch') + parser.add_argument('--SWA', action='store_true', default=False, help='StochasticWeightAveraging') + parser.add_argument('--baseline_only', action='store_true', default=False, help='training without 2D') + # testing + parser.add_argument('--test', action='store_true', default=False, help='test mode') + parser.add_argument('--fine_tune', action='store_true', default=False, help='fine tune mode') + parser.add_argument('--pretrain2d', action='store_true', default=False, help='use pre-trained 2d network') + parser.add_argument('--num_vote', type=int, default=1, help='number of voting in the test') + parser.add_argument('--submit_to_server', action='store_true', default=False, help='submit on benchmark') + parser.add_argument('--checkpoint', type=str, default=None, help='load checkpoint') + # debug + parser.add_argument('--debug', default=False, action='store_true') + + args = parser.parse_args() + config = load_yaml(args.config_path) + config.update(vars(args)) # override the configuration using the value in args + + # voting test + if args.test: + config['dataset_params']['val_data_loader']['batch_size'] = args.num_vote + if args.num_vote > 1: + config['dataset_params']['val_data_loader']['rotate_aug'] = True + config['dataset_params']['val_data_loader']['transform_aug'] = True + if args.debug: + config['dataset_params']['val_data_loader']['batch_size'] = 2 + config['dataset_params']['val_data_loader']['num_workers'] = 0 + + return EasyDict(config) + + +def build_loader(config): + pc_dataset = get_pc_model_class(config['dataset_params']['pc_dataset_type']) + dataset_type = get_model_class(config['dataset_params']['dataset_type']) + train_config = config['dataset_params']['train_data_loader'] + val_config = config['dataset_params']['val_data_loader'] + train_dataset_loader, val_dataset_loader, test_dataset_loader = None, None, None + + if not config['test']: + train_pt_dataset = pc_dataset(config, data_path=train_config['data_path'], imageset='train') + val_pt_dataset = pc_dataset(config, data_path=val_config['data_path'], imageset='val') + train_dataset_loader = torch.utils.data.DataLoader( + dataset=dataset_type(train_pt_dataset, config, train_config), + batch_size=train_config["batch_size"], + collate_fn=get_collate_class(config['dataset_params']['collate_type']), + shuffle=train_config["shuffle"], + num_workers=train_config["num_workers"], + pin_memory=True, + drop_last=True + ) + # config['dataset_params']['training_size'] = len(train_dataset_loader) * len(configs.gpu) + val_dataset_loader = torch.utils.data.DataLoader( + dataset=dataset_type(val_pt_dataset, config, val_config, num_vote=1), + batch_size=val_config["batch_size"], + collate_fn=get_collate_class(config['dataset_params']['collate_type']), + shuffle=val_config["shuffle"], + pin_memory=True, + num_workers=val_config["num_workers"] + ) + else: + if config['submit_to_server']: + test_pt_dataset = pc_dataset(config, data_path=val_config['data_path'], imageset='test', num_vote=val_config["batch_size"]) + test_dataset_loader = torch.utils.data.DataLoader( + dataset=dataset_type(test_pt_dataset, config, val_config, num_vote=val_config["batch_size"]), + batch_size=val_config["batch_size"], + 
collate_fn=get_collate_class(config['dataset_params']['collate_type']), + shuffle=val_config["shuffle"], + num_workers=val_config["num_workers"] + ) + else: + val_pt_dataset = pc_dataset(config, data_path=val_config['data_path'], imageset='val', num_vote=val_config["batch_size"]) + val_dataset_loader = torch.utils.data.DataLoader( + dataset=dataset_type(val_pt_dataset, config, val_config, num_vote=val_config["batch_size"]), + batch_size=val_config["batch_size"], + collate_fn=get_collate_class(config['dataset_params']['collate_type']), + shuffle=val_config["shuffle"], + num_workers=val_config["num_workers"] + ) + + return train_dataset_loader, val_dataset_loader, test_dataset_loader + + +if __name__ == '__main__': + # parameters + configs = parse_config() + print(configs) + + # setting + os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, configs.gpu)) + num_gpu = len(configs.gpu) + + # output path + log_folder = 'logs/' + configs['dataset_params']['pc_dataset_type'] + tb_logger = pl_loggers.TensorBoardLogger(log_folder, name=configs.log_dir, default_hp_metric=False) + os.makedirs(f'{log_folder}/{configs.log_dir}', exist_ok=True) + profiler = SimpleProfiler(output_filename=f'{log_folder}/{configs.log_dir}/profiler.txt') + np.set_printoptions(precision=4, suppress=True) + + # save the backup files + backup_dir = os.path.join(log_folder, configs.log_dir, 'backup_files_%s' % str(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M'))) + if not configs['test']: + os.makedirs(backup_dir, exist_ok=True) + os.system('cp main.py {}'.format(backup_dir)) + os.system('cp dataloader/dataset.py {}'.format(backup_dir)) + os.system('cp dataloader/pc_dataset.py {}'.format(backup_dir)) + os.system('cp {} {}'.format(configs.config_path, backup_dir)) + os.system('cp network/base_model.py {}'.format(backup_dir)) + os.system('cp network/baseline.py {}'.format(backup_dir)) + os.system('cp {}.py {}'.format('network/' + configs['model_params']['model_architecture'], backup_dir)) + + # reproducibility + torch.manual_seed(configs.seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = True + np.random.seed(configs.seed) + config_path = configs.config_path + + train_dataset_loader, val_dataset_loader, test_dataset_loader = build_loader(configs) + model_file = importlib.import_module('network.' 
+ configs['model_params']['model_architecture']) + my_model = model_file.get_model(configs) ######## get model ############ + + pl.seed_everything(configs.seed) + checkpoint_callback = ModelCheckpoint( + monitor=configs.monitor, + mode='max', + save_last=True, + save_top_k=configs.save_top_k) + + if configs.checkpoint is not None: + print('load pre-trained model...') + if configs.fine_tune or configs.test or configs.pretrain2d: + my_model = my_model.load_from_checkpoint(configs.checkpoint, config=configs, strict=(not configs.pretrain2d)) ######### load model ########## + else: + # continue last training + my_model = my_model.load_from_checkpoint(configs.checkpoint) + + if configs.SWA: + swa = [StochasticWeightAveraging(swa_epoch_start=configs.train_params.swa_epoch_start, annealing_epochs=1)] + else: + swa = [] + + if not configs.test: + # init trainer + print('Start training...') + trainer = pl.Trainer(gpus=[i for i in range(num_gpu)], + accelerator='ddp', + max_epochs=configs['train_params']['max_num_epochs'], + resume_from_checkpoint=configs.checkpoint if not configs.fine_tune and not configs.pretrain2d else None, + callbacks=[checkpoint_callback, + LearningRateMonitor(logging_interval='step'), + EarlyStopping(monitor=configs.monitor, + patience=configs.stop_patience, + mode='max', + verbose=True), + ] + swa, + logger=tb_logger, + profiler=profiler, + check_val_every_n_epoch=configs.check_val_every_n_epoch, + gradient_clip_val=1, + accumulate_grad_batches=1 + ) + trainer.fit(my_model, train_dataset_loader, val_dataset_loader) + + else: + print('Start testing...') + assert num_gpu == 1, 'only support single GPU testing!' + trainer = pl.Trainer(gpus=[i for i in range(num_gpu)], + accelerator='ddp', + resume_from_checkpoint=configs.checkpoint, + logger=tb_logger, + profiler=profiler) + trainer.test(my_model, test_dataset_loader if configs.submit_to_server else val_dataset_loader) \ No newline at end of file diff --git a/TwoDPASS/network/arch_2dpass.py b/TwoDPASS/network/arch_2dpass.py new file mode 100644 index 0000000..6917fad --- /dev/null +++ b/TwoDPASS/network/arch_2dpass.py @@ -0,0 +1,172 @@ +import torch +import torch_scatter +import numpy as np +import torch.nn as nn +import torch.nn.functional as F + +from TwoDPASS.network.basic_block import Lovasz_loss +from TwoDPASS.network.baseline import get_model as SPVCNN +from TwoDPASS.network.base_model import LightningBaseModel +from TwoDPASS.network.basic_block import ResNetFCN + +class xModalKD(nn.Module): + def __init__(self,config): + super(xModalKD, self).__init__() + self.hiden_size = config['model_params']['hiden_size'] + self.scale_list = config['model_params']['scale_list'] + self.num_classes = config['model_params']['num_classes'] + self.lambda_xm = config['train_params']['lambda_xm'] + self.lambda_seg2d = config['train_params']['lambda_seg2d'] + self.num_scales = len(self.scale_list) + + self.multihead_3d_classifier = nn.ModuleList() + for i in range(self.num_scales): + self.multihead_3d_classifier.append( + nn.Sequential( + nn.Linear(self.hiden_size, 128), + nn.ReLU(True), + nn.Linear(128, self.num_classes)) + ) + + self.multihead_fuse_classifier = nn.ModuleList() + for i in range(self.num_scales): + self.multihead_fuse_classifier.append( + nn.Sequential( + nn.Linear(self.hiden_size, 128), + nn.ReLU(True), + nn.Linear(128, self.num_classes)) + ) + self.leaners = nn.ModuleList() + self.fcs1 = nn.ModuleList() + self.fcs2 = nn.ModuleList() + for i in range(self.num_scales): + 
self.leaners.append(nn.Sequential(nn.Linear(self.hiden_size, self.hiden_size))) + self.fcs1.append(nn.Sequential(nn.Linear(self.hiden_size * 2, self.hiden_size))) + self.fcs2.append(nn.Sequential(nn.Linear(self.hiden_size, self.hiden_size))) + + self.classifier = nn.Sequential( + nn.Linear(self.hiden_size * self.num_scales, 128), + nn.ReLU(True), + nn.Linear(128, self.num_classes), + ) + + if 'seg_labelweights' in config['dataset_params']: + seg_num_per_class = config['dataset_params']['seg_labelweights'] + seg_labelweights = seg_num_per_class / np.sum(seg_num_per_class) + seg_labelweights = torch.Tensor(np.power(np.amax(seg_labelweights) / seg_labelweights, 1 / 3.0)) + else: + seg_labelweights = None + + self.ce_loss = nn.CrossEntropyLoss(weight=seg_labelweights, ignore_index=config['dataset_params']['ignore_label']) + self.lovasz_loss = Lovasz_loss(ignore=config['dataset_params']['ignore_label']) + + @staticmethod + def p2img_mapping(pts_fea, p2img_idx, batch_idx): + img_feat = [] + for b in range(batch_idx.max()+1): + img_feat.append(pts_fea[batch_idx == b][p2img_idx[b]]) + return torch.cat(img_feat, 0) + + @staticmethod + def voxelize_labels(labels, full_coors): + lbxyz = torch.cat([labels.reshape(-1, 1), full_coors], dim=-1) + unq_lbxyz, count = torch.unique(lbxyz, return_counts=True, dim=0) + inv_ind = torch.unique(unq_lbxyz[:, 1:], return_inverse=True, dim=0)[1] + label_ind = torch_scatter.scatter_max(count, inv_ind)[1] + labels = unq_lbxyz[:, 0][label_ind] + return labels + + def seg_loss(self, logits, labels): + ce_loss = self.ce_loss(logits, labels) + lovasz_loss = self.lovasz_loss(F.softmax(logits, dim=1), labels) + return ce_loss + lovasz_loss + + def fusion_to_single_KD(self, data_dict, idx): + batch_idx = data_dict['batch_idx'] + point2img_index = data_dict['point2img_index'] + last_scale = self.scale_list[idx - 1] if idx > 0 else 1 + img_feat = data_dict['img_scale{}'.format(self.scale_list[idx])] + pts_feat = data_dict['layer_{}'.format(idx)]['pts_feat'] + coors_inv = data_dict['scale_{}'.format(last_scale)]['coors_inv'] + + # 3D prediction + pts_pred_full = self.multihead_3d_classifier[idx](pts_feat) + + # correspondence + pts_label_full = self.voxelize_labels(data_dict['labels'], data_dict['layer_{}'.format(idx)]['full_coors']) + pts_feat = self.p2img_mapping(pts_feat[coors_inv], point2img_index, batch_idx) + pts_pred = self.p2img_mapping(pts_pred_full[coors_inv], point2img_index, batch_idx) + + # modality fusion + feat_learner = F.relu(self.leaners[idx](pts_feat)) + feat_cat = torch.cat([img_feat, feat_learner], 1) + feat_cat = self.fcs1[idx](feat_cat) + feat_weight = torch.sigmoid(self.fcs2[idx](feat_cat)) + fuse_feat = F.relu(feat_cat * feat_weight) + + # fusion prediction + fuse_pred = self.multihead_fuse_classifier[idx](fuse_feat) + + # Segmentation Loss + seg_loss_3d = self.seg_loss(pts_pred_full, pts_label_full) + seg_loss_2d = self.seg_loss(fuse_pred, data_dict['img_label']) + loss = seg_loss_3d + seg_loss_2d * self.lambda_seg2d / self.num_scales + + # KL divergence + xm_loss = F.kl_div( + F.log_softmax(pts_pred, dim=1), + F.softmax(fuse_pred.detach(), dim=1), + ) + loss += xm_loss * self.lambda_xm / self.num_scales + + return loss, fuse_feat + + def forward(self, data_dict): + loss = 0 + img_seg_feat = [] + + for idx in range(self.num_scales): + singlescale_loss, fuse_feat = self.fusion_to_single_KD(data_dict, idx) + img_seg_feat.append(fuse_feat) + loss += singlescale_loss + + img_seg_logits = self.classifier(torch.cat(img_seg_feat, 1)) + loss += 
self.seg_loss(img_seg_logits, data_dict['img_label']) + data_dict['loss'] += loss + + return data_dict + + +class get_model(LightningBaseModel): + def __init__(self, config): + super(get_model, self).__init__(config) + self.save_hyperparameters() + self.baseline_only = config.baseline_only + self.num_classes = config.model_params.num_classes + self.hiden_size = config.model_params.hiden_size + self.lambda_seg2d = config.train_params.lambda_seg2d + self.lambda_xm = config.train_params.lambda_xm + self.scale_list = config.model_params.scale_list + self.num_scales = len(self.scale_list) + + self.model_3d = SPVCNN(config) + if not self.baseline_only: + self.model_2d = ResNetFCN( + backbone=config.model_params.backbone_2d, + pretrained=config.model_params.pretrained2d, + config=config + ) + self.fusion = xModalKD(config) + else: + print('Start vanilla training!') + + def forward(self, data_dict): + # 3D network + data_dict = self.model_3d(data_dict) + + # training with 2D network + if self.training and not self.baseline_only: + data_dict = self.model_2d(data_dict) + data_dict = self.fusion(data_dict) + + return data_dict \ No newline at end of file diff --git a/TwoDPASS/network/base_model.py b/TwoDPASS/network/base_model.py new file mode 100644 index 0000000..f0b5ffe --- /dev/null +++ b/TwoDPASS/network/base_model.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +@author: Xu Yan +@file: base_model.py +@time: 2021/12/7 22:39 +''' +import os +import torch +import yaml +import json +import numpy as np +import pytorch_lightning as pl + +from datetime import datetime +from pytorch_lightning.metrics import Accuracy +from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR, CosineAnnealingLR +from TwoDPASS.utils.metric_util import IoU +from TwoDPASS.utils.schedulers import cosine_schedule_with_warmup + + +class LightningBaseModel(pl.LightningModule): + def __init__(self, args): + super().__init__() + self.args = args + self.train_acc = Accuracy() + self.val_acc = Accuracy(compute_on_step=False) + self.val_iou = IoU(self.args['dataset_params'], compute_on_step=False) + + if self.args['submit_to_server']: + self.submit_dir = os.path.dirname(self.args['checkpoint']) + '/submit_' + datetime.now().strftime( + '%Y_%m_%d') + with open(self.args['dataset_params']['label_mapping'], 'r') as stream: + self.mapfile = yaml.safe_load(stream) + + self.ignore_label = self.args['dataset_params']['ignore_label'] + + def configure_optimizers(self): + if self.args['train_params']['optimizer'] == 'Adam': + optimizer = torch.optim.Adam(self.parameters(), + lr=self.args['train_params']["learning_rate"]) + elif self.args['train_params']['optimizer'] == 'SGD': + optimizer = torch.optim.SGD(self.parameters(), + lr=self.args['train_params']["learning_rate"], + momentum=self.args['train_params']["momentum"], + weight_decay=self.args['train_params']["weight_decay"], + nesterov=self.args['train_params']["nesterov"]) + else: + raise NotImplementedError + + if self.args['train_params']["lr_scheduler"] == 'StepLR': + lr_scheduler = StepLR( + optimizer, + step_size=self.args['train_params']["decay_step"], + gamma=self.args['train_params']["decay_rate"] + ) + elif self.args['train_params']["lr_scheduler"] == 'ReduceLROnPlateau': + lr_scheduler = ReduceLROnPlateau( + optimizer, + mode='max', + factor=self.args['train_params']["decay_rate"], + patience=self.args['train_params']["decay_step"], + verbose=True + ) + elif self.args['train_params']["lr_scheduler"] == 'CosineAnnealingLR': + lr_scheduler = 
CosineAnnealingLR( + optimizer, + T_max=self.args['train_params']['max_num_epochs'] - 4, + eta_min=1e-5, + ) + elif self.args['train_params']["lr_scheduler"] == 'CosineAnnealingWarmRestarts': + from functools import partial + lr_scheduler = torch.optim.lr_scheduler.LambdaLR( + optimizer, lr_lambda=partial( + cosine_schedule_with_warmup, + num_epochs=self.args['train_params']['max_num_epochs'], + batch_size=self.args['dataset_params']['train_data_loader']['batch_size'], + dataset_size=self.args['dataset_params']['training_size'], + num_gpu=len(self.args.gpu) + ), + ) + else: + raise NotImplementedError + + scheduler = { + 'scheduler': lr_scheduler, + 'interval': 'step' if self.args['train_params']["lr_scheduler"] == 'CosineAnnealingWarmRestarts' else 'epoch', + 'frequency': 1 + } + + return { + 'optimizer': optimizer, + 'lr_scheduler': scheduler, + 'monitor': self.args.monitor, + } + + def forward(self, data): + pass + + def training_step(self, data_dict, batch_idx): + data_dict = self.forward(data_dict) + self.train_acc(data_dict['logits'].argmax(1)[data_dict['labels'] != self.ignore_label], + data_dict['labels'][data_dict['labels'] != self.ignore_label]) + self.log('train/acc', self.train_acc, on_epoch=True) + self.log('train/loss_main_ce', data_dict['loss_main_ce']) + self.log('train/loss_main_lovasz', data_dict['loss_main_lovasz']) + + return data_dict['loss'] + + + def validation_step(self, data_dict, batch_idx): + indices = data_dict['indices'] + raw_labels = data_dict['raw_labels'].squeeze(1).cpu() + origin_len = data_dict['origin_len'] + vote_logits = torch.zeros((len(raw_labels), self.num_classes)) + data_dict = self.forward(data_dict) + + if self.args['test']: + vote_logits.index_add_(0, indices.cpu(), data_dict['logits'].cpu()) + if self.args['dataset_params']['pc_dataset_type'] == 'SemanticKITTI_multiscan': + vote_logits = vote_logits[:origin_len] + raw_labels = raw_labels[:origin_len] + else: + vote_logits = data_dict['logits'].cpu() + raw_labels = data_dict['labels'].squeeze(0).cpu() + + prediction = vote_logits.argmax(1) + + if self.ignore_label != 0: + prediction = prediction[raw_labels != self.ignore_label] + raw_labels = raw_labels[raw_labels != self.ignore_label] + prediction += 1 + raw_labels += 1 + + self.val_acc(prediction, raw_labels) + self.log('val/acc', self.val_acc, on_epoch=True) + self.val_iou( + prediction.cpu().detach().numpy(), + raw_labels.cpu().detach().numpy(), + ) + + return data_dict['loss'] + + def test_step(self, data_dict, batch_idx): + indices = data_dict['indices'] + origin_len = data_dict['origin_len'] + raw_labels = data_dict['raw_labels'].squeeze(1).cpu() + path = data_dict['path'][0] + + vote_logits = torch.zeros((len(raw_labels), self.num_classes)) + data_dict = self.forward(data_dict) + vote_logits.index_add_(0, indices.cpu(), data_dict['logits'].cpu()) + + if self.args['dataset_params']['pc_dataset_type'] == 'SemanticKITTI_multiscan': + vote_logits = vote_logits[:origin_len] + raw_labels = raw_labels[:origin_len] + + prediction = vote_logits.argmax(1) + + if self.ignore_label != 0: + prediction = prediction[raw_labels != self.ignore_label] + raw_labels = raw_labels[raw_labels != self.ignore_label] + prediction += 1 + raw_labels += 1 + + if not self.args['submit_to_server']: + self.val_acc(prediction, raw_labels) + self.log('val/acc', self.val_acc, on_epoch=True) + self.val_iou( + prediction.cpu().detach().numpy(), + raw_labels.cpu().detach().numpy(), + ) + else: + if self.args['dataset_params']['pc_dataset_type'] != 'nuScenes': + components = 
path.split('/') + sequence = components[-3] + points_name = components[-1] + label_name = points_name.replace('bin', 'label') + full_save_dir = os.path.join(self.submit_dir, 'sequences', sequence, 'predictions') + os.makedirs(full_save_dir, exist_ok=True) + full_label_name = os.path.join(full_save_dir, label_name) + + if os.path.exists(full_label_name): + print('%s already exsist...' % (label_name)) + pass + + valid_labels = np.vectorize(self.mapfile['learning_map_inv'].__getitem__) + original_label = valid_labels(vote_logits.argmax(1).cpu().numpy().astype(int)) + final_preds = original_label.astype(np.uint32) + final_preds.tofile(full_label_name) + + else: + meta_dict = { + "meta": { + "use_camera": False, + "use_lidar": True, + "use_map": False, + "use_radar": False, + "use_external": False, + } + } + os.makedirs(os.path.join(self.submit_dir, 'test'), exist_ok=True) + with open(os.path.join(self.submit_dir, 'test', 'submission.json'), 'w', encoding='utf-8') as f: + json.dump(meta_dict, f) + original_label = prediction.cpu().numpy().astype(np.uint8) + + assert all((original_label > 0) & (original_label < 17)), \ + "Error: Array for predictions must be between 1 and 16 (inclusive)." + + full_save_dir = os.path.join(self.submit_dir, 'lidarseg/test') + full_label_name = os.path.join(full_save_dir, path + '_lidarseg.bin') + os.makedirs(full_save_dir, exist_ok=True) + + if os.path.exists(full_label_name): + print('%s already exsist...' % (full_label_name)) + else: + original_label.tofile(full_label_name) + + return data_dict['loss'] + + def validation_epoch_end(self, outputs): + iou, best_miou = self.val_iou.compute() + mIoU = np.nanmean(iou) + str_print = '' + self.log('val/mIoU', mIoU, on_epoch=True) + self.log('val/best_miou', best_miou, on_epoch=True) + str_print += 'Validation per class iou: ' + + for class_name, class_iou in zip(self.val_iou.unique_label_str, iou): + str_print += '\n%s : %.2f%%' % (class_name, class_iou * 100) + + str_print += '\nCurrent val miou is %.3f while the best val miou is %.3f' % (mIoU * 100, best_miou * 100) + self.print(str_print) + + def test_epoch_end(self, outputs): + if not self.args['submit_to_server']: + iou, best_miou = self.val_iou.compute() + mIoU = np.nanmean(iou) + str_print = '' + self.log('val/mIoU', mIoU, on_epoch=True) + self.log('val/best_miou', best_miou, on_epoch=True) + str_print += 'Validation per class iou: ' + + for class_name, class_iou in zip(self.val_iou.unique_label_str, iou): + str_print += '\n%s : %.2f%%' % (class_name, class_iou * 100) + + str_print += '\nCurrent val miou is %.3f while the best val miou is %.3f' % (mIoU * 100, best_miou * 100) + self.print(str_print) + + def on_after_backward(self) -> None: + """ + Skipping updates in case of unstable gradients + https://github.com/Lightning-AI/lightning/issues/4956 + """ + valid_gradients = True + for name, param in self.named_parameters(): + if param.grad is not None: + valid_gradients = not (torch.isnan(param.grad).any() or torch.isinf(param.grad).any()) + if not valid_gradients: + break + if not valid_gradients: + print(f'detected inf or nan values in gradients. 
not updating model parameters') + self.zero_grad() \ No newline at end of file diff --git a/TwoDPASS/network/baseline.py b/TwoDPASS/network/baseline.py new file mode 100644 index 0000000..ca8a3fa --- /dev/null +++ b/TwoDPASS/network/baseline.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +@author: Xu Yan +@file: baseline.py +@time: 2021/12/16 22:41 +''' +import torch +import torch_scatter +import spconv.pytorch as spconv +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +from TwoDPASS.network.basic_block import Lovasz_loss +from TwoDPASS.network.base_model import LightningBaseModel +from TwoDPASS.network.basic_block import SparseBasicBlock +from TwoDPASS.network.voxel_fea_generator import voxel_3d_generator, voxelization + + +class point_encoder(nn.Module): + def __init__(self, in_channels, out_channels, scale): + super(point_encoder, self).__init__() + self.scale = scale + self.layer_in = nn.Sequential( + nn.Linear(in_channels, out_channels), + nn.LeakyReLU(0.1, True), + ) + self.PPmodel = nn.Sequential( + nn.Linear(in_channels, out_channels // 2), + nn.LeakyReLU(0.1, True), + nn.BatchNorm1d(out_channels // 2), + nn.Linear(out_channels // 2, out_channels // 2), + nn.LeakyReLU(0.1, True), + nn.BatchNorm1d(out_channels // 2), + nn.Linear(out_channels // 2, out_channels), + nn.LeakyReLU(0.1, True), + ) + self.layer_out = nn.Sequential( + nn.Linear(2 * out_channels, out_channels), + nn.LeakyReLU(0.1, True), + nn.Linear(out_channels, out_channels)) + + @staticmethod + def downsample(coors, p_fea, scale=2): + batch = coors[:, 0:1] + coors = coors[:, 1:] // scale + inv = torch.unique(torch.cat([batch, coors], 1), return_inverse=True, dim=0)[1] + return torch_scatter.scatter_mean(p_fea, inv, dim=0), inv + + def forward(self, features, data_dict): + output, inv = self.downsample(data_dict['coors'], features) + identity = self.layer_in(features) + output = self.PPmodel(output)[inv] + output = torch.cat([identity, output], dim=1) + + v_feat = torch_scatter.scatter_mean( + self.layer_out(output[data_dict['coors_inv']]), + data_dict['scale_{}'.format(self.scale)]['coors_inv'], + dim=0 + ) + data_dict['coors'] = data_dict['scale_{}'.format(self.scale)]['coors'] + data_dict['coors_inv'] = data_dict['scale_{}'.format(self.scale)]['coors_inv'] + data_dict['full_coors'] = data_dict['scale_{}'.format(self.scale)]['full_coors'] + + return v_feat + + +class SPVBlock(nn.Module): + def __init__(self, in_channels, out_channels, indice_key, scale, last_scale, spatial_shape): + super(SPVBlock, self).__init__() + self.scale = scale + self.indice_key = indice_key + self.layer_id = indice_key.split('_')[1] + self.last_scale = last_scale + self.spatial_shape = spatial_shape + self.v_enc = spconv.SparseSequential( + SparseBasicBlock(in_channels, out_channels, self.indice_key), + SparseBasicBlock(out_channels, out_channels, self.indice_key), + ) + self.p_enc = point_encoder(in_channels, out_channels, scale) + + def forward(self, data_dict): + coors_inv_last = data_dict['scale_{}'.format(self.last_scale)]['coors_inv'] + coors_inv = data_dict['scale_{}'.format(self.scale)]['coors_inv'] + + # voxel encoder + v_fea = self.v_enc(data_dict['sparse_tensor']) + data_dict['layer_{}'.format(self.layer_id)] = {} + data_dict['layer_{}'.format(self.layer_id)]['pts_feat'] = v_fea.features + data_dict['layer_{}'.format(self.layer_id)]['full_coors'] = data_dict['full_coors'] + v_fea_inv = torch_scatter.scatter_mean(v_fea.features[coors_inv_last], coors_inv, dim=0) + + # point encoder + 
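+ # Point branch: point_encoder re-encodes (sparse_tensor.features + v_fea.features)
+ # at point resolution and pools the result to this block's scale, so p_fea and
+ # v_fea_inv (the voxel branch pooled to the same scale) line up as (M, C) tensors,
+ # M being the number of occupied voxels at `self.scale` (shapes stated for
+ # illustration). Their sum seeds the SparseConvTensor consumed by the next block,
+ # while the block itself returns per-point features via p_fea[coors_inv].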
p_fea = self.p_enc( + features=data_dict['sparse_tensor'].features+v_fea.features, + data_dict=data_dict + ) + + # fusion and pooling + data_dict['sparse_tensor'] = spconv.SparseConvTensor( + features=p_fea+v_fea_inv, + indices=data_dict['coors'], + spatial_shape=self.spatial_shape, + batch_size=data_dict['batch_size'] + ) + + return p_fea[coors_inv] + + +class get_model(LightningBaseModel): + def __init__(self, config): + super(get_model, self).__init__(config) + self.save_hyperparameters() + self.input_dims = config['model_params']['input_dims'] + self.hiden_size = config['model_params']['hiden_size'] + self.num_classes = config['model_params']['num_classes'] + self.scale_list = config['model_params']['scale_list'] + self.num_scales = len(self.scale_list) + min_volume_space = config['dataset_params']['min_volume_space'] + max_volume_space = config['dataset_params']['max_volume_space'] + self.coors_range_xyz = [[min_volume_space[0], max_volume_space[0]], + [min_volume_space[1], max_volume_space[1]], + [min_volume_space[2], max_volume_space[2]]] + self.spatial_shape = np.array(config['model_params']['spatial_shape']) + self.strides = [int(scale / self.scale_list[0]) for scale in self.scale_list] + + # voxelization + self.voxelizer = voxelization( + coors_range_xyz=self.coors_range_xyz, + spatial_shape=self.spatial_shape, + scale_list=self.scale_list + ) + + # input processing + self.voxel_3d_generator = voxel_3d_generator( + in_channels=self.input_dims, + out_channels=self.hiden_size, + coors_range_xyz=self.coors_range_xyz, + spatial_shape=self.spatial_shape + ) + + # encoder layers + self.spv_enc = nn.ModuleList() + for i in range(self.num_scales): + self.spv_enc.append(SPVBlock( + in_channels=self.hiden_size, + out_channels=self.hiden_size, + indice_key='spv_'+ str(i), + scale=self.scale_list[i], + last_scale=self.scale_list[i-1] if i > 0 else 1, + spatial_shape=np.int32(self.spatial_shape // self.strides[i])[::-1].tolist()) + ) + + # decoder layer + self.classifier = nn.Sequential( + nn.Linear(self.hiden_size * self.num_scales, 128), + nn.ReLU(True), + nn.Linear(128, self.num_classes), + ) + + # loss + self.criterion = criterion(config) + + def forward(self, data_dict): + with torch.no_grad(): + data_dict = self.voxelizer(data_dict) + + data_dict = self.voxel_3d_generator(data_dict) + + enc_feats = [] + for i in range(self.num_scales): + enc_feats.append(self.spv_enc[i](data_dict)) + + output = torch.cat(enc_feats, dim=1) + data_dict['logits'] = self.classifier(output) + + data_dict['loss'] = 0. 
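+ # Note: the running loss is initialised to zero here; the criterion module
+ # defined further below accumulates the class-weighted cross-entropy term plus
+ # lambda_lovasz times the Lovasz-softmax term into data_dict['loss'].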
+ data_dict = self.criterion(data_dict) + + return data_dict + + +class criterion(nn.Module): + def __init__(self, config): + super(criterion, self).__init__() + self.config = config + self.lambda_lovasz = self.config['train_params'].get('lambda_lovasz', 0.1) + if 'seg_labelweights' in config['dataset_params']: + seg_num_per_class = config['dataset_params']['seg_labelweights'] + seg_labelweights = seg_num_per_class / np.sum(seg_num_per_class) + seg_labelweights = torch.Tensor(np.power(np.amax(seg_labelweights) / seg_labelweights, 1 / 3.0)) + else: + seg_labelweights = None + + self.ce_loss = nn.CrossEntropyLoss( + weight=seg_labelweights, + ignore_index=config['dataset_params']['ignore_label'] + ) + self.lovasz_loss = Lovasz_loss( + ignore=config['dataset_params']['ignore_label'] + ) + + def forward(self, data_dict): + loss_main_ce = self.ce_loss(data_dict['logits'], data_dict['labels'].long()) + loss_main_lovasz = self.lovasz_loss(F.softmax(data_dict['logits'], dim=1), data_dict['labels'].long()) + loss_main = loss_main_ce + loss_main_lovasz * self.lambda_lovasz + data_dict['loss_main_ce'] = loss_main_ce + data_dict['loss_main_lovasz'] = loss_main_lovasz + data_dict['loss'] += loss_main + + return data_dict \ No newline at end of file diff --git a/TwoDPASS/network/basic_block.py b/TwoDPASS/network/basic_block.py new file mode 100644 index 0000000..bb5a170 --- /dev/null +++ b/TwoDPASS/network/basic_block.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +@author: Xu Yan +@file: basic_block.py +@time: 2021/12/16 20:34 +''' +import torch +import spconv.pytorch as spconv +import torch.nn as nn +import torch.nn.functional as F + +from torchvision.models.resnet import resnet34 +from TwoDPASS.utils.lovasz_loss import lovasz_softmax + + +class SparseBasicBlock(spconv.SparseModule): + def __init__(self, in_channels, out_channels, indice_key): + super(SparseBasicBlock, self).__init__() + self.layers_in = spconv.SparseSequential( + spconv.SubMConv3d(in_channels, out_channels, 1, indice_key=indice_key, bias=False), + nn.BatchNorm1d(out_channels), + ) + self.layers = spconv.SparseSequential( + spconv.SubMConv3d(in_channels, out_channels, 3, indice_key=indice_key, bias=False), + nn.BatchNorm1d(out_channels), + nn.LeakyReLU(0.1), + spconv.SubMConv3d(out_channels, out_channels, 3, indice_key=indice_key, bias=False), + nn.BatchNorm1d(out_channels), + ) + + def forward(self, x): + identity = self.layers_in(x) + output = self.layers(x) + return output.replace_feature(F.leaky_relu(output.features + identity.features, 0.1)) + + +class ResNetFCN(nn.Module): + def __init__(self, backbone="resnet34", pretrained=True, config=None): + super(ResNetFCN, self).__init__() + + if backbone == "resnet34": + net = resnet34(pretrained) + else: + raise NotImplementedError("invalid backbone: {}".format(backbone)) + self.hiden_size = config['model_params']['hiden_size'] + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=1, padding=3, bias=False) + self.conv1.weight.data = net.conv1.weight.data + self.bn1 = net.bn1 + self.relu = net.relu + self.maxpool = net.maxpool + self.layer1 = net.layer1 + self.layer2 = net.layer2 + self.layer3 = net.layer3 + self.layer4 = net.layer4 + + # Decoder + self.deconv_layer1 = nn.Sequential( + nn.Conv2d(64, self.hiden_size, kernel_size=7, stride=1, padding=3, bias=False), + nn.ReLU(inplace=True), + nn.UpsamplingNearest2d(scale_factor=2), + ) + self.deconv_layer2 = nn.Sequential( + nn.Conv2d(128, self.hiden_size, kernel_size=7, stride=1, padding=3, bias=False), + 
nn.ReLU(inplace=True), + nn.UpsamplingNearest2d(scale_factor=4), + ) + self.deconv_layer3 = nn.Sequential( + nn.Conv2d(256, 64, kernel_size=7, stride=1, padding=3, bias=False), + nn.ReLU(inplace=True), + nn.ConvTranspose2d(64, self.hiden_size, kernel_size=3, stride=2, padding=1, dilation=1, output_padding=1), + nn.ReLU(inplace=True), + nn.UpsamplingNearest2d(scale_factor=4), + ) + self.deconv_layer4 = nn.Sequential( + nn.Conv2d(512, 64, kernel_size=7, stride=1, padding=3, bias=False), + nn.ReLU(inplace=True), + nn.ConvTranspose2d(64, 64, kernel_size=3, stride=2, padding=1, dilation=1, output_padding=1), + nn.ReLU(inplace=True), + nn.ConvTranspose2d(64, self.hiden_size, kernel_size=3, stride=2, padding=1, dilation=1, output_padding=1), + nn.ReLU(inplace=True), + nn.UpsamplingNearest2d(scale_factor=4), + ) + + def forward(self, data_dict): + x = data_dict['img'] + h, w = x.shape[2], x.shape[3] + if h % 16 != 0 or w % 16 != 0: + assert False, "invalid input size: {}".format(x.shape) + + # Encoder + conv1_out = self.relu(self.bn1(self.conv1(x))) + layer1_out = self.layer1(self.maxpool(conv1_out)) + layer2_out = self.layer2(layer1_out) + layer3_out = self.layer3(layer2_out) + layer4_out = self.layer4(layer3_out) + + # Deconv + layer1_out = self.deconv_layer1(layer1_out) + layer2_out = self.deconv_layer2(layer2_out) + layer3_out = self.deconv_layer3(layer3_out) + layer4_out = self.deconv_layer4(layer4_out) + + data_dict['img_scale2'] = layer1_out + data_dict['img_scale4'] = layer2_out + data_dict['img_scale8'] = layer3_out + data_dict['img_scale16'] = layer4_out + + process_keys = [k for k in data_dict.keys() if k.find('img_scale') != -1] + img_indices = data_dict['img_indices'] + + temp = {k: [] for k in process_keys} + + for i in range(x.shape[0]): + for k in process_keys: + temp[k].append(data_dict[k].permute(0, 2, 3, 1)[i][img_indices[i][:, 0], img_indices[i][:, 1]]) + + for k in process_keys: + data_dict[k] = torch.cat(temp[k], 0) + + return data_dict + +class Lovasz_loss(nn.Module): + def __init__(self, ignore=None): + super(Lovasz_loss, self).__init__() + self.ignore = ignore + + def forward(self, probas, labels): + return lovasz_softmax(probas, labels, ignore=self.ignore) \ No newline at end of file diff --git a/TwoDPASS/network/minkowskinet.py b/TwoDPASS/network/minkowskinet.py new file mode 100644 index 0000000..97e2e19 --- /dev/null +++ b/TwoDPASS/network/minkowskinet.py @@ -0,0 +1,171 @@ +import torch +import numpy as np +import torch.nn as nn +import torchsparse +import torchsparse.nn as spnn +import network.torchsparse_utils.basic_blocks as basic_blocks +import torch.nn.functional as F + +from TwoDPASS.network.torchsparse_utils.utils import * +from torchsparse import PointTensor +from TwoDPASS.network.torchsparse_utils.base_model import LightningBaseModel +from TwoDPASS.network.basic_block import Lovasz_loss + + +class get_model(LightningBaseModel): + def __init__(self, config): + super().__init__(config, None) + self.save_hyperparameters() + + cr = config.model_params.cr + cs = config.model_params.layer_num + cs = [int(cr * x) for x in cs] + + self.pres = self.vres = config.model_params.voxel_size + self.num_classes = config.model_params.num_class + + self.stem = nn.Sequential( + spnn.Conv3d(config.model_params.input_dims, cs[0], kernel_size=3, stride=1), + spnn.BatchNorm(cs[0]), spnn.ReLU(True), + spnn.Conv3d(cs[0], cs[0], kernel_size=3, stride=1), + spnn.BatchNorm(cs[0]), spnn.ReLU(True)) + + self.stage1 = nn.Sequential( + basic_blocks.BasicConvolutionBlock(cs[0], cs[0], ks=2, 
stride=2, dilation=1), + basic_blocks.ResidualBlock(cs[0], cs[1], ks=3, stride=1, dilation=1), + basic_blocks.ResidualBlock(cs[1], cs[1], ks=3, stride=1, dilation=1), + ) + + self.stage2 = nn.Sequential( + basic_blocks.BasicConvolutionBlock(cs[1], cs[1], ks=2, stride=2, dilation=1), + basic_blocks.ResidualBlock(cs[1], cs[2], ks=3, stride=1, dilation=1), + basic_blocks.ResidualBlock(cs[2], cs[2], ks=3, stride=1, dilation=1), + ) + + self.stage3 = nn.Sequential( + basic_blocks.BasicConvolutionBlock(cs[2], cs[2], ks=2, stride=2, dilation=1), + basic_blocks.ResidualBlock(cs[2], cs[3], ks=3, stride=1, dilation=1), + basic_blocks.ResidualBlock(cs[3], cs[3], ks=3, stride=1, dilation=1), + ) + + self.stage4 = nn.Sequential( + basic_blocks.BasicConvolutionBlock(cs[3], cs[3], ks=2, stride=2, dilation=1), + basic_blocks.ResidualBlock(cs[3], cs[4], ks=3, stride=1, dilation=1), + basic_blocks.ResidualBlock(cs[4], cs[4], ks=3, stride=1, dilation=1), + ) + + self.up1 = nn.ModuleList([ + basic_blocks.BasicDeconvolutionBlock(cs[4], cs[5], ks=2, stride=2), + nn.Sequential( + basic_blocks.ResidualBlock(cs[5] + cs[3], cs[5], ks=3, stride=1, + dilation=1), + basic_blocks.ResidualBlock(cs[5], cs[5], ks=3, stride=1, dilation=1), + ) + ]) + + self.up2 = nn.ModuleList([ + basic_blocks.BasicDeconvolutionBlock(cs[5], cs[6], ks=2, stride=2), + nn.Sequential( + basic_blocks.ResidualBlock(cs[6] + cs[2], cs[6], ks=3, stride=1, + dilation=1), + basic_blocks.ResidualBlock(cs[6], cs[6], ks=3, stride=1, dilation=1), + ) + ]) + + self.up3 = nn.ModuleList([ + basic_blocks.BasicDeconvolutionBlock(cs[6], cs[7], ks=2, stride=2), + nn.Sequential( + basic_blocks.ResidualBlock(cs[7] + cs[1], cs[7], ks=3, stride=1, + dilation=1), + basic_blocks.ResidualBlock(cs[7], cs[7], ks=3, stride=1, dilation=1), + ) + ]) + + self.up4 = nn.ModuleList([ + basic_blocks.BasicDeconvolutionBlock(cs[7], cs[8], ks=2, stride=2), + nn.Sequential( + basic_blocks.ResidualBlock(cs[8] + cs[0], cs[8], ks=3, stride=1, + dilation=1), + basic_blocks.ResidualBlock(cs[8], cs[8], ks=3, stride=1, dilation=1), + ) + ]) + + self.classifier = nn.Sequential(nn.Linear(cs[8], self.num_classes)) + + self.criterion = get_loss(config) + self.weight_initialization() + self.dropout = nn.Dropout(0.3, True) + + def weight_initialization(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm1d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward(self, data_dict): + x = data_dict['lidar'] + x.C = x.C.int() + + x0 = self.stem(x) + x1 = self.stage1(x0) + x2 = self.stage2(x1) + x3 = self.stage3(x2) + x4 = self.stage4(x3) + + y1 = self.up1[0](x4) + y1 = torchsparse.cat([y1, x3]) + y1 = self.up1[1](y1) + + y2 = self.up2[0](y1) + y2 = torchsparse.cat([y2, x2]) + y2 = self.up2[1](y2) + + y3 = self.up3[0](y2) + y3 = torchsparse.cat([y3, x1]) + y3 = self.up3[1](y3) + + y4 = self.up4[0](y3) + y4 = torchsparse.cat([y4, x0]) + y4 = self.up4[1](y4) + + output = self.classifier(y4.F) + data_dict['sparse_logits'] = output + data_dict = self.criterion(data_dict) + + return data_dict + + +class get_loss(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + if 'seg_labelweights' in config['dataset_params']: + seg_num_per_class = config['dataset_params']['seg_labelweights'] + seg_labelweights = seg_num_per_class / np.sum(seg_num_per_class) + seg_labelweights = torch.Tensor(np.power(np.amax(seg_labelweights) / seg_labelweights, 1 / 3.0)) + else: + seg_labelweights = None + + self.ce_loss = nn.CrossEntropyLoss( + 
weight=seg_labelweights, + ignore_index=config['dataset_params']['ignore_label'] + ) + self.lovasz_loss = Lovasz_loss( + ignore=config['dataset_params']['ignore_label'] + ) + + def forward(self, data_dict): + lovasz_loss = self.lovasz_loss( + F.softmax(data_dict['sparse_logits'], dim=1), + data_dict['sparse_label'] + ) + seg_loss = self.ce_loss(data_dict['sparse_logits'], data_dict['sparse_label']) + total_loss = lovasz_loss + seg_loss + data_dict['loss'] = total_loss + data_dict['loss_sparse'] = total_loss + data_dict['loss_main_ce'] = seg_loss + data_dict['loss_main_lovasz'] = lovasz_loss + + return data_dict + + diff --git a/TwoDPASS/network/spvcnn.py b/TwoDPASS/network/spvcnn.py new file mode 100644 index 0000000..e365caf --- /dev/null +++ b/TwoDPASS/network/spvcnn.py @@ -0,0 +1,286 @@ +import torch +import numpy as np +import torch.nn as nn +import torchsparse +import torchsparse.nn as spnn +import TwoDPASS.network.torchsparse_utils.basic_blocks as basic_blocks +import torch.nn.functional as F + +from TwoDPASS.network.torchsparse_utils.utils import * +from torchsparse import PointTensor +from TwoDPASS.network.torchsparse_utils.base_model import LightningBaseModel +from TwoDPASS.network.basic_block import Lovasz_loss + + +class get_model(LightningBaseModel): + def __init__(self, config): + super().__init__(config, None) + self.save_hyperparameters() + + cr = config.model_params.cr + cs = config.model_params.layer_num + cs = [int(cr * x) for x in cs] + + self.pres = self.vres = config.model_params.voxel_size + self.num_classes = config.model_params.num_class + + self.stem = nn.Sequential( + spnn.Conv3d(config.model_params.input_dims, cs[0], kernel_size=3, stride=1), + spnn.BatchNorm(cs[0]), spnn.ReLU(True), + spnn.Conv3d(cs[0], cs[0], kernel_size=3, stride=1), + spnn.BatchNorm(cs[0]), spnn.ReLU(True)) + + self.stage1 = nn.Sequential( + basic_blocks.BasicConvolutionBlock(cs[0], cs[0], ks=2, stride=2, dilation=1), + basic_blocks.ResidualBlock(cs[0], cs[1], ks=3, stride=1, dilation=1), + basic_blocks.ResidualBlock(cs[1], cs[1], ks=3, stride=1, dilation=1), + ) + + self.stage2 = nn.Sequential( + basic_blocks.BasicConvolutionBlock(cs[1], cs[1], ks=2, stride=2, dilation=1), + basic_blocks.ResidualBlock(cs[1], cs[2], ks=3, stride=1, dilation=1), + basic_blocks.ResidualBlock(cs[2], cs[2], ks=3, stride=1, dilation=1), + ) + + self.stage3 = nn.Sequential( + basic_blocks.BasicConvolutionBlock(cs[2], cs[2], ks=2, stride=2, dilation=1), + basic_blocks.ResidualBlock(cs[2], cs[3], ks=3, stride=1, dilation=1), + basic_blocks.ResidualBlock(cs[3], cs[3], ks=3, stride=1, dilation=1), + ) + + self.stage4 = nn.Sequential( + basic_blocks.BasicConvolutionBlock(cs[3], cs[3], ks=2, stride=2, dilation=1), + basic_blocks.ResidualBlock(cs[3], cs[4], ks=3, stride=1, dilation=1), + basic_blocks.ResidualBlock(cs[4], cs[4], ks=3, stride=1, dilation=1), + ) + + self.up1 = nn.ModuleList([ + basic_blocks.BasicDeconvolutionBlock(cs[4], cs[5], ks=2, stride=2), + nn.Sequential( + basic_blocks.ResidualBlock(cs[5] + cs[3], cs[5], ks=3, stride=1, + dilation=1), + basic_blocks.ResidualBlock(cs[5], cs[5], ks=3, stride=1, dilation=1), + ) + ]) + + self.up2 = nn.ModuleList([ + basic_blocks.BasicDeconvolutionBlock(cs[5], cs[6], ks=2, stride=2), + nn.Sequential( + basic_blocks.ResidualBlock(cs[6] + cs[2], cs[6], ks=3, stride=1, + dilation=1), + basic_blocks.ResidualBlock(cs[6], cs[6], ks=3, stride=1, dilation=1), + ) + ]) + + self.up3 = nn.ModuleList([ + basic_blocks.BasicDeconvolutionBlock(cs[6], cs[7], ks=2, stride=2), + 
nn.Sequential( + basic_blocks.ResidualBlock(cs[7] + cs[1], cs[7], ks=3, stride=1, + dilation=1), + basic_blocks.ResidualBlock(cs[7], cs[7], ks=3, stride=1, dilation=1), + ) + ]) + + self.up4 = nn.ModuleList([ + basic_blocks.BasicDeconvolutionBlock(cs[7], cs[8], ks=2, stride=2), + nn.Sequential( + basic_blocks.ResidualBlock(cs[8] + cs[0], cs[8], ks=3, stride=1, + dilation=1), + basic_blocks.ResidualBlock(cs[8], cs[8], ks=3, stride=1, dilation=1), + ) + ]) + + self.classifier = nn.Sequential(nn.Linear(cs[8], self.num_classes)) + + self.point_transforms = nn.ModuleList([ + nn.Sequential( + nn.Linear(cs[0], cs[4]), + nn.BatchNorm1d(cs[4]), + nn.ReLU(True), + ), + nn.Sequential( + nn.Linear(cs[4], cs[6]), + nn.BatchNorm1d(cs[6]), + nn.ReLU(True), + ), + nn.Sequential( + nn.Linear(cs[6], cs[8]), + nn.BatchNorm1d(cs[8]), + nn.ReLU(True), + ) + ]) + + self.criterion = get_loss(config) + self.weight_initialization() + self.dropout = nn.Dropout(0.3, True) + + def weight_initialization(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm1d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward(self, data_dict): + x = data_dict['lidar'] + # x = data_dict['lidar'].to('cuda:0') + + # x: SparseTensor z: PointTensor + z = PointTensor(x.F, x.C.float()) + + x0 = initial_voxelize(z, self.pres, self.vres) + + x0 = self.stem(x0) + z0 = voxel_to_point(x0, z, nearest=False) + z0.F = z0.F + + x1 = point_to_voxel(x0, z0) + x1 = self.stage1(x1) + x2 = self.stage2(x1) + x3 = self.stage3(x2) + x4 = self.stage4(x3) + z1 = voxel_to_point(x4, z0) + z1.F = z1.F + self.point_transforms[0](z0.F) + + y1 = point_to_voxel(x4, z1) + y1.F = self.dropout(y1.F) + y1 = self.up1[0](y1) + y1 = torchsparse.cat([y1, x3]) + y1 = self.up1[1](y1) + + y2 = self.up2[0](y1) + y2 = torchsparse.cat([y2, x2]) + y2 = self.up2[1](y2) + z2 = voxel_to_point(y2, z1) + z2.F = z2.F + self.point_transforms[1](z1.F) + + y3 = point_to_voxel(y2, z2) + y3.F = self.dropout(y3.F) + y3 = self.up3[0](y3) + y3 = torchsparse.cat([y3, x1]) + y3 = self.up3[1](y3) + + y4 = self.up4[0](y3) + y4 = torchsparse.cat([y4, x0]) + y4 = self.up4[1](y4) + z3 = voxel_to_point(y4, z2) + z3.F = z3.F + self.point_transforms[2](z2.F) + + output = self.classifier(z3.F) + data_dict['sparse_logits'] = output + data_dict = self.criterion(data_dict) + + return data_dict + + def encode_points(self, data_dict, device=None): + if device: + x = data_dict['lidar'].to(device) + else: + x = data_dict['lidar'] + + # x: SparseTensor z: PointTensor + z = PointTensor(x.F, x.C.float()) + + x0 = initial_voxelize(z, self.pres, self.vres) + + x0 = self.stem(x0) + z0 = voxel_to_point(x0, z, nearest=False) + z0.F = z0.F + + x1 = point_to_voxel(x0, z0) + x1 = self.stage1(x1) + x2 = self.stage2(x1) + x3 = self.stage3(x2) + x4 = self.stage4(x3) + z1 = voxel_to_point(x4, z0) + z1.F = z1.F + self.point_transforms[0](z0.F) + + y1 = point_to_voxel(x4, z1) + y1.F = self.dropout(y1.F) + y1 = self.up1[0](y1) + y1 = torchsparse.cat([y1, x3]) + y1 = self.up1[1](y1) + + y2 = self.up2[0](y1) + y2 = torchsparse.cat([y2, x2]) + y2 = self.up2[1](y2) + z2 = voxel_to_point(y2, z1) + z2.F = z2.F + self.point_transforms[1](z1.F) + + y3 = point_to_voxel(y2, z2) + y3.F = self.dropout(y3.F) + y3 = self.up3[0](y3) + y3 = torchsparse.cat([y3, x1]) + y3 = self.up3[1](y3) + + y4 = self.up4[0](y3) + y4 = torchsparse.cat([y4, x0]) + y4 = self.up4[1](y4) + z3 = voxel_to_point(y4, z2) + z3.F = z3.F + self.point_transforms[2](z2.F) + + # inverse transform voxels to all points, assume the batch 
size is one, and all points belongs to same scene # + inv_map = data_dict['inverse_map'].F + features_mapped = z3.F[inv_map] # per points features instead of per voxel + + return features_mapped + + def decode_points(self, features, device=None): + if device: + features = features.to(device) + + output = self.classifier(features) + + return output + + # def decode_points(self, data_dict, features, device=None): + # if device: + # features = features.to(device) + + # output = self.classifier(features) + # data_dict['sparse_logits'] = output + # data_dict = self.criterion(data_dict, sparse_label=False) + + # return data_dict + + +class get_loss(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + if 'seg_labelweights' in config['dataset_params']: + seg_num_per_class = config['dataset_params']['seg_labelweights'] + seg_labelweights = seg_num_per_class / np.sum(seg_num_per_class) + seg_labelweights = torch.Tensor(np.power(np.amax(seg_labelweights) / seg_labelweights, 1 / 3.0)) + else: + seg_labelweights = None + + self.ce_loss = nn.CrossEntropyLoss( + weight=seg_labelweights, + ignore_index=config['dataset_params']['ignore_label'] + ) + self.lovasz_loss = Lovasz_loss( + ignore=config['dataset_params']['ignore_label'] + ) + + def forward(self, data_dict, sparse_label=True): + if sparse_label: + lovasz_loss = self.lovasz_loss( + F.softmax(data_dict['sparse_logits'], dim=1), + data_dict['sparse_label'] + ) + seg_loss = self.ce_loss(data_dict['sparse_logits'], data_dict['sparse_label']) + else: + lovasz_loss = self.lovasz_loss( + F.softmax(data_dict['sparse_logits'], dim=1), + data_dict['targets_mapped'].F.view(-1,) + ) + seg_loss = self.ce_loss(data_dict['sparse_logits'], data_dict['targets_mapped'].F.view(-1,)) + total_loss = lovasz_loss + seg_loss + data_dict['loss'] = total_loss + data_dict['loss_sparse'] = total_loss + data_dict['loss_main_ce'] = seg_loss + data_dict['loss_main_lovasz'] = lovasz_loss + + return data_dict + + diff --git a/TwoDPASS/network/torchsparse_utils/base_model.py b/TwoDPASS/network/torchsparse_utils/base_model.py new file mode 100644 index 0000000..21982e8 --- /dev/null +++ b/TwoDPASS/network/torchsparse_utils/base_model.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +@author: Xu Yan +@file: base_model.py +@time: 2021/12/7 22:39 +''' +import os +import torch +import yaml +import json +import numpy as np +import pytorch_lightning as pl + +from datetime import datetime +from pytorch_lightning.metrics import Accuracy +from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR, CosineAnnealingWarmRestarts, CosineAnnealingLR +from TwoDPASS.utils.metric_util import IoU +from TwoDPASS.utils.schedulers import cosine_schedule_with_warmup + + +class LightningBaseModel(pl.LightningModule): + def __init__(self, args, criterion): + super().__init__() + self.args = args + self.criterion = criterion + self.train_acc = Accuracy() + self.val_acc = Accuracy(compute_on_step=False) + self.val_iou = IoU(self.args['dataset_params'], compute_on_step=False) + + if self.args['submit_to_server']: + self.submit_dir = os.path.dirname(self.args['checkpoint']) + '/submit_' + datetime.now().strftime('%Y_%m_%d') + with open(self.args['dataset_params']['label_mapping'], 'r') as stream: + self.mapfile = yaml.safe_load(stream) + + self.ignore_label = self.args['dataset_params']['ignore_label'] + + def configure_optimizers(self): + if self.args['train_params']['optimizer'] == 'Adam': + optimizer = torch.optim.Adam(self.parameters(), + 
lr=self.args['train_params']["learning_rate"]) + elif self.args['train_params']['optimizer'] == 'SGD': + optimizer = torch.optim.SGD(self.parameters(), + lr=self.args['train_params']["learning_rate"], + momentum=self.args['train_params']["momentum"], + weight_decay=self.args['train_params']["weight_decay"], + nesterov=self.args['train_params']["nesterov"]) + else: + raise NotImplementedError + + if self.args['train_params']["lr_scheduler"] == 'StepLR': + lr_scheduler = StepLR( + optimizer, + step_size=self.args['train_params']["decay_step"], + gamma=self.args['train_params']["decay_rate"] + ) + elif self.args['train_params']["lr_scheduler"] == 'ReduceLROnPlateau': + lr_scheduler = ReduceLROnPlateau( + optimizer, + mode='max', + factor=self.args['train_params']["decay_rate"], + patience=self.args['train_params']["decay_step"], + verbose=True + ) + elif self.args['train_params']["lr_scheduler"] == 'CosineAnnealingLR': + lr_scheduler = CosineAnnealingLR( + optimizer, + T_max=self.args['train_params']['max_num_epochs']-4, + eta_min=1e-5, + ) + elif self.args['train_params']["lr_scheduler"] == 'CosineAnnealingWarmRestarts': + from functools import partial + lr_scheduler = torch.optim.lr_scheduler.LambdaLR( + optimizer, lr_lambda=partial( + cosine_schedule_with_warmup, + num_epochs=self.args['train_params']['max_num_epochs'], + batch_size=self.args['dataset_params']['train_data_loader']['batch_size'], + dataset_size=self.args['dataset_params']['training_size'], + num_gpu=len(self.args.gpu) + ), + ) + else: + raise NotImplementedError + + scheduler = { + 'scheduler': lr_scheduler, + 'interval': 'step' if self.args['train_params']["lr_scheduler"] == 'CosineAnnealingWarmRestarts' else 'epoch', + 'frequency': 1 + } + + return { + 'optimizer': optimizer, + 'lr_scheduler': scheduler, + 'monitor': self.args.monitor, + } + + def forward(self, data): + pass + + + def training_step(self, data_dict, batch_idx): + data_dict['sparse_label'] = data_dict['targets'].F.long().squeeze(1) + data_dict = self.forward(data_dict) + + data_dict['targets'] = data_dict['targets'].F.squeeze(1) + self.train_acc(data_dict['sparse_logits'].argmax(1)[data_dict['targets'] != self.ignore_label], + data_dict['targets'][data_dict['targets'] != self.ignore_label]) + self.log('train/acc', self.train_acc, on_epoch=True) + self.log('train/loss_sparse', data_dict['loss_sparse']) + self.log('train/loss_main_ce', data_dict['loss_main_ce']) + self.log('train/loss_main_lovasz', data_dict['loss_main_lovasz']) + + return data_dict['loss'] + + + def validation_step(self, data_dict, batch_idx): + data_dict['sparse_label'] = data_dict['targets'].F.long().squeeze(1) + data_dict = self.forward(data_dict) + + indices = data_dict['ref_index'].F + raw_labels = data_dict['targets_mapped'].F[data_dict['targets_mapped'].C[:, -1] == 0].squeeze(1).cpu() + origin_len = data_dict['origin_len'] + vote_logits = torch.zeros((len(raw_labels), self.num_classes)) + + invs = data_dict['inverse_map'] + all_labels = data_dict['targets_mapped'] + _outputs = [] + _targets = [] + for idx in range(int(invs.C[:, -1].max() + 1)): + cur_scene_pts = (data_dict['lidar'].C[:, -1] == idx).cpu().numpy() + cur_inv = invs.F[invs.C[:, -1] == idx].cpu().numpy() + cur_label = (all_labels.C[:, -1] == idx).cpu().numpy() + outputs_mapped = data_dict['sparse_logits'][cur_scene_pts][cur_inv] + targets_mapped = all_labels.F[cur_label] + _outputs.append(outputs_mapped) + _targets.append(targets_mapped) + prediction_mapped = torch.cat(_outputs, 0) + + if self.args['test']: + 
vote_logits.index_add_(0, indices.cpu(), prediction_mapped.cpu()) + prediction = vote_logits.argmax(1).cpu() + if self.args['dataset_params']['pc_dataset_type'] == 'SemanticKITTI_multiscan': + prediction = prediction[:origin_len] + raw_labels = raw_labels[:origin_len] + else: + prediction = prediction_mapped.argmax(1).cpu() + raw_labels = torch.cat(_targets, 0).squeeze(1).cpu() + + if self.ignore_label != 0: + prediction = prediction[raw_labels != self.ignore_label] + raw_labels = raw_labels[raw_labels != self.ignore_label] + prediction += 1 + raw_labels += 1 + + self.val_acc(prediction, raw_labels) + self.log('val/acc', self.val_acc, on_epoch=True) + self.val_iou(prediction.cpu().detach().numpy(), + raw_labels.cpu().detach().numpy(), + ) + + return data_dict['loss'] + + + def test_step(self, data_dict, batch_idx): + path = data_dict['root'][0] + data_dict['sparse_label'] = data_dict['targets'].F.long().squeeze(1) + data_dict = self.forward(data_dict) + + indices = data_dict['ref_index'].F + raw_labels = data_dict['targets_mapped'].F[data_dict['targets_mapped'].C[:, -1] == 0].squeeze(1).cpu() + origin_len = data_dict['origin_len'] + vote_logits = torch.zeros((len(raw_labels), self.num_classes)) + + invs = data_dict['inverse_map'] + all_labels = data_dict['targets_mapped'] + _outputs = [] + _targets = [] + ##### previously we map the points to voxels and predict the logits for each voxel, then we will assign points based on the voxel it falls in ##### + ####### inverse points to voxel map? ######## + for idx in range(int(invs.C[:, -1].max() + 1)): + cur_scene_pts = (data_dict['lidar'].C[:, -1] == idx).cpu().numpy() + cur_inv = invs.F[invs.C[:, -1] == idx].cpu().numpy() + cur_label = (all_labels.C[:, -1] == idx).cpu().numpy() + outputs_mapped = data_dict['sparse_logits'][cur_scene_pts][cur_inv] + targets_mapped = all_labels.F[cur_label] + _outputs.append(outputs_mapped) + _targets.append(targets_mapped) + prediction_mapped = torch.cat(_outputs, 0) + + vote_logits.index_add_(0, indices.cpu(), prediction_mapped.cpu()) + prediction = vote_logits.argmax(1) + + ##### TODO: separate encoder and decoder ##### + per_point_features = self.encode_points(data_dict) + data_dict_points = self.decode_points(data_dict, per_point_features) + + indices = data_dict['ref_index'].F + raw_labels = data_dict['targets_mapped'].F[data_dict['targets_mapped'].C[:, -1] == 0].squeeze(1).cpu() + origin_len = data_dict['origin_len'] + vote_logits = torch.zeros((len(raw_labels), self.num_classes)) + + invs = data_dict['inverse_map'] + all_labels = data_dict['targets_mapped'] + _outputs_points = [] + _targets = [] + for idx in range(int(invs.C[:, -1].max() + 1)): + cur_scene_pts = (data_dict['lidar'].C[:, -1] == idx).cpu().numpy() + cur_inv = invs.F[invs.C[:, -1] == idx].cpu().numpy() + cur_label = (all_labels.C[:, -1] == idx).cpu().numpy() + + outputs_mapped = data_dict_points['sparse_logits'] + targets_mapped = all_labels.F + _outputs_points.append(outputs_mapped) + _targets.append(targets_mapped) + prediction_mapped = torch.cat(_outputs_points, 0) + + vote_logits.index_add_(0, indices.cpu(), prediction_mapped.cpu()) + prediction_per_points = vote_logits.argmax(1) + ##### TODO: separate encoder and decoder ##### + + if self.ignore_label != 0: + prediction = prediction[raw_labels != self.ignore_label] + raw_labels = raw_labels[raw_labels != self.ignore_label] + prediction += 1 + raw_labels += 1 + + if not self.args['submit_to_server']: + self.val_acc(prediction, raw_labels) + self.log('val/acc', self.val_acc, 
on_epoch=True) + self.val_iou(prediction.cpu().detach().numpy(), + raw_labels.cpu().detach().numpy(), + ) + else: + if self.args['dataset_params']['pc_dataset_type'] != 'nuScenes': + components = path.split('/') + sequence = components[-3] + points_name = components[-1] + label_name = points_name.replace('bin', 'label') + full_save_dir = os.path.join(self.submit_dir, 'sequences', sequence, 'predictions') + os.makedirs(full_save_dir, exist_ok=True) + full_label_name = os.path.join(full_save_dir, label_name) + + if os.path.exists(full_label_name): + print('%s already exsist...' % (label_name)) + pass + + valid_labels = np.vectorize(self.mapfile['learning_map_inv'].__getitem__) + original_label = valid_labels(vote_logits.argmax(1).cpu().numpy().astype(int)) + final_preds = original_label.astype(np.uint32) + final_preds.tofile(full_label_name) + + else: + meta_dict = { + "meta": { + "use_camera": False, + "use_lidar": True, + "use_map": False, + "use_radar": False, + "use_external": False, + } + } + os.makedirs(os.path.join(self.submit_dir, 'test'), exist_ok=True) + with open(os.path.join(self.submit_dir, 'test', 'submission.json'), 'w', encoding='utf-8') as f: + json.dump(meta_dict, f) + original_label = prediction.cpu().numpy().astype(np.uint8) + + assert all((original_label > 0) & (original_label < 17)), \ + "Error: Array for predictions must be between 1 and 16 (inclusive)." + + full_save_dir = os.path.join(self.submit_dir, 'lidarseg/test') + full_label_name = os.path.join(full_save_dir, path + '_lidarseg.bin') + os.makedirs(full_save_dir, exist_ok=True) + + if os.path.exists(full_label_name): + print('%s already exsist...' % (full_label_name)) + else: + original_label.tofile(full_label_name) + + return data_dict['loss'] + + + def validation_epoch_end(self, outputs): + iou, best_miou = self.val_iou.compute() + mIoU = np.nanmean(iou) + str_print = '' + self.log('val/mIoU', mIoU, on_epoch=True) + self.log('val/best_miou', best_miou, on_epoch=True) + str_print += 'Validation per class iou: ' + + for class_name, class_iou in zip(self.val_iou.unique_label_str, iou): + str_print += '\n%s : %.2f%%' % (class_name, class_iou * 100) + + str_print += '\nCurrent val miou is %.3f while the best val miou is %.3f' % (mIoU * 100, best_miou * 100) + self.print(str_print) + + + def test_epoch_end(self, outputs): + if not self.args['submit_to_server']: + iou, best_miou = self.val_iou.compute() + mIoU = np.nanmean(iou) + str_print = '' + self.log('val/mIoU', mIoU, on_epoch=True) + self.log('val/best_miou', best_miou, on_epoch=True) + str_print += 'Validation per class iou: ' + + for class_name, class_iou in zip(self.val_iou.unique_label_str, iou): + str_print += '\n%s : %.2f%%' % (class_name, class_iou * 100) + + str_print += '\nCurrent val miou is %.3f while the best val miou is %.3f' % (mIoU * 100, best_miou * 100) + self.print(str_print) + diff --git a/TwoDPASS/network/torchsparse_utils/basic_blocks.py b/TwoDPASS/network/torchsparse_utils/basic_blocks.py new file mode 100644 index 0000000..db7d8a7 --- /dev/null +++ b/TwoDPASS/network/torchsparse_utils/basic_blocks.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +@author: Xu Yan +@file: basic_blocks.py +@time: 2021/4/14 22:53 +''' +import torch +import torch.nn as nn +import torchsparse.nn as spnn +import torch_scatter + + +class BasicConvolutionBlock(nn.Module): + def __init__(self, inc, outc, ks=3, stride=1, dilation=1): + super().__init__() + self.net = nn.Sequential( + spnn.Conv3d( + inc, + outc, + kernel_size=ks, + dilation=dilation, 
+ stride=stride), spnn.BatchNorm(outc), + spnn.ReLU(True)) + + def forward(self, x): + out = self.net(x) + return out + + +class BasicDeconvolutionBlock(nn.Module): + def __init__(self, inc, outc, ks=3, stride=1): + super().__init__() + self.net = nn.Sequential( + spnn.Conv3d( + inc, + outc, + kernel_size=ks, + stride=stride, + transposed=True), + spnn.BatchNorm(outc), + spnn.ReLU(True)) + + def forward(self, x): + return self.net(x) + + +class ResidualBlock(nn.Module): + def __init__(self, inc, outc, ks=3, stride=1, dilation=1): + super().__init__() + self.net = nn.Sequential( + spnn.Conv3d( + inc, + outc, + kernel_size=ks, + dilation=dilation, + stride=stride), spnn.BatchNorm(outc), + spnn.ReLU(True), + spnn.Conv3d( + outc, + outc, + kernel_size=ks, + dilation=dilation, + stride=1), + spnn.BatchNorm(outc)) + + self.downsample = nn.Sequential() if (inc == outc and stride == 1) else \ + nn.Sequential( + spnn.Conv3d(inc, outc, kernel_size=1, dilation=1, stride=stride), + spnn.BatchNorm(outc) + ) + + self.ReLU = spnn.ReLU(True) + + def forward(self, x): + out = self.ReLU(self.net(x) + self.downsample(x)) + return out + diff --git a/TwoDPASS/network/torchsparse_utils/utils.py b/TwoDPASS/network/torchsparse_utils/utils.py new file mode 100644 index 0000000..086176a --- /dev/null +++ b/TwoDPASS/network/torchsparse_utils/utils.py @@ -0,0 +1,100 @@ +import torch +import torchsparse.nn.functional as F +from torchsparse import PointTensor, SparseTensor +from torchsparse.nn.utils import get_kernel_offsets + +__all__ = ['initial_voxelize', 'point_to_voxel', 'voxel_to_point'] + + +# z: PointTensor +# return: SparseTensor +def initial_voxelize(z, init_res, after_res): + new_float_coord = torch.cat( + [(z.C[:, :3] * init_res) / after_res, z.C[:, -1].view(-1, 1)], 1) + + pc_hash = F.sphash(torch.floor(new_float_coord).int()) + sparse_hash = torch.unique(pc_hash) + idx_query = F.sphashquery(pc_hash, sparse_hash) + counts = F.spcount(idx_query.int(), len(sparse_hash)) + + inserted_coords = F.spvoxelize(torch.floor(new_float_coord), idx_query, + counts) + inserted_coords = torch.round(inserted_coords).int() + inserted_feat = F.spvoxelize(z.F, idx_query, counts) + + new_tensor = SparseTensor(inserted_feat, inserted_coords, 1) + new_tensor.cmaps.setdefault(new_tensor.stride, new_tensor.coords) + z.additional_features['idx_query'][1] = idx_query + z.additional_features['counts'][1] = counts + z.C = new_float_coord + + return new_tensor + + +# x: SparseTensor, z: PointTensor +# return: SparseTensor +def point_to_voxel(x, z): + if z.additional_features is None or z.additional_features.get( + 'idx_query') is None or z.additional_features['idx_query'].get( + x.s) is None: + pc_hash = F.sphash( + torch.cat([ + torch.floor(z.C[:, :3] / x.s[0]).int() * x.s[0], + z.C[:, -1].int().view(-1, 1) + ], 1)) + sparse_hash = F.sphash(x.C) + idx_query = F.sphashquery(pc_hash, sparse_hash) + counts = F.spcount(idx_query.int(), x.C.shape[0]) + z.additional_features['idx_query'][x.s] = idx_query + z.additional_features['counts'][x.s] = counts + else: + idx_query = z.additional_features['idx_query'][x.s] + counts = z.additional_features['counts'][x.s] + + inserted_feat = F.spvoxelize(z.F, idx_query, counts) + new_tensor = SparseTensor(inserted_feat, x.C, x.s) + new_tensor.cmaps = x.cmaps + new_tensor.kmaps = x.kmaps + + return new_tensor + + +# x: SparseTensor, z: PointTensor +# return: PointTensor +def voxel_to_point(x, z, nearest=False): + if z.idx_query is None or z.weights is None or z.idx_query.get( + x.s) is None or 
z.weights.get(x.s) is None: + off = get_kernel_offsets(2, x.s, 1, device=z.F.device) + old_hash = F.sphash( + torch.cat([ + torch.floor(z.C[:, :3] / x.s[0]).int() * x.s[0], + z.C[:, -1].int().view(-1, 1) + ], 1), off) + pc_hash = F.sphash(x.C.to(z.F.device)) + idx_query = F.sphashquery(old_hash, pc_hash) + weights = F.calc_ti_weights(z.C, idx_query, + scale=x.s[0]).transpose(0, 1).contiguous() + idx_query = idx_query.transpose(0, 1).contiguous() + if nearest: + weights[:, 1:] = 0. + idx_query[:, 1:] = -1 + new_feat = F.spdevoxelize(x.F, idx_query, weights) + new_tensor = PointTensor(new_feat, + z.C, + idx_query=z.idx_query, + weights=z.weights) + new_tensor.additional_features = z.additional_features + new_tensor.idx_query[x.s] = idx_query + new_tensor.weights[x.s] = weights + z.idx_query[x.s] = idx_query + z.weights[x.s] = weights + + else: + new_feat = F.spdevoxelize(x.F, z.idx_query.get(x.s), z.weights.get(x.s)) + new_tensor = PointTensor(new_feat, + z.C, + idx_query=z.idx_query, + weights=z.weights) + new_tensor.additional_features = z.additional_features + + return new_tensor \ No newline at end of file diff --git a/TwoDPASS/network/voxel_fea_generator.py b/TwoDPASS/network/voxel_fea_generator.py new file mode 100644 index 0000000..d031ca1 --- /dev/null +++ b/TwoDPASS/network/voxel_fea_generator.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +@author: Xu Yan +@file: voxel_fea_generator.py +@time: 2021/8/4 13:36 +''' +import torch +import torch_scatter +import torch.nn as nn +import numpy as np +import spconv.pytorch as spconv + + +class voxelization(nn.Module): + def __init__(self, coors_range_xyz, spatial_shape, scale_list): + super(voxelization, self).__init__() + self.spatial_shape = spatial_shape + self.scale_list = scale_list + [1] + self.coors_range_xyz = coors_range_xyz + + @staticmethod + def sparse_quantize(pc, coors_range, spatial_shape): + idx = spatial_shape * (pc - coors_range[0]) / (coors_range[1] - coors_range[0]) + return idx.long() + + def forward(self, data_dict): + pc = data_dict['points'][:, :3] + + for idx, scale in enumerate(self.scale_list): + xidx = self.sparse_quantize(pc[:, 0], self.coors_range_xyz[0], np.ceil(self.spatial_shape[0] / scale)) + yidx = self.sparse_quantize(pc[:, 1], self.coors_range_xyz[1], np.ceil(self.spatial_shape[1] / scale)) + zidx = self.sparse_quantize(pc[:, 2], self.coors_range_xyz[2], np.ceil(self.spatial_shape[2] / scale)) + + bxyz_indx = torch.stack([data_dict['batch_idx'], xidx, yidx, zidx], dim=-1).long() + unq, unq_inv, unq_cnt = torch.unique(bxyz_indx, return_inverse=True, return_counts=True, dim=0) + unq = torch.cat([unq[:, 0:1], unq[:, [3, 2, 1]]], dim=1) + data_dict['scale_{}'.format(scale)] = { + 'full_coors': bxyz_indx, + 'coors_inv': unq_inv, + 'coors': unq.type(torch.int32) + } + return data_dict + + +class voxel_3d_generator(nn.Module): + def __init__(self, in_channels, out_channels, coors_range_xyz, spatial_shape): + super(voxel_3d_generator, self).__init__() + self.spatial_shape = spatial_shape + self.coors_range_xyz = coors_range_xyz + self.PPmodel = nn.Sequential( + nn.Linear(in_channels + 6, out_channels), + nn.ReLU(True), + nn.Linear(out_channels, out_channels) + ) + + def prepare_input(self, point, grid_ind, inv_idx): + pc_mean = torch_scatter.scatter_mean(point[:, :3], inv_idx, dim=0)[inv_idx] + nor_pc = point[:, :3] - pc_mean + + coors_range_xyz = torch.Tensor(self.coors_range_xyz) + cur_grid_size = torch.Tensor(self.spatial_shape) + crop_range = coors_range_xyz[:, 1] - coors_range_xyz[:, 0] + 
intervals = (crop_range / cur_grid_size).to(point.device) + voxel_centers = grid_ind * intervals + coors_range_xyz[:, 0].to(point.device) + center_to_point = point[:, :3] - voxel_centers + + pc_feature = torch.cat((point, nor_pc, center_to_point), dim=1) + return pc_feature + + def forward(self, data_dict): + pt_fea = self.prepare_input( + data_dict['points'], + data_dict['scale_1']['full_coors'][:, 1:], + data_dict['scale_1']['coors_inv'] + ) + pt_fea = self.PPmodel(pt_fea) + + features = torch_scatter.scatter_mean(pt_fea, data_dict['scale_1']['coors_inv'], dim=0) + data_dict['sparse_tensor'] = spconv.SparseConvTensor( + features=features, + indices=data_dict['scale_1']['coors'].int(), + spatial_shape=np.int32(self.spatial_shape)[::-1].tolist(), + batch_size=data_dict['batch_size'] + ) + + data_dict['coors'] = data_dict['scale_1']['coors'] + data_dict['coors_inv'] = data_dict['scale_1']['coors_inv'] + data_dict['full_coors'] = data_dict['scale_1']['full_coors'] + + return data_dict \ No newline at end of file diff --git a/TwoDPASS/pretrained/SPVCNN/SPVCNN-semantickitti.yaml b/TwoDPASS/pretrained/SPVCNN/SPVCNN-semantickitti.yaml new file mode 100644 index 0000000..e30a5ce --- /dev/null +++ b/TwoDPASS/pretrained/SPVCNN/SPVCNN-semantickitti.yaml @@ -0,0 +1,91 @@ +# Config format schema number +format_version: 2 + + +################### +## Model options +model_params: + model_architecture: "spvcnn" + + input_dims: 4 + voxel_size: 0.05 + cr: 1 # enlarge factor of layer_num + layer_num: + - 32 + - 32 + - 64 + - 128 + - 256 + - 256 + - 128 + - 96 + - 96 + + num_class: 20 + + +################### +## Dataset options +dataset_params: + training_size: 19132 + dataset_type: "voxel_dataset" + pc_dataset_type: "SemanticKITTI" + collate_type: "collate_fn_voxel" + ignore_label: 0 + label_mapping: "./config/label_mapping/semantic-kitti.yaml" + + seg_labelweights: + - 0 + - 55437630 + - 320797 + - 541736 + - 2578735 + - 3274484 + - 552662 + - 184064 + - 78858 + - 240942562 + - 17294618 + - 170599734 + - 6369672 + - 230413074 + - 101130274 + - 476491114 + - 9833174 + - 129609852 + - 4506626 + - 1168181 + + train_data_loader: + data_path: "./dataset/SemanticKitti/dataset/sequences/" + batch_size: 1 + shuffle: True + num_workers: 1 + rotate_aug: True + flip_aug: True + scale_aug: True + transform_aug: True + dropout_aug: True + + val_data_loader: + data_path: "./dataset/SemanticKitti/dataset/sequences/" + shuffle: False + num_workers: 1 + batch_size: 0 + rotate_aug: False + flip_aug: False + scale_aug: False + transform_aug: False + dropout_aug: False + + +################### +## Train params +train_params: + max_num_epochs: 64 + learning_rate: 2.4e-1 + optimizer: SGD # [SGD, Adam] + lr_scheduler: CosineAnnealingWarmRestarts # [StepLR, ReduceLROnPlateau, CosineAnnealingLR, CosineAnnealingWarmRestarts] + momentum: 0.9 + nesterov: True + weight_decay: 1.0e-4 \ No newline at end of file diff --git a/TwoDPASS/robust_test.py b/TwoDPASS/robust_test.py new file mode 100644 index 0000000..4598d67 --- /dev/null +++ b/TwoDPASS/robust_test.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +@author: Xu Yan +@file: robust_test.py +@time: 2022/10/7 21:24 +''' + +import os +import yaml +import torch +import datetime +import importlib +import numpy as np +import pandas as pd +import pytorch_lightning as pl + +from pathlib import Path +from easydict import EasyDict +from argparse import ArgumentParser +from dataloader.corruption_dataset import SemanticKITTIC +from dataloader.dataset import get_model_class, 
get_collate_class +import warnings + +warnings.filterwarnings("ignore") + + +def load_yaml(file_name): + with open(file_name, 'r') as f: + try: + config = yaml.load(f, Loader=yaml.FullLoader) + except: + config = yaml.load(f) + return config + + +def parse_config(): + parser = ArgumentParser() + # general + parser.add_argument('--gpu', type=int, nargs='+', default=(0,), help='specify gpu devices') + parser.add_argument("--seed", default=0, type=int) + parser.add_argument('--config_path', default='config/semantickitti/2dpass-semantickitti.yaml') + # testing + parser.add_argument('--num_vote', type=int, default=12, help='number of voting in the test') + parser.add_argument('--checkpoint', type=str, default=None, help='load checkpoint') + # debug + parser.add_argument('--save_prediction', default=False, action='store_true') + parser.add_argument('--debug', default=False, action='store_true') + + args = parser.parse_args() + config = load_yaml(args.config_path) + config.update(vars(args)) # override the configuration using the value in args + + # voting test + config['dataset_params']['val_data_loader']['batch_size'] = args.num_vote + config['baseline_only'] = False + config['submit_to_server'] = args.save_prediction + config['test'] = True + + if args.num_vote > 1: + config['dataset_params']['val_data_loader']['rotate_aug'] = True + config['dataset_params']['val_data_loader']['transform_aug'] = True + if args.debug: + config['dataset_params']['val_data_loader']['batch_size'] = 2 + config['dataset_params']['val_data_loader']['num_workers'] = 0 + + return EasyDict(config) + + +def build_loader(config, corruption): + pc_dataset = SemanticKITTIC + # dataset_type = point_dataset + dataset_type = get_model_class(config['dataset_params']['dataset_type']) + val_config = config['dataset_params']['val_data_loader'] + + test_pt_dataset = pc_dataset( + config, + data_path=val_config['data_path'], + corruption=corruption, + num_vote=val_config["batch_size"] + ) + test_dataset_loader = torch.utils.data.DataLoader( + dataset=dataset_type(test_pt_dataset, config, val_config, num_vote=val_config["batch_size"]), + batch_size=val_config["batch_size"], + collate_fn=get_collate_class(config['dataset_params']['collate_type']), + shuffle=False, + num_workers=val_config["num_workers"] + ) + return test_dataset_loader + + +if __name__ == '__main__': + # parameters + configs = parse_config() + print(configs) + + # corruption dataset + with open('config/corruption/semantickittic.yaml', 'r') as stream: + corruption = yaml.safe_load(stream) + print(corruption) + + save_path = os.path.join(Path(configs.checkpoint).parent, + 'robust_test_' + str(datetime.datetime.now().strftime('%Y-%m-%d'))) + os.makedirs(save_path, exist_ok=True) + + # setting + os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, configs.gpu)) + num_gpu = len(configs.gpu) + assert num_gpu == 1, 'multi-GPU testing is not available!' + + # reproducibility + torch.manual_seed(configs.seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = True + np.random.seed(configs.seed) + config_path = configs.config_path + results_dict = {} + + model_file = importlib.import_module('network.' 
+ configs['model_params']['model_architecture']) + my_model = model_file.get_model(configs) + + pl.seed_everything(configs.seed) + my_model = my_model.load_from_checkpoint(configs.checkpoint, config=configs) + + for idx, cor in enumerate(corruption['corruption_name']): + print('[{}/{}] Start robust testing for {}...'.format(idx + 1, len(corruption['corruption_name']) + 1, cor)) + test_dataset_loader = build_loader(configs, cor) + + tester = pl.Trainer( + gpus=[i for i in range(num_gpu)], + accelerator='ddp', + resume_from_checkpoint=configs.checkpoint + ) + results = tester.test(my_model, test_dataset_loader) + results_dict[cor] = [results[0]['val/mIoU'], results[0]['val/acc']] + + df = pd.DataFrame(results_dict) + df.index = ['val/mIoU', 'val/acc'] + print(df.T) + df.T.to_csv(os.path.join(save_path, 'summary.csv')) diff --git a/TwoDPASS/utils/lovasz_loss.py b/TwoDPASS/utils/lovasz_loss.py new file mode 100644 index 0000000..ad85c45 --- /dev/null +++ b/TwoDPASS/utils/lovasz_loss.py @@ -0,0 +1,321 @@ +""" +Lovasz-Softmax and Jaccard hinge loss in PyTorch +Maxim Berman 2018 ESAT-PSI KU Leuven (MIT License) +""" + +from __future__ import print_function, division + +import torch +from torch.autograd import Variable +import torch.nn.functional as F +import numpy as np +try: + from itertools import ifilterfalse +except ImportError: # py3k + from itertools import filterfalse as ifilterfalse + + +def lovasz_grad(gt_sorted): + """ + Computes gradient of the Lovasz extension w.r.t sorted errors + See Alg. 1 in paper + """ + p = len(gt_sorted) + gts = gt_sorted.sum() + intersection = gts - gt_sorted.float().cumsum(0) + union = gts + (1 - gt_sorted).float().cumsum(0) + jaccard = 1. - intersection / union + if p > 1: # cover 1-pixel case + jaccard[1:p] = jaccard[1:p] - jaccard[0:-1] + return jaccard + + +def iou_binary(preds, labels, EMPTY=1., ignore=None, per_image=True): + """ + IoU for foreground class + binary: 1 foreground, 0 background + """ + if not per_image: + preds, labels = (preds,), (labels,) + ious = [] + for pred, label in zip(preds, labels): + intersection = ((label == 1) & (pred == 1)).sum() + union = ((label == 1) | ((pred == 1) & (label != ignore))).sum() + if not union: + iou = EMPTY + else: + iou = float(intersection) / float(union) + ious.append(iou) + iou = mean(ious) # mean accross images if per_image + return 100 * iou + + +def iou(preds, labels, C, EMPTY=1., ignore=None, per_image=False): + """ + Array of IoU for each (non ignored) class + """ + if not per_image: + preds, labels = (preds,), (labels,) + ious = [] + for pred, label in zip(preds, labels): + iou = [] + for i in range(C): + if i != ignore: # The ignored label is sometimes among predicted classes (ENet - CityScapes) + intersection = ((label == i) & (pred == i)).sum() + union = ((label == i) | ((pred == i) & (label != ignore))).sum() + if not union: + iou.append(EMPTY) + else: + iou.append(float(intersection) / float(union)) + ious.append(iou) + ious = [mean(iou) for iou in zip(*ious)] # mean accross images if per_image + return 100 * np.array(ious) + + +# --------------------------- BINARY LOSSES --------------------------- + + +def lovasz_hinge(logits, labels, per_image=True, ignore=None): + """ + Binary Lovasz hinge loss + logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty) + labels: [B, H, W] Tensor, binary ground truth masks (0 or 1) + per_image: compute the loss per image instead of per batch + ignore: void class id + """ + if per_image: + loss = 
mean(lovasz_hinge_flat(*flatten_binary_scores(log.unsqueeze(0), lab.unsqueeze(0), ignore)) + for log, lab in zip(logits, labels)) + else: + loss = lovasz_hinge_flat(*flatten_binary_scores(logits, labels, ignore)) + return loss + + +def lovasz_hinge_flat(logits, labels): + """ + Binary Lovasz hinge loss + logits: [P] Variable, logits at each prediction (between -\infty and +\infty) + labels: [P] Tensor, binary ground truth labels (0 or 1) + ignore: label to ignore + """ + if len(labels) == 0: + # only void pixels, the gradients should be 0 + return logits.sum() * 0. + signs = 2. * labels.float() - 1. + errors = (1. - logits * Variable(signs)) + errors_sorted, perm = torch.sort(errors, dim=0, descending=True) + perm = perm.data + gt_sorted = labels[perm] + grad = lovasz_grad(gt_sorted) + loss = torch.dot(F.relu(errors_sorted), Variable(grad)) + return loss + + +def flatten_binary_scores(scores, labels, ignore=None): + """ + Flattens predictions in the batch (binary case) + Remove labels equal to 'ignore' + """ + scores = scores.view(-1) + labels = labels.view(-1) + if ignore is None: + return scores, labels + valid = (labels != ignore) + vscores = scores[valid] + vlabels = labels[valid] + return vscores, vlabels + + +class StableBCELoss(torch.nn.modules.Module): + def __init__(self): + super(StableBCELoss, self).__init__() + + def forward(self, input, target): + neg_abs = - input.abs() + loss = input.clamp(min=0) - input * target + (1 + neg_abs.exp()).log() + return loss.mean() + + +def binary_xloss(logits, labels, ignore=None): + """ + Binary Cross entropy loss + logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty) + labels: [B, H, W] Tensor, binary ground truth masks (0 or 1) + ignore: void class id + """ + logits, labels = flatten_binary_scores(logits, labels, ignore) + loss = StableBCELoss()(logits, Variable(labels.float())) + return loss + + +# --------------------------- MULTICLASS LOSSES --------------------------- + + +def lovasz_softmax(probas, labels, classes='present', per_image=False, ignore=None): + """ + Multi-class Lovasz-Softmax loss + probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1). + Interpreted as binary (sigmoid) output with outputs of size [B, H, W]. + labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1) + classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. + per_image: compute the loss per image instead of per batch + ignore: void class labels + """ + if per_image: + loss = mean(lovasz_softmax_flat(*flatten_probas(prob.unsqueeze(0), lab.unsqueeze(0), ignore), classes=classes) + for prob, lab in zip(probas, labels)) + else: + loss = lovasz_softmax_flat(*flatten_probas(probas, labels, ignore), classes=classes) + return loss + + +def lovasz_softmax_flat(probas, labels, classes='present'): + """ + Multi-class Lovasz-Softmax loss + probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1) + labels: [P] Tensor, ground truth labels (between 0 and C - 1) + classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. + """ + if probas.numel() == 0: + # only void pixels, the gradients should be 0 + return probas * 0. 
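+ # For each class to be scored (all classes, or only those present in the labels),
+ # the absolute prediction errors are sorted in descending order and dotted with the
+ # Lovasz gradient of the correspondingly sorted ground truth; the result is averaged
+ # over the selected classes.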
+ C = probas.size(1) + losses = [] + class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes + for c in class_to_sum: + fg = (labels == c).float() # foreground for class c + if (classes is 'present' and fg.sum() == 0): + continue + if C == 1: + if len(classes) > 1: + raise ValueError('Sigmoid output possible only with 1 class') + class_pred = probas[:, 0] + else: + class_pred = probas[:, c] + errors = (Variable(fg) - class_pred).abs() + errors_sorted, perm = torch.sort(errors, 0, descending=True) + perm = perm.data + fg_sorted = fg[perm] + losses.append(torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted)))) + return mean(losses) + + +def flatten_probas(probas, labels, ignore=None): + """ + Flattens predictions in the batch + """ + if probas.dim() == 3: + # assumes output of a sigmoid layer + B, C, N = probas.size() + probas = probas.view(B, C, 1, N).permute(0, 2, 3, 1).contiguous().view(-1, C) + elif probas.dim() == 5: + # 3D segmentation + B, C, L, H, W = probas.size() + probas = probas.contiguous().permute(0, 2, 3, 4, 1).contiguous().view(-1, C) + # B, C, H, W = probas.size() + # probas = probas.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C + labels = labels.view(-1) + if ignore is None: + return probas, labels + valid = (labels != ignore) + vprobas = probas[valid.nonzero(as_tuple=False).squeeze()] + vlabels = labels[valid] + return vprobas, vlabels + + +def xloss(logits, labels, ignore=None): + """ + Cross entropy loss + """ + return F.cross_entropy(logits, Variable(labels), ignore_index=255) + + +def jaccard_loss(probas, labels, ignore=None, smooth=100, bk_class=None): + """ + Something wrong with this loss + Multi-class Lovasz-Softmax loss + probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1). + Interpreted as binary (sigmoid) output with outputs of size [B, H, W]. + labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1) + classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. + per_image: compute the loss per image instead of per batch + ignore: void class labels + """ + vprobas, vlabels = flatten_probas(probas, labels, ignore) + + true_1_hot = torch.eye(vprobas.shape[1])[vlabels] + + if bk_class: + one_hot_assignment = torch.ones_like(vlabels) + one_hot_assignment[vlabels == bk_class] = 0 + one_hot_assignment = one_hot_assignment.float().unsqueeze(1) + true_1_hot = true_1_hot * one_hot_assignment + + true_1_hot = true_1_hot.to(vprobas.device) + intersection = torch.sum(vprobas * true_1_hot) + cardinality = torch.sum(vprobas + true_1_hot) + loss = (intersection + smooth / (cardinality - intersection + smooth)).mean() + return (1 - loss) * smooth + + +def hinge_jaccard_loss(probas, labels, ignore=None, classes='present', hinge=0.1, smooth=100): + """ + Multi-class Hinge Jaccard loss + probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1). + Interpreted as binary (sigmoid) output with outputs of size [B, H, W]. + labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1) + classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. 
+ ignore: void class labels + """ + vprobas, vlabels = flatten_probas(probas, labels, ignore) + C = vprobas.size(1) + losses = [] + class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes + for c in class_to_sum: + if c in vlabels: + c_sample_ind = vlabels == c + cprobas = vprobas[c_sample_ind, :] + non_c_ind = np.array([a for a in class_to_sum if a != c]) + class_pred = cprobas[:, c] + max_non_class_pred = torch.max(cprobas[:, non_c_ind], dim=1)[0] + TP = torch.sum(torch.clamp(class_pred - max_non_class_pred, max=hinge) + 1.) + smooth + FN = torch.sum(torch.clamp(max_non_class_pred - class_pred, min=-hinge) + hinge) + + if (~c_sample_ind).sum() == 0: + FP = 0 + else: + nonc_probas = vprobas[~c_sample_ind, :] + class_pred = nonc_probas[:, c] + max_non_class_pred = torch.max(nonc_probas[:, non_c_ind], dim=1)[0] + FP = torch.sum(torch.clamp(class_pred - max_non_class_pred, max=hinge) + 1.) + + losses.append(1 - TP / (TP + FP + FN)) + + if len(losses) == 0: return 0 + return mean(losses) + + +# --------------------------- HELPER FUNCTIONS --------------------------- +def isnan(x): + return x != x + + +def mean(l, ignore_nan=False, empty=0): + """ + nanmean compatible with generators. + """ + l = iter(l) + if ignore_nan: + l = ifilterfalse(isnan, l) + try: + n = 1 + acc = next(l) + except StopIteration: + if empty == 'raise': + raise ValueError('Empty mean') + return empty + for n, v in enumerate(l, 2): + acc += v + if n == 1: + return acc + return acc / n diff --git a/TwoDPASS/utils/metric_util.py b/TwoDPASS/utils/metric_util.py new file mode 100644 index 0000000..43dc4b9 --- /dev/null +++ b/TwoDPASS/utils/metric_util.py @@ -0,0 +1,41 @@ +import numpy as np +from pytorch_lightning.metrics import Metric +from TwoDPASS.dataloader.pc_dataset import get_SemKITTI_label_name + + +def fast_hist(pred, label, n): + k = (label >= 0) & (label < n) + bin_count = np.bincount( + n * label[k].astype(int) + pred[k], minlength=n ** 2) + return bin_count[:n ** 2].reshape(n, n) + + +def per_class_iu(hist): + return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) + + +def fast_hist_crop(output, target, unique_label): + hist = fast_hist(output.flatten(), target.flatten(), np.max(unique_label) + 2) + hist = hist[unique_label + 1, :] + hist = hist[:, unique_label + 1] + return hist + + +class IoU(Metric): + def __init__(self, dataset_config, dist_sync_on_step=False, compute_on_step=True): + super().__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=compute_on_step) + self.hist_list = [] + self.best_miou = 0 + self.SemKITTI_label_name = get_SemKITTI_label_name(dataset_config["label_mapping"]) + self.unique_label = np.asarray(sorted(list(self.SemKITTI_label_name.keys())))[1:] - 1 + self.unique_label_str = [self.SemKITTI_label_name[x] for x in self.unique_label + 1] + + def update(self, predict_labels, val_pt_labs) -> None: + self.hist_list.append(fast_hist_crop(predict_labels, val_pt_labs, self.unique_label)) + + def compute(self): + iou = per_class_iu(sum(self.hist_list)) + if np.nanmean(iou) > self.best_miou: + self.best_miou = np.nanmean(iou) + self.hist_list = [] + return iou, self.best_miou \ No newline at end of file diff --git a/TwoDPASS/utils/schedulers.py b/TwoDPASS/utils/schedulers.py new file mode 100644 index 0000000..45a8fe1 --- /dev/null +++ b/TwoDPASS/utils/schedulers.py @@ -0,0 +1,21 @@ +import numpy as np + +__all__ = ['cosine_schedule_with_warmup'] + + +def cosine_schedule_with_warmup(k, num_epochs, batch_size, dataset_size, num_gpu): + batch_size *= 
num_gpu + + if num_gpu == 1: + warmup_iters = 0 + else: + warmup_iters = 1000 // num_gpu + + if k < warmup_iters: + return (k + 1) / warmup_iters + else: + iter_per_epoch = (dataset_size + batch_size - 1) // batch_size + return 0.5 * (1 + np.cos(np.pi * (k - warmup_iters) / (num_epochs * iter_per_epoch))) + + + diff --git a/TwoDPASS/utils/turbo_cmap.py b/TwoDPASS/utils/turbo_cmap.py new file mode 100644 index 0000000..f2c9331 --- /dev/null +++ b/TwoDPASS/utils/turbo_cmap.py @@ -0,0 +1,30 @@ +# Reference: https://gist.github.com/mikhailov-work/ee72ba4191942acecc03fe6da94fc73f + +# Copyright 2019 Google LLC. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Anton Mikhailov + +turbo_colormap_data = [[0.18995,0.07176,0.23217],[0.19483,0.08339,0.26149],[0.19956,0.09498,0.29024],[0.20415,0.10652,0.31844],[0.20860,0.11802,0.34607],[0.21291,0.12947,0.37314],[0.21708,0.14087,0.39964],[0.22111,0.15223,0.42558],[0.22500,0.16354,0.45096],[0.22875,0.17481,0.47578],[0.23236,0.18603,0.50004],[0.23582,0.19720,0.52373],[0.23915,0.20833,0.54686],[0.24234,0.21941,0.56942],[0.24539,0.23044,0.59142],[0.24830,0.24143,0.61286],[0.25107,0.25237,0.63374],[0.25369,0.26327,0.65406],[0.25618,0.27412,0.67381],[0.25853,0.28492,0.69300],[0.26074,0.29568,0.71162],[0.26280,0.30639,0.72968],[0.26473,0.31706,0.74718],[0.26652,0.32768,0.76412],[0.26816,0.33825,0.78050],[0.26967,0.34878,0.79631],[0.27103,0.35926,0.81156],[0.27226,0.36970,0.82624],[0.27334,0.38008,0.84037],[0.27429,0.39043,0.85393],[0.27509,0.40072,0.86692],[0.27576,0.41097,0.87936],[0.27628,0.42118,0.89123],[0.27667,0.43134,0.90254],[0.27691,0.44145,0.91328],[0.27701,0.45152,0.92347],[0.27698,0.46153,0.93309],[0.27680,0.47151,0.94214],[0.27648,0.48144,0.95064],[0.27603,0.49132,0.95857],[0.27543,0.50115,0.96594],[0.27469,0.51094,0.97275],[0.27381,0.52069,0.97899],[0.27273,0.53040,0.98461],[0.27106,0.54015,0.98930],[0.26878,0.54995,0.99303],[0.26592,0.55979,0.99583],[0.26252,0.56967,0.99773],[0.25862,0.57958,0.99876],[0.25425,0.58950,0.99896],[0.24946,0.59943,0.99835],[0.24427,0.60937,0.99697],[0.23874,0.61931,0.99485],[0.23288,0.62923,0.99202],[0.22676,0.63913,0.98851],[0.22039,0.64901,0.98436],[0.21382,0.65886,0.97959],[0.20708,0.66866,0.97423],[0.20021,0.67842,0.96833],[0.19326,0.68812,0.96190],[0.18625,0.69775,0.95498],[0.17923,0.70732,0.94761],[0.17223,0.71680,0.93981],[0.16529,0.72620,0.93161],[0.15844,0.73551,0.92305],[0.15173,0.74472,0.91416],[0.14519,0.75381,0.90496],[0.13886,0.76279,0.89550],[0.13278,0.77165,0.88580],[0.12698,0.78037,0.87590],[0.12151,0.78896,0.86581],[0.11639,0.79740,0.85559],[0.11167,0.80569,0.84525],[0.10738,0.81381,0.83484],[0.10357,0.82177,0.82437],[0.10026,0.82955,0.81389],[0.09750,0.83714,0.80342],[0.09532,0.84455,0.79299],[0.09377,0.85175,0.78264],[0.09287,0.85875,0.77240],[0.09267,0.86554,0.76230],[0.09320,0.87211,0.75237],[0.09451,0.87844,0.74265],[0.09662,0.88454,0.73316],[0.09958,0.89040,0.72393],[0.10342,0.89600,0.71500],[0.10815,0.90142,0.70599],[0.11374,0.90673,0.69651],[0.12014,0.91193,0.68660],[0.12733,0.91701,0.67627],[0.13526,0.92197,0.66556],[0.14391,0.92680,0.65448],[0.15323,0.93151,0.64308],[0.16319,0.93609,0.63137],[0.17377,0.94053,0.61938],[0.18491,0.94484,0.60713],[0.19659,0.94901,0.59466],[0.20877,0.95304,0.58199],[0.22142,0.95692,0.56914],[0.23449,0.96065,0.55614],[0.24797,0.96423,0.54303],[0.26180,0.96765,0.52981],[0.27597,0.97092,0.51653],[0.29042,0.97403,0.50321],[0.30513,0.97697,0.48987],[0.32006,0.97974,0.47654],[0.33517,0.98234,0.46325],[0.35043,0.98477,0.45002],[0.36581,0.98702,0.43688],[0.38127,0.98
909,0.42386],[0.39678,0.99098,0.41098],[0.41229,0.99268,0.39826],[0.42778,0.99419,0.38575],[0.44321,0.99551,0.37345],[0.45854,0.99663,0.36140],[0.47375,0.99755,0.34963],[0.48879,0.99828,0.33816],[0.50362,0.99879,0.32701],[0.51822,0.99910,0.31622],[0.53255,0.99919,0.30581],[0.54658,0.99907,0.29581],[0.56026,0.99873,0.28623],[0.57357,0.99817,0.27712],[0.58646,0.99739,0.26849],[0.59891,0.99638,0.26038],[0.61088,0.99514,0.25280],[0.62233,0.99366,0.24579],[0.63323,0.99195,0.23937],[0.64362,0.98999,0.23356],[0.65394,0.98775,0.22835],[0.66428,0.98524,0.22370],[0.67462,0.98246,0.21960],[0.68494,0.97941,0.21602],[0.69525,0.97610,0.21294],[0.70553,0.97255,0.21032],[0.71577,0.96875,0.20815],[0.72596,0.96470,0.20640],[0.73610,0.96043,0.20504],[0.74617,0.95593,0.20406],[0.75617,0.95121,0.20343],[0.76608,0.94627,0.20311],[0.77591,0.94113,0.20310],[0.78563,0.93579,0.20336],[0.79524,0.93025,0.20386],[0.80473,0.92452,0.20459],[0.81410,0.91861,0.20552],[0.82333,0.91253,0.20663],[0.83241,0.90627,0.20788],[0.84133,0.89986,0.20926],[0.85010,0.89328,0.21074],[0.85868,0.88655,0.21230],[0.86709,0.87968,0.21391],[0.87530,0.87267,0.21555],[0.88331,0.86553,0.21719],[0.89112,0.85826,0.21880],[0.89870,0.85087,0.22038],[0.90605,0.84337,0.22188],[0.91317,0.83576,0.22328],[0.92004,0.82806,0.22456],[0.92666,0.82025,0.22570],[0.93301,0.81236,0.22667],[0.93909,0.80439,0.22744],[0.94489,0.79634,0.22800],[0.95039,0.78823,0.22831],[0.95560,0.78005,0.22836],[0.96049,0.77181,0.22811],[0.96507,0.76352,0.22754],[0.96931,0.75519,0.22663],[0.97323,0.74682,0.22536],[0.97679,0.73842,0.22369],[0.98000,0.73000,0.22161],[0.98289,0.72140,0.21918],[0.98549,0.71250,0.21650],[0.98781,0.70330,0.21358],[0.98986,0.69382,0.21043],[0.99163,0.68408,0.20706],[0.99314,0.67408,0.20348],[0.99438,0.66386,0.19971],[0.99535,0.65341,0.19577],[0.99607,0.64277,0.19165],[0.99654,0.63193,0.18738],[0.99675,0.62093,0.18297],[0.99672,0.60977,0.17842],[0.99644,0.59846,0.17376],[0.99593,0.58703,0.16899],[0.99517,0.57549,0.16412],[0.99419,0.56386,0.15918],[0.99297,0.55214,0.15417],[0.99153,0.54036,0.14910],[0.98987,0.52854,0.14398],[0.98799,0.51667,0.13883],[0.98590,0.50479,0.13367],[0.98360,0.49291,0.12849],[0.98108,0.48104,0.12332],[0.97837,0.46920,0.11817],[0.97545,0.45740,0.11305],[0.97234,0.44565,0.10797],[0.96904,0.43399,0.10294],[0.96555,0.42241,0.09798],[0.96187,0.41093,0.09310],[0.95801,0.39958,0.08831],[0.95398,0.38836,0.08362],[0.94977,0.37729,0.07905],[0.94538,0.36638,0.07461],[0.94084,0.35566,0.07031],[0.93612,0.34513,0.06616],[0.93125,0.33482,0.06218],[0.92623,0.32473,0.05837],[0.92105,0.31489,0.05475],[0.91572,0.30530,0.05134],[0.91024,0.29599,0.04814],[0.90463,0.28696,0.04516],[0.89888,0.27824,0.04243],[0.89298,0.26981,0.03993],[0.88691,0.26152,0.03753],[0.88066,0.25334,0.03521],[0.87422,0.24526,0.03297],[0.86760,0.23730,0.03082],[0.86079,0.22945,0.02875],[0.85380,0.22170,0.02677],[0.84662,0.21407,0.02487],[0.83926,0.20654,0.02305],[0.83172,0.19912,0.02131],[0.82399,0.19182,0.01966],[0.81608,0.18462,0.01809],[0.80799,0.17753,0.01660],[0.79971,0.17055,0.01520],[0.79125,0.16368,0.01387],[0.78260,0.15693,0.01264],[0.77377,0.15028,0.01148],[0.76476,0.14374,0.01041],[0.75556,0.13731,0.00942],[0.74617,0.13098,0.00851],[0.73661,0.12477,0.00769],[0.72686,0.11867,0.00695],[0.71692,0.11268,0.00629],[0.70680,0.10680,0.00571],[0.69650,0.10102,0.00522],[0.68602,0.09536,0.00481],[0.67535,0.08980,0.00449],[0.66449,0.08436,0.00424],[0.65345,0.07902,0.00408],[0.64223,0.07380,0.00401],[0.63082,0.06868,0.00401],[0.61923,0.06367,0.00410],[0.60746,0.05878,0.00427],[0.595
50,0.05399,0.00453],[0.58336,0.04931,0.00486],[0.57103,0.04474,0.00529],[0.55852,0.04028,0.00579],[0.54583,0.03593,0.00638],[0.53295,0.03169,0.00705],[0.51989,0.02756,0.00780],[0.50664,0.02354,0.00863],[0.49321,0.01963,0.00955],[0.47960,0.01583,0.01055]] + +# The look-up table contains 256 entries. Each entry is a floating point sRGB triplet. +# To use it with matplotlib, pass cmap=ListedColormap(turbo_colormap_data) as an arg to imshow() (don't forget "from matplotlib.colors import ListedColormap"). +# If you have a typical 8-bit greyscale image, you can use the 8-bit value to index into this LUT directly. +# The floating point color values can be converted to 8-bit sRGB via multiplying by 255 and casting/flooring to an integer. Saturation should not be required for IEEE-754 compliant arithmetic. +# If you have a floating point value in the range [0,1], you can use interpolate() to linearly interpolate between the entries. +# If you have 16-bit or 32-bit integer values, convert them to floating point values on the [0,1] range and then use interpolate(). Doing the interpolation in floating point will reduce banding. +# If some of your values may lie outside the [0,1] range, use interpolate_or_clip() to highlight them. + +def interpolate(colormap, x): + x = max(0.0, min(1.0, x)) + a = int(x*255.0) + b = min(255, a + 1) + f = x*255.0 - a + return [colormap[a][0] + (colormap[b][0] - colormap[a][0]) * f, + colormap[a][1] + (colormap[b][1] - colormap[a][1]) * f, + colormap[a][2] + (colormap[b][2] - colormap[a][2]) * f] + +def interpolate_or_clip(colormap, x): + if x < 0.0: return [0.0, 0.0, 0.0] + elif x > 1.0: return [1.0, 1.0, 1.0] + else: return interpolate(colormap, x) diff --git a/TwoDPASS/utils/vis_utils.py b/TwoDPASS/utils/vis_utils.py new file mode 100644 index 0000000..aae87fb --- /dev/null +++ b/TwoDPASS/utils/vis_utils.py @@ -0,0 +1,187 @@ +import matplotlib.pyplot as plt +import numpy as np +from TwoDPASS.utils.turbo_cmap import interpolate_or_clip, turbo_colormap_data + + +# all classes +NUSCENES_COLOR_PALETTE = [ + (255, 158, 0), # car + (255, 158, 0), # truck + (255, 158, 0), # bus + (255, 158, 0), # trailer + (255, 158, 0), # construction_vehicle + (0, 0, 230), # pedestrian + (255, 61, 99), # motorcycle + (255, 61, 99), # bicycle + (0, 0, 0), # traffic_cone + (0, 0, 0), # barrier + (200, 200, 200), # background +] + +NUSCENSE_LIDARSEG_PALETTE = [ + (0, 0, 0), # noise + (112, 128, 144), # barrier + (220, 20, 60), # bicycle + (255, 127, 80), # bus + (255, 158, 0), # car + (233, 150, 70), # construction_vehicle + (255, 61, 99), # motorcycle + (0, 0, 230), # pedestrian + (47, 79, 79), # traffic_cone + (255, 140, 0), # trailer + (255, 99, 71), # Tomato + (0, 207, 191), # nuTonomy green + (175, 0, 75), + (75, 0, 75), + (112, 180, 60), + (222, 184, 135), # Burlywood + (0, 175, 0) +] + +# classes after merging (as used in xMUDA) +NUSCENES_COLOR_PALETTE_SHORT = [ + (255, 158, 0), # vehicle + (0, 0, 230), # pedestrian + (255, 61, 99), # bike + (0, 0, 0), # traffic boundary + (200, 200, 200), # background +] + +# all classes +A2D2_COLOR_PALETTE_SHORT = [ + (255, 0, 0), # car + (255, 128, 0), # truck + (182, 89, 6), # bike + (204, 153, 255), # person + (255, 0, 255), # road + (150, 150, 200), # parking + (180, 150, 200), # sidewalk + (241, 230, 255), # building + (147, 253, 194), # nature + (255, 246, 143), # other-objects + (0, 0, 0) # ignore +] + +# colors as defined in https://github.com/PRBonn/semantic-kitti-api/blob/master/config/semantic-kitti.yaml +SEMANTIC_KITTI_ID_TO_BGR = { # bgr 
+ 0: [0, 0, 0], + 1: [0, 0, 255], + 10: [245, 150, 100], + 11: [245, 230, 100], + 13: [250, 80, 100], + 15: [150, 60, 30], + 16: [255, 0, 0], + 18: [180, 30, 80], + 20: [255, 0, 0], + 30: [30, 30, 255], + 31: [200, 40, 255], + 32: [90, 30, 150], + 40: [255, 0, 255], + 44: [255, 150, 255], + 48: [75, 0, 75], + 49: [75, 0, 175], + 50: [0, 200, 255], + 51: [50, 120, 255], + 52: [0, 150, 255], + 60: [170, 255, 150], + 70: [0, 175, 0], + 71: [0, 60, 135], + 72: [80, 240, 150], + 80: [150, 240, 255], + 81: [0, 0, 255], + 99: [255, 255, 50], + 252: [245, 150, 100], + 256: [255, 0, 0], + 253: [200, 40, 255], + 254: [30, 30, 255], + 255: [90, 30, 150], + 257: [250, 80, 100], + 258: [180, 30, 80], + 259: [255, 0, 0], +} +SEMANTIC_KITTI_COLOR_PALETTE = [SEMANTIC_KITTI_ID_TO_BGR[id] if id in SEMANTIC_KITTI_ID_TO_BGR.keys() else [0, 0, 0] + for id in range(list(SEMANTIC_KITTI_ID_TO_BGR.keys())[-1] + 1)] + + +# classes after merging (as used in xMUDA) +SEMANTIC_KITTI_COLOR_PALETTE_SHORT_BGR = [ + [245, 150, 100], # car + [180, 30, 80], # truck + [150, 60, 30], # bike + [30, 30, 255], # person + [255, 0, 255], # road + [255, 150, 255], # parking + [75, 0, 75], # sidewalk + [0, 200, 255], # building + [0, 175, 0], # nature + [255, 255, 50], # other-objects + [0, 0, 0], # ignore +] +SEMANTIC_KITTI_COLOR_PALETTE_SHORT = [(c[2], c[1], c[0]) for c in SEMANTIC_KITTI_COLOR_PALETTE_SHORT_BGR] + +def write_obj(points, file, rgb=False): + fout = open('%s.obj' % file, 'w') + for i in range(points.shape[0]): + if not rgb: + fout.write('v %f %f %f %d %d %d\n' % ( + points[i, 0], points[i, 1], points[i, 2], 255, 255, 0)) + else: + fout.write('v %f %f %f %d %d %d\n' % ( + points[i, 0], points[i, 1], points[i, 2], points[i, -3] * 255, points[i, -2] * 255, + points[i, -1] * 255)) + + +def draw_points_image_labels(img, img_indices, seg_labels, show=True, color_palette_type='NuScenes', point_size=3.5): + if color_palette_type == 'NuScenes': + color_palette = NUSCENSE_LIDARSEG_PALETTE + elif color_palette_type == 'A2D2': + color_palette = A2D2_COLOR_PALETTE_SHORT + elif color_palette_type == 'SemanticKITTI': + color_palette = SEMANTIC_KITTI_COLOR_PALETTE_SHORT + elif color_palette_type == 'SemanticKITTI_long': + color_palette = SEMANTIC_KITTI_COLOR_PALETTE + else: + raise NotImplementedError('Color palette type not supported') + + color_palette = np.array(color_palette) / 255. + # seg_labels[seg_labels == -100] = len(color_palette) - 1 + colors = color_palette[seg_labels[:, 0]] + colors = colors[:, [2, 1, 0]] + plt.figure(figsize=(10, 6)) + plt.imshow(img) + plt.scatter(img_indices[:, 1], img_indices[:, 0], c=colors, alpha=0.5, s=point_size) + + plt.axis('off') + + if show: + plt.show() + + +def normalize_depth(depth, d_min, d_max): + # normalize linearly between d_min and d_max + data = np.clip(depth, d_min, d_max) + return (data - d_min) / (d_max - d_min) + + +def draw_points_image_depth(img, img_indices, depth, show=True, point_size=0.5): + # depth = normalize_depth(depth, d_min=3., d_max=50.) 
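+    # no fixed depth range is assumed here; each frame is scaled to its own min/max depth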
+ depth = normalize_depth(depth, d_min=depth.min(), d_max=depth.max()) + colors = [] + for depth_val in depth: + colors.append(interpolate_or_clip(colormap=turbo_colormap_data, x=depth_val)) + # ax5.imshow(np.full_like(img, 255)) + plt.imshow(img) + plt.scatter(img_indices[:, 1], img_indices[:, 0], c=colors, alpha=0.5, s=point_size) + + plt.axis('off') + + if show: + plt.show() + + +def draw_bird_eye_view(coords, full_scale=4096): + plt.scatter(coords[:, 0], coords[:, 1], s=0.1) + plt.xlim([0, full_scale]) + plt.ylim([0, full_scale]) + plt.gca().set_aspect('equal', adjustable='box') + plt.show() diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..969532d --- /dev/null +++ b/environment.yml @@ -0,0 +1,339 @@ +name: latentbki_env +channels: + - pytorch + - nvidia/label/cuda-11.3.1 + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - argon2-cffi=21.3.0=pyhd3eb1b0_0 + - argon2-cffi-bindings=21.2.0=py37h7f8727e_0 + - backcall=0.2.0=pyhd3eb1b0_0 + - beautifulsoup4=4.11.1=py37h06a4308_0 + - blas=1.0=mkl + - brotli=1.0.9=h166bdaf_7 + - brotli-bin=1.0.9=h166bdaf_7 + - brotlipy=0.7.0=py37h27cfd23_1003 + - bzip2=1.0.8=h7b6447c_0 + - ca-certificates=2024.7.2=h06a4308_0 + - catkin_pkg=0.5.2=pyhd8ed1ab_0 + - certifi=2022.12.7=py37h06a4308_0 + - cffi=1.15.0=py37hd667e15_1 + - charset-normalizer=2.0.4=pyhd3eb1b0_0 + - comm=0.2.2=pyhd8ed1ab_0 + - cryptography=37.0.1=py37h9ce1e76_0 + - cuda-command-line-tools=11.3.1=h712c49d_0 + - cuda-compiler=11.3.1=h712c49d_0 + - cuda-cudart=11.3.109=hfb95d0c_0 + - cuda-cuobjdump=11.3.122=hbf6ec6b_0 + - cuda-cupti=11.3.111=h12ad217_0 + - cuda-cuxxfilt=11.3.122=h4dc11a3_0 + - cuda-gdb=11.3.109=h33b7820_0 + - cuda-libraries=11.3.1=h712c49d_0 + - cuda-libraries-dev=11.3.1=h712c49d_0 + - cuda-memcheck=11.3.109=hf5cb439_0 + - cuda-nvcc=11.3.122=h4814707_0 + - cuda-nvdisasm=11.3.122=ha26faa6_0 + - cuda-nvml-dev=11.3.58=hc25e488_0 + - cuda-nvprof=11.3.111=h95a27d4_0 + - cuda-nvprune=11.3.122=hb3346b8_0 + - cuda-nvrtc=11.3.122=h1aa17d8_0 + - cuda-nvtx=11.3.109=h4ec7630_0 + - cuda-nvvp=11.3.111=h4c4416a_0 + - cuda-samples=11.3.58=h6d5b628_0 + - cuda-sanitizer-api=11.3.111=h2446cfc_0 + - cuda-thrust=11.3.109=he8b717c_0 + - cuda-toolkit=11.3.1=h712c49d_0 + - cuda-tools=11.3.1=h712c49d_0 + - cuda-visual-tools=11.3.1=h712c49d_0 + - cudatoolkit=11.3.1=h2bc3f7f_2 + - cycler=0.11.0=pyhd8ed1ab_0 + - dbus=1.13.6=he372182_0 + - debugpy=1.6.0=py37hd23a5d3_0 + - decorator=5.1.1=pyhd3eb1b0_0 + - defusedxml=0.7.1=pyhd3eb1b0_0 + - distro=1.6.0=pyhd8ed1ab_0 + - docutils=0.18.1=py37h89c1867_1 + - empy=3.3.4=pyh9f0ad1d_1 + - entrypoints=0.4=py37h06a4308_0 + - exceptiongroup=1.2.2=pyhd8ed1ab_0 + - expat=2.4.8=h27087fc_0 + - ffmpeg=4.3=hf484d3e_0 + - fontconfig=2.14.0=h8e229c2_0 + - fonttools=4.33.3=py37h540881e_0 + - freetype=2.11.0=h70c0345_0 + - giflib=5.2.1=h7b6447c_0 + - glib=2.69.1=h4ff587b_1 + - gmp=6.2.1=h295c915_3 + - gnutls=3.6.15=he1e5248_0 + - gst-plugins-base=1.14.0=hbbd80ab_1 + - gstreamer=1.14.0=h28cd5cc_2 + - icu=58.2=hf484d3e_1000 + - idna=3.3=pyhd3eb1b0_0 + - importlib_metadata=4.11.3=hd3eb1b0_0 + - intel-openmp=2021.4.0=h06a4308_3561 + - ipython_genutils=0.2.0=pyhd3eb1b0_1 + - jedi=0.18.1=py37h06a4308_1 + - jinja2=3.1.2=py37h06a4308_0 + - jpeg=9e=h7f8727e_0 + - jupyter=1.0.0=py37h06a4308_8 + - jupyter_client=7.3.4=py37h06a4308_0 + - jupyter_console=6.4.4=py37h06a4308_0 + - lame=3.100=h7b6447c_0 + - lcms2=2.12=h3be6417_0 + - ld_impl_linux-64=2.38=h1181459_1 + - libbrotlicommon=1.0.9=h166bdaf_7 + - 
libbrotlidec=1.0.9=h166bdaf_7 + - libbrotlienc=1.0.9=h166bdaf_7 + - libcublas=11.5.1.109=h0fd73e7_0 + - libcufft=10.4.2.109=h2344711_0 + - libcurand=10.2.4.109=h0189693_0 + - libcusolver=11.1.2.109=h1e009e5_0 + - libcusparse=11.6.0.109=hf5bfba9_0 + - libffi=3.3=he6710b0_2 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libiconv=1.16=h7f8727e_2 + - libidn2=2.3.2=h7f8727e_0 + - libnpp=11.3.3.95=h122bb27_0 + - libnvjpeg=11.5.0.109=h159916b_0 + - libpng=1.6.37=hbc83047_0 + - libsodium=1.0.18=h7b6447c_0 + - libstdcxx-ng=11.2.0=h1234567_1 + - libtasn1=4.16.0=h27cfd23_0 + - libtiff=4.2.0=h2818925_1 + - libunistring=0.9.10=h27cfd23_0 + - libuuid=2.32.1=h7f98852_1000 + - libuv=1.40.0=h7b6447c_0 + - libwebp=1.2.2=h55f646e_0 + - libwebp-base=1.2.2=h7f8727e_0 + - libxcb=1.13=h7f98852_1004 + - libxml2=2.9.14=h74e7548_0 + - libxslt=1.1.35=h4e12654_0 + - lxml=4.9.1=py37h1edc446_0 + - lz4-c=1.9.3=h295c915_1 + - markupsafe=2.1.1=py37h7f8727e_0 + - matplotlib=3.5.2=py37h89c1867_0 + - matplotlib-base=3.5.2=py37hc347a89_0 + - mistune=0.8.4=py37h14c3975_1001 + - mkl=2021.4.0=h06a4308_640 + - mkl-service=2.4.0=py37h7f8727e_0 + - mkl_fft=1.3.1=py37hd3c417c_0 + - mkl_random=1.2.2=py37h51133e4_0 + - munkres=1.1.4=pyh9f0ad1d_0 + - nbconvert-pandoc=6.5.3=pyhd8ed1ab_0 + - ncurses=6.3=h7f8727e_2 + - nettle=3.7.3=hbbd107a_1 + - numpy=1.21.5=py37h6c91a56_3 + - numpy-base=1.21.5=py37ha15fc14_3 + - olefile=0.46=pyh9f0ad1d_1 + - openh264=2.1.1=h4ff587b_0 + - openjpeg=2.4.0=hb52868f_1 + - openssl=1.1.1w=h7f8727e_0 + - packaging=21.3=pyhd8ed1ab_0 + - pandoc=2.19.2=ha770c72_0 + - pandocfilters=1.5.0=pyhd3eb1b0_0 + - parso=0.8.3=pyhd3eb1b0_0 + - pcre=8.45=h9c3ff4c_0 + - pexpect=4.8.0=pyhd3eb1b0_3 + - pickleshare=0.7.5=pyhd3eb1b0_1003 + - pip=21.2.2=py37h06a4308_0 + - pkgutil-resolve-name=1.3.10=py37h06a4308_0 + - prometheus_client=0.14.1=py37h06a4308_0 + - prompt_toolkit=3.0.36=hd3eb1b0_0 + - psutil=5.9.1=py37h540881e_0 + - pthread-stubs=0.4=h36c2ea0_1001 + - ptyprocess=0.7.0=pyhd3eb1b0_2 + - pycparser=2.21=pyhd3eb1b0_0 + - pygments=2.17.2=pyhd8ed1ab_0 + - pyopenssl=22.0.0=pyhd3eb1b0_0 + - pyparsing=3.0.9=pyhd8ed1ab_0 + - pyqt=5.9.2=py37hcca6a23_4 + - pyrsistent=0.18.1=py37h540881e_1 + - pysocks=1.7.1=py37_1 + - python=3.7.13=h12debd9_0 + - python-dateutil=2.8.2=pyhd8ed1ab_0 + - python_abi=3.7=2_cp37m + - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 + - pytorch-mutex=1.0=cuda + - pyzmq=23.2.0=py37h6a678d5_0 + - qt=5.9.7=h5867ecd_1 + - qtconsole=5.4.0=py37h06a4308_0 + - qtpy=2.2.0=py37h06a4308_0 + - readline=8.1.2=h7f8727e_1 + - requests=2.27.1=pyhd3eb1b0_0 + - rospkg=1.4.0=pyhd8ed1ab_0 + - send2trash=1.8.0=pyhd3eb1b0_1 + - setuptools=61.2.0=py37h06a4308_0 + - sip=4.19.8=py37hf484d3e_0 + - six=1.16.0=pyhd3eb1b0_1 + - sniffio=1.2.0=py37h06a4308_1 + - soupsieve=2.3.2.post1=py37h06a4308_0 + - sqlite=3.38.5=hc218d9a_0 + - tk=8.6.12=h1ccaba5_0 + - tomli=2.0.1=py37h06a4308_0 + - torchaudio=0.11.0=py37_cu113 + - torchvision=0.12.0=py37_cu113 + - tornado=6.1=py37h540881e_3 + - typing-extensions=4.7.1=hd8ed1ab_0 + - typing_extensions=4.7.1=pyha770c72_0 + - unicodedata2=14.0.0=py37h540881e_1 + - urllib3=1.26.9=py37h06a4308_0 + - wcwidth=0.2.5=pyhd3eb1b0_0 + - webencodings=0.5.1=pyhd8ed1ab_2 + - wheel=0.37.1=pyhd3eb1b0_0 + - xorg-libxau=1.0.9=h7f98852_0 + - xorg-libxdmcp=1.1.3=h7f98852_0 + - xz=5.2.5=h7f8727e_1 + - yaml=0.2.5=h7f98852_2 + - zeromq=4.3.5=h6a678d5_0 + - zlib=1.2.12=h7f8727e_2 + - zstd=1.5.2=ha4553b6_0 + - pip: + - absl-py==1.1.0 + - addict==2.4.0 + - aiohttp==3.8.6 + - aiosignal==1.3.1 + - anyio==3.6.1 + - 
arrow==1.2.3 + - async-timeout==4.0.3 + - asynctest==0.13.0 + - attrs==21.4.0 + - babel==2.10.3 + - bleach==5.0.1 + - blessed==1.20.0 + - boto3==1.33.13 + - botocore==1.33.13 + - cachetools==5.2.0 + - carla==0.9.13 + - ccimport==0.4.3 + - click==8.1.7 + # - clip==1.0 + - croniter==1.3.15 + - cumm-cu113==0.4.11 + - dateutils==0.6.12 + - deepdiff==6.7.1 + - deprecation==2.1.0 + - descartes==1.1.0 + - easydict==1.13 + - fastapi==0.88.0 + - fastjsonschema==2.15.3 + - filelock==3.12.2 + - fire==0.6.0 + - frozenlist==1.3.3 + - fsspec==2023.1.0 + - ftfy==6.1.1 + - future==1.0.0 + - fvcore==0.1.5.post20221221 + - gdown==4.7.3 + - google-auth==2.8.0 + - google-auth-oauthlib==0.4.6 + - grpcio==1.47.0 + - h11==0.14.0 + - huggingface-hub==0.16.4 + - importlib-metadata==4.12.0 + - importlib-resources==5.8.0 + - inquirer==2.10.1 + - iopath==0.1.10 + - ipykernel==6.15.0 + - ipython==7.34.0 + - ipywidgets==7.7.1 + - itsdangerous==2.1.2 + - jmespath==1.0.1 + - joblib==1.1.0 + - json5==0.9.8 + - jsonschema==4.6.0 + - jupyter-core==4.10.0 + - jupyter-packaging==0.12.2 + - jupyter-server==1.18.0 + - jupyterlab==3.4.3 + - jupyterlab-pygments==0.2.2 + - jupyterlab-server==2.14.0 + - jupyterlab-widgets==1.1.1 + - kiwisolver==1.4.3 + - lark==1.1.9 + - lightning-cloud==0.5.69 + - lightning-utilities==0.10.1 + - markdown==3.3.7 + - markdown-it-py==2.2.0 + - matplotlib-inline==0.1.3 + - mdurl==0.1.2 + - multidict==6.0.5 + - nbclassic==0.3.7 + - nbclient==0.6.4 + - nbconvert==6.5.0 + - nbformat==5.4.0 + - nest-asyncio==1.5.5 + - ninja==1.11.1.1 + - notebook==6.4.12 + - notebook-shim==0.1.0 + - nuscenes-devkit==1.1.10 + - oauthlib==3.2.0 + - open3d==0.15.2 + - opencv-python==4.10.0.82 + - ordered-set==4.1.0 + - pandas==1.3.5 + - path==16.6.0 + - pccm==0.4.14 + - pillow==9.5.0 + - plyfile==0.9 + - portalocker==2.7.0 + - prompt-toolkit==3.0.30 + - protobuf==3.19.4 + - ptflops==0.6.9 + - pyasn1==0.4.8 + - pyasn1-modules==0.2.8 + - pybind11==2.13.1 + - pycocotools==2.0.7 + - pydantic==1.10.16 + - pydeprecate==0.3.0 + - pyjwt==2.8.0 + - pyliblzfse==0.4.1 + - pyquaternion==0.9.9 + - python-editor==1.0.4 + - python-multipart==0.0.8 + - pytorch-lightning==1.3.8 + - pytorch3d==0.7.8 + - pytz==2022.1 + - pyyaml==5.4.1 + - readchar==4.0.5 + - regex==2024.4.16 + - requests-oauthlib==1.3.1 + - rich==13.7.1 + - rsa==4.8 + - s3transfer==0.8.2 + - safetensors==0.4.3 + - scikit-learn==1.0.2 + - scipy==1.7.3 + - seaborn==0.12.2 + - shapely==1.8.5 + - sklearn==0.0 + - spconv-cu113==2.3.6 + - starlette==0.22.0 + - starsessions==1.3.0 + - tabulate==0.9.0 + - tensorboard==2.9.1 + - tensorboard-data-server==0.6.1 + - tensorboard-plugin-wit==1.8.1 + - termcolor==2.3.0 + - terminado==0.15.0 + - threadpoolctl==3.1.0 + - timm==0.9.12 + - tinycss2==1.1.1 + - tomlkit==0.11.0 + - torch-scatter==2.1.1 + - torch-tb-profiler==0.4.3 + - torchmetrics==0.5.0 + # - torchsparse==1.4.0 + - tqdm==4.64.0 + - traitlets==5.3.0 + - uvicorn==0.22.0 + - websocket-client==1.3.3 + - websockets==11.0.3 + - werkzeug==2.1.2 + - widgetsnbextension==3.6.1 + - yacs==0.1.8 + - yarl==1.9.4 + - zipp==3.8.0 +prefix: /workspace/miniconda3/envs/latentbki_env diff --git a/generate_results.py b/generate_results.py new file mode 100644 index 0000000..c04b277 --- /dev/null +++ b/generate_results.py @@ -0,0 +1,453 @@ +import os +import time +import yaml +os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + +import numpy as np +from tqdm import tqdm +import torch +import clip + +# Custom Imports +from Data.utils import * +from Models.LatentBKI import * +from Models.mapping_utils import * 
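+# per-dataset loaders and segmentation backbones used below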
+from Data.MP3D import MP3D +from Data.RealWorldData import RealWorldData +from Data.KITTI_SPVCNN import KITTI_SPVCNN +from Models.Lseg.Lseg_module import Lseg_module +from Models.SPVCNN.SPVCNN_module import SPVCNN_Module + +# result same as train +class Results(): + def __init__(self) -> None: + self.num_correct = 0 + self.num_total = 0 + self.all_intersections = 0 + self.all_unions = 0 + + self.num_correct_seg = 0 + self.num_total_seg = 0 + self.all_intersections_seg = 0 + self.all_unions_seg = 0 + +def save_map(): + print("Saving Map ...") + features = map_object.global_map[:,3:FEATURE_SIZE+3].to(device) + if DOWN_SAMPLE_FEATURE: + features = back_project_fn(features) + labels = torch.argmax(map_object.decode(features, map_object.category_feature), dim=1, keepdim=True) + confidence = map_object.global_map[:,-1].reshape(-1,1).to(dtype=torch.float32) + global_map = torch.cat((map_object.global_map[:,:3], labels.to(torch.float32).cpu(), confidence), dim=1) + global_map = global_map.numpy() + print("Map size:", global_map.shape) + np.save(os.path.join(SAVE_MAP_PATH, "global_map.npy"), global_map) + np.save(os.path.join(SAVE_MAP_PATH, "global_map_latent.npy"), map_object.global_map) + +def inference(unlabeld_pc_torch_list, pred_labels_list, gt_labels_list, map_object, results, SAVE_MAP_PATH, with_variance): + # first save the points + torch.save(unlabeld_pc_torch_list, os.path.join(SAVE_MAP_PATH, "unlabeld_pc_torch_list.pt")) + torch.save(pred_labels_list, os.path.join(SAVE_MAP_PATH, "pred_labels_list.pt")) + torch.save(gt_labels_list, os.path.join(SAVE_MAP_PATH, "gt_labels_list.pt")) + + print(f"Inference {last_scene} ... ") + unlabeld_pc_torch_list = unlabeld_pc_torch_list.to(device=device, non_blocking=True) + pred_labels_list = pred_labels_list.to(device=device, non_blocking=True) + gt_labels_list = gt_labels_list.to(device=device, non_blocking=True) + print(gt_labels_list.shape) + features = map_object.label_points_iterative(unlabeld_pc_torch_list, with_variance=with_variance) + torch.save(features.cpu(), os.path.join(SAVE_MAP_PATH, "predcited_features.pt")) # save predicted features, variance, confidence + + category_pred = features[:, -1].to(torch.int64).to(device) + + for i in range(map_object.num_classes): + gt_i = gt_labels_list == i + pred_bki_i = category_pred == i + pred_seg_i = pred_labels_list == i + + sequence_class[i] += torch.sum(gt_i) + sequence_int_bki[i] += torch.sum(gt_i & pred_bki_i) + sequence_int_seg[i] += torch.sum(gt_i & pred_seg_i) + sequence_un_bki[i] += torch.sum(gt_i | pred_bki_i) + sequence_un_seg[i] += torch.sum(gt_i | pred_seg_i) + + # accuracy + correct = torch.sum(category_pred == gt_labels_list).item() + total = gt_labels_list.shape[0] + results.num_correct += correct + results.num_total += total + + # miou + inter, union = iou_one_frame(category_pred, gt_labels_list, n_classes=NUM_CLASSES) + union += 1e-6 + results.all_intersections += inter + results.all_unions += union + + # accuracy_seg + # TODO: remove ignore lables? 
+ correct_seg = torch.sum(pred_labels_list == gt_labels_list).item() + total_seg = gt_labels_list.shape[0] + results.num_correct_seg += correct_seg + results.num_total_seg += total_seg + + # miou_seg + inter_seg, union_seg = iou_one_frame(pred_labels_list, gt_labels_list, n_classes=NUM_CLASSES) + union_seg += 1e-6 + results.all_intersections_seg += inter_seg + results.all_unions_seg += union_seg + + # save statistics + print(f"{last_scene} stats:") + seq_intersections = inter[union > 0] + seq_unions = union[union > 0] + seq_miou = np.mean(seq_intersections / seq_unions) + print(f'Average map accuracy: {correct/total}') + print(f'Map miou: {seq_miou}') + + seq_intersections_seg = inter_seg[union_seg > 0] + seq_unions_seg = union_seg[union_seg > 0] + seq_miou_seg = np.mean(seq_intersections_seg / seq_unions_seg) + print(f'Average segmentation network accuracy: {correct_seg/total_seg}') + print(f'Segmentation network miou: {seq_miou_seg}') + print("") + + with open(os.path.join(SAVE_MAP_PATH, 'result.txt'), 'w') as file: + file.write(f"{last_scene} stats:\n") + seq_intersections = inter[union > 0] + seq_unions = union[union > 0] + seq_miou = np.mean(seq_intersections / seq_unions) + file.write(f'Average map accuracy: {correct/total}\n') + file.write(f'Map miou: {seq_miou}\n') + + seq_intersections_seg = inter_seg[union_seg > 0] + seq_unions_seg = union_seg[union_seg > 0] + seq_miou_seg = np.mean(seq_intersections_seg / seq_unions_seg) + file.write(f'Average segmentation network accuracy: {correct_seg/total_seg}\n') + file.write(f'Segmentation network miou: {seq_miou_seg}\n') + + +########################## main script ############################ + +# MODEL_NAME = "LatentBKI_default" +# MODEL_NAME = "LatentBKI_realworld" +# MODEL_NAME = "LatentBKI_vlmap" +MODEL_NAME = "LatentBKI_kitti" + +print("Model is:", MODEL_NAME) + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +print("device is ", device) + +# Model Parameters +model_params_file = os.path.join(os.getcwd(), "Config", MODEL_NAME + ".yaml") +with open(model_params_file, "r") as stream: + try: + model_params = yaml.safe_load(stream) + DATASET = model_params["dataset"] + MEAS_RESULT = model_params["meas_result"] + SAVE_MAP = model_params["save_map"] + ELL = model_params["ell"] + WITH_VARIANCE = model_params['with_variance'] + USE_RELATIVE_POSE = model_params['use_relative_pose'] + PSEDUO_DISCRETE = model_params['pseduo_discrete'] + FILTER_SIZE = model_params["filter_size"] + GRID_PARAMS = model_params["grid_params"] + except yaml.YAMLError as exc: + print(exc) + +# Data Parameters +data_params_file = os.path.join(os.getcwd(), "Config", DATASET + ".yaml") +with open(data_params_file, "r") as stream: + try: + data_params = yaml.safe_load(stream) + NUM_CLASSES = data_params["num_classes"] + DATA_DIR = data_params["data_dir"] + CATEGORY = data_params["category"] + FEATURE_DIR = data_params["feature_dir"] + FEATURE_SIZE = data_params["feature_size"] + RAW_DATA = data_params['raw_data'] + DOWN_SAMPLE_FEATURE = data_params["down_sample_feature"] + PCA_PATH = data_params['pca_path'] + GRID_MASK = data_params['grid_mask'] + SUBSAMPLE = data_params['subsample_points'] + SEQUENCES = data_params['sequences'] + INTRINSIC = data_params['intrinsic'] + except yaml.YAMLError as exc: + print(exc) + +# construct save directory +cell_size = (GRID_PARAMS['max_bound'][0] - GRID_PARAMS['min_bound'][0]) / GRID_PARAMS['grid_size'][0] +SAVE_FOLDER = f"{MODEL_NAME}_{DATASET}_{FILTER_SIZE}_{ELL}_{FEATURE_SIZE}_{cell_size}_{SUBSAMPLE}" 
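+# Illustrative example (assumed bounds, not read from any config here): with x-bounds of
+# -5.0 / 5.0 and 100 voxels per axis, cell_size = (5.0 - (-5.0)) / 100.0 = 0.1, so SAVE_FOLDER
+# resolves to something like "LatentBKI_default_mp3d_3_0.5_64_0.1_1"
+# (model, dataset, filter size, ell, feature size, cell size, subsample).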
+RESULT_SAVE = os.path.join("Results", SAVE_FOLDER) + +if SAVE_MAP: + if not os.path.exists(RESULT_SAVE): + os.makedirs(RESULT_SAVE) + else: + SAVE_FOLDER += time.strftime("_%Y-%m-%d_%H-%M-%S") + RESULT_SAVE = os.path.join("Results", SAVE_FOLDER) + print(f"Save to {RESULT_SAVE}") + +print("Measure Result:", MEAS_RESULT) +print("Save Map :", SAVE_MAP) +print("grid_mask:", GRID_MASK) +print("Pseudo discrete: ", PSEDUO_DISCRETE) +print("with variance inference:", WITH_VARIANCE) +print("Subsampling input points:", SUBSAMPLE) + +# PCA feature reduction functions +down_sampling_fn = None +back_project_fn = None +CATEGORY_CLIP = torch.empty(0, device=device) +# Create segmentation module +if DATASET != 'semantic_kitti': + # clip features + clip_model, preprocess = clip.load("ViT-B/32", device=device) + text = clip.tokenize(CATEGORY).to(device) + with torch.no_grad(): + text_features = clip_model.encode_text(text) + text_features = text_features.to(torch.float32) + CATEGORY_CLIP = text_features / text_features.norm(dim=-1, keepdim=True) + + print(f"category_clip size: {CATEGORY_CLIP.shape}") + + # lseg module + seg_module = Lseg_module(pca_path=PCA_PATH, device=device) + if DOWN_SAMPLE_FEATURE: + down_sampling_fn = seg_module.down_sampling + back_project_fn = seg_module.backproject_to_clip +else: + print("before SPVCNN_Module") + seg_module = SPVCNN_Module(device) + print("after SPVCNN_Module") + +# Load data set +if DATASET == "mp3d": + test_ds = MP3D( + GRID_PARAMS, + INTRINSIC, + segmentation_encode=seg_module.encoding_feature, + pca_downsample=down_sampling_fn, + feature_dir=FEATURE_DIR, + directory=DATA_DIR, + device=device, + latent_size=FEATURE_SIZE, + down_sample_feature=DOWN_SAMPLE_FEATURE, + sequences=SEQUENCES, + raw=RAW_DATA, + grid_mask=GRID_MASK + ) +elif DATASET == 'realworld': + test_ds = RealWorldData( + GRID_PARAMS, + INTRINSIC, + segmentation_encode=seg_module.encoding_feature, + pca_downsample=down_sampling_fn, + feature_dir=FEATURE_DIR, + directory=DATA_DIR, + device=device, + latent_size=FEATURE_SIZE, + down_sample_feature=DOWN_SAMPLE_FEATURE, + sequences=SEQUENCES, + ) +elif DATASET == 'semantic_kitti': + test_ds = KITTI_SPVCNN(device=device, grid_params=GRID_PARAMS) +else: + raise ValueError("Invalid Dataset") + +# Create map object +map_object = GlobalMapContinuous( + torch.tensor([int(p) for p in GRID_PARAMS['grid_size']], dtype=torch.long).to(device), # Grid size + torch.tensor(GRID_PARAMS['min_bound']).to(device), # Lower bound + torch.tensor(GRID_PARAMS['max_bound']).to(device), # Upper bound + FILTER_SIZE, # Filter size + decode=seg_module.decoding_feature, + pca_upsample=back_project_fn, + ell=ELL, + category_feature=CATEGORY_CLIP, + num_classes=NUM_CLASSES, + latent_dim=FEATURE_SIZE, + device=device, # Device + use_relative_pose=USE_RELATIVE_POSE, + pseduo_discrete = PSEDUO_DISCRETE +) + +# result statistics +results = Results() +sequence_class = torch.zeros(map_object.num_classes, device=device) +sequence_int_bki = torch.zeros(map_object.num_classes, device=device) +sequence_int_seg = torch.zeros(map_object.num_classes, device=device) +sequence_un_bki = torch.zeros(map_object.num_classes, device=device) +sequence_un_seg = torch.zeros(map_object.num_classes, device=device) +total_t = 0.0 + +# Iteratively loop through each scan +last_scene = None +last_frame_id = None +seq_dir = None +frame_num = 0 +unlabeld_pc_torch_list = torch.empty(0,3) +pred_labels_list = torch.empty(0) +gt_labels_list = torch.empty(0) + +# for idx in tqdm(range(len(test_ds))): +for idx in 
tqdm(range(0, 10, 1)): +# for idx in tqdm([0,50]): + with torch.no_grad(): + # Load data + pose, points, pred_labels, gt_labels, scene_id, frame_id = test_ds.get_test_item(idx) + + # NOTE: scene_id, frame_id is the id will be processed, curent_id is the id been processed in last iteration + # Reset and mearues result if new subsequence + if scene_id != last_scene: #or (frame_id - 1) != last_frame_id: + if MEAS_RESULT and map_object.global_map is not None and DATASET != "realworld": + # save map + if SAVE_MAP: + SAVE_MAP_PATH = os.path.join(RESULT_SAVE, last_scene) + if not os.path.exists(SAVE_MAP_PATH): + os.makedirs(SAVE_MAP_PATH) + save_map() + + # inference + inference(unlabeld_pc_torch_list, pred_labels_list, gt_labels_list, map_object, results, SAVE_MAP_PATH, WITH_VARIANCE) + + # reset unlabeled pc + unlabeld_pc_torch_list = torch.empty(0,3) + pred_labels_list = torch.empty(0) + gt_labels_list = torch.empty(0) + + map_object.reset_grid() + + # Update pose if not + start_t = time.time() + map_object.propagate(pose) + + # Add points to map + labeled_pc_torch = torch.hstack((points.to(device), pred_labels.to(device))) + + # NOTE: subsample ranomd points for heldout calculation + if DATASET != 'realworld': + # additional processing for comparison with VLMap + if MODEL_NAME == "LatentBKI_vlmap": + # subsample 1% input points + depth_sample_rate = 100 + np.random.seed(42) + shuffle_mask = np.arange(labeled_pc_torch.shape[0]) + np.random.shuffle(shuffle_mask) + shuffle_mask = shuffle_mask[::depth_sample_rate] + labeled_pc_torch = labeled_pc_torch[shuffle_mask, :] + gt_labels = gt_labels[shuffle_mask, :] + pred_labels = pred_labels[shuffle_mask, :] + + # out of range points filter out our pose + pc_global = map_object.camera_to_global(labeled_pc_torch[:,:3]).clone().cpu().numpy() + rows_cols_heights = np.array([base_pos2grid_id_3d(p[0], p[1], p[2]+1.5) for p in pc_global]) # add camera height with 1.5 on z axis + + out_of_range_mask = np.array([out_of_range(row, col, height) for row, col, height in rows_cols_heights]) + labeled_pc_torch = labeled_pc_torch[~out_of_range_mask, :] + pc_global = pc_global[~out_of_range_mask, :] + gt_labels = gt_labels[~out_of_range_mask, :] + pred_labels = pred_labels[~out_of_range_mask, :] + + # close camera points + mask = labeled_pc_torch[:, 0] > 0.1 + mask = torch.logical_and(mask, labeled_pc_torch[:, 0] < 6) + labeled_pc_torch = labeled_pc_torch[mask, :] + + # eval & input the same + gt_labels = gt_labels[mask, :] + pred_labels = pred_labels[mask, :] + unlabeld_pc_torch = labeled_pc_torch[:,:3].clone() + else: + # heldout points + np.random.seed(42) # each data point has a diffrent random seed preventing generate same random index + point_num = labeled_pc_torch.shape[0] + sampled_index = np.random.choice(point_num, int(0.2*point_num), replace=False) + heldout_mask = np.full(point_num, False) + heldout_mask[sampled_index] = True + + # mask for heldout points, eval points set + gt_labels = gt_labels[heldout_mask, :] + pred_labels = pred_labels[heldout_mask, :] + unlabeld_pc_torch = labeled_pc_torch[heldout_mask, :3] + labeled_pc_torch = labeled_pc_torch[~heldout_mask, :] + + # testing to use fewer points for update + if SUBSAMPLE < 1 and SUBSAMPLE > 0: + # use subsample points here + point_num = labeled_pc_torch.shape[0] + sampled_index = np.random.choice(point_num, int(SUBSAMPLE*point_num), replace=False) + mask = np.full(point_num, False) + mask[sampled_index] = True + labeled_pc_torch = labeled_pc_torch[mask, :] + + # TODO: input eval the same # + # 
gt_labels = gt_labels[mask, :] + # pred_labels = pred_labels[mask, :] + # unlabeld_pc_torch = labeled_pc_torch[:, :3].clone() + # TODO: input eval the same # + + # update map using observations + map_object.update_map(labeled_pc_torch) + total_t += time.time() - start_t + + if MEAS_RESULT and DATASET != "realworld": + # decode pred_labels + pred_labels = pred_labels.to(device, non_blocking=True) + if DOWN_SAMPLE_FEATURE: + pred_labels = back_project_fn(pred_labels) + pred_labels = seg_module.decoding_feature(pred_labels, map_object.category_feature) + pred_labels = pred_labels.softmax(dim=-1) + + if pred_labels.shape[1] > 1: + pred_labels = torch.argmax(pred_labels, dim=1) + else: + pred_labels = pred_labels.view(-1) + + # camera frame to global frame + unlabeld_pc_torch = map_object.camera_to_global(unlabeld_pc_torch) + unlabeld_pc_torch_list = torch.vstack((unlabeld_pc_torch_list,unlabeld_pc_torch.detach().cpu())) + pred_labels_list = torch.hstack((pred_labels_list, pred_labels.detach().cpu())) + gt_labels_list = torch.hstack((gt_labels_list, gt_labels.view(-1))) + + last_scene = scene_id + last_frame_id = frame_id + frame_num += 1 + +# post processing +if SAVE_MAP: + SAVE_MAP_PATH = os.path.join(RESULT_SAVE, last_scene) + if not os.path.exists(SAVE_MAP_PATH): + os.makedirs(SAVE_MAP_PATH) + save_map() + +if MEAS_RESULT and DATASET != "realworld": + # if KNN_INFERENCE: + inference(unlabeld_pc_torch_list, pred_labels_list, gt_labels_list, map_object, results, SAVE_MAP_PATH, WITH_VARIANCE) + + with open(os.path.join(RESULT_SAVE, 'result.txt'), 'w') as file: + file.write("Final results:\n") + file.write("Seg:\n") + for i in range(NUM_CLASSES): + file.write(f"{i}: {(sequence_int_seg[i] / sequence_un_seg[i] * 100).item()} ({sequence_int_seg[i]} / {sequence_un_seg[i]})\n") + file.write("BKI:\n") + for i in range(NUM_CLASSES): + file.write(f"{i}: {(sequence_int_bki[i] / sequence_un_bki[i] * 100).item()} ({sequence_int_bki[i]} / {sequence_un_bki[i]})\n") + + file.write("Map_update statistics:\n") + + all_intersections = results.all_intersections[results.all_unions > 0] + all_unions = results.all_unions[results.all_unions > 0] + all_miou = np.mean(all_intersections / all_unions) + file.write(f'Average map accuracy: {results.num_correct/results.num_total}\n') + file.write(f'Map miou: {all_miou}\n') + + all_intersections_seg = results.all_intersections_seg[results.all_unions_seg > 0] + all_unions_seg = results.all_unions_seg[results.all_unions_seg > 0] + all_miou_seg = np.mean(all_intersections_seg / all_unions_seg) + file.write(f'Average segmentation network accuracy: {results.num_correct_seg/results.num_total_seg}\n') + file.write(f'Segmentation network miou: {all_miou_seg}\n') + + + + \ No newline at end of file diff --git a/inference.py b/inference.py new file mode 100644 index 0000000..203afb6 --- /dev/null +++ b/inference.py @@ -0,0 +1,276 @@ +import os +import yaml +os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + +import numpy as np +import torch +import clip + +# Custom Imports +from Data.utils import * +from Models.LatentBKI import * +from Models.mapping_utils import * +from Models.Lseg.Lseg_module import Lseg_module +from Models.SPVCNN.SPVCNN_module import SPVCNN_Module + +# result same as train +class Results(): + def __init__(self) -> None: + self.num_correct = 0 + self.num_total = 0 + self.all_intersections = 0 + self.all_unions = 0 + + self.num_correct_seg = 0 + self.num_total_seg = 0 + self.all_intersections_seg = 0 + self.all_unions_seg = 0 + +def save_map(): + print("Saving Map ...") + 
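+    # each global_map row stores [x, y, z], FEATURE_SIZE latent dims, and confidence in the last column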
features = map_object.global_map[:,3:FEATURE_SIZE+3].to(device) + if DOWN_SAMPLE_FEATURE: + features = back_project_fn(features) + labels = torch.argmax(map_object.decode(features, map_object.category_feature), dim=1, keepdim=True) + confidence = map_object.global_map[:,-1].reshape(-1,1).to(dtype=torch.float32) + global_map = torch.cat((map_object.global_map[:,:3], labels.to(torch.float32).cpu(), confidence), dim=1) + global_map = global_map.numpy() + print(global_map.shape) + np.save(os.path.join(SAVE_MAP_PATH, "global_map.npy"), global_map) + np.save(os.path.join(SAVE_MAP_PATH, "global_map_latent.npy"), map_object.global_map) + +def inference(unlabeld_pc_torch_list, pred_labels_list, gt_labels_list, map_object, results, SAVE_MAP_PATH, with_variance): + # first save the points + torch.save(unlabeld_pc_torch_list, os.path.join(SAVE_MAP_PATH, "unlabeld_pc_torch_list.pt")) + torch.save(pred_labels_list, os.path.join(SAVE_MAP_PATH, "pred_labels_list.pt")) + torch.save(gt_labels_list, os.path.join(SAVE_MAP_PATH, "gt_labels_list.pt")) + + print(f"Inference {current_scene} ... ") + unlabeld_pc_torch_list = unlabeld_pc_torch_list.to(device=device, non_blocking=True) + pred_labels_list = pred_labels_list.to(device=device, non_blocking=True) + gt_labels_list = gt_labels_list.to(device=device, non_blocking=True) + print(gt_labels_list.shape) + features = map_object.label_points_iterative(unlabeld_pc_torch_list, with_variance=with_variance) + torch.save(features.cpu(), os.path.join(SAVE_MAP_PATH, "predcited_features.pt")) # save predicted features, variance, confidence + + category_pred = features[:, -1].to(torch.int64).to(device) + + for i in range(map_object.num_classes): + gt_i = gt_labels_list == i + pred_bki_i = category_pred == i + pred_seg_i = pred_labels_list == i + + sequence_class[i] += torch.sum(gt_i) + sequence_int_bki[i] += torch.sum(gt_i & pred_bki_i) + sequence_int_seg[i] += torch.sum(gt_i & pred_seg_i) + sequence_un_bki[i] += torch.sum(gt_i | pred_bki_i) + sequence_un_seg[i] += torch.sum(gt_i | pred_seg_i) + + # accuracy + correct = torch.sum(category_pred == gt_labels_list).item() + total = gt_labels_list.shape[0] + results.num_correct += correct + results.num_total += total + + # miou + inter, union = iou_one_frame(category_pred, gt_labels_list, n_classes=NUM_CLASSES) + union += 1e-6 + results.all_intersections += inter + results.all_unions += union + + # accuracy_seg + # TODO: remove ignore lables? 
+ correct_seg = torch.sum(pred_labels_list == gt_labels_list).item() + total_seg = gt_labels_list.shape[0] + results.num_correct_seg += correct_seg + results.num_total_seg += total_seg + + # miou_seg + inter_seg, union_seg = iou_one_frame(pred_labels_list, gt_labels_list, n_classes=NUM_CLASSES) + union_seg += 1e-6 + results.all_intersections_seg += inter_seg + results.all_unions_seg += union_seg + + # save statistics + print(f"{current_scene} stats:") + seq_intersections = inter[union > 0] + seq_unions = union[union > 0] + seq_miou = np.mean(seq_intersections / seq_unions) + print(f'Average map accuracy: {correct/total}') + print(f'Map miou: {seq_miou}') + + seq_intersections_seg = inter_seg[union_seg > 0] + seq_unions_seg = union_seg[union_seg > 0] + seq_miou_seg = np.mean(seq_intersections_seg / seq_unions_seg) + print(f'Average segmentation network accuracy: {correct_seg/total_seg}') + print(f'Segmentation network miou: {seq_miou_seg}') + print("") + + with open(os.path.join(SAVE_MAP_PATH, 'result_inference.txt'), 'w') as file: + file.write(f"{current_scene} stats:\n") + seq_intersections = inter[union > 0] + seq_unions = union[union > 0] + seq_miou = np.mean(seq_intersections / seq_unions) + file.write(f'Average map accuracy: {correct/total}\n') + file.write(f'Map miou: {seq_miou}\n') + + seq_intersections_seg = inter_seg[union_seg > 0] + seq_unions_seg = union_seg[union_seg > 0] + seq_miou_seg = np.mean(seq_intersections_seg / seq_unions_seg) + file.write(f'Average segmentation network accuracy: {correct_seg/total_seg}\n') + file.write(f'Segmentation network miou: {seq_miou_seg}\n') + +########################## main script ############################ + +MODEL_NAME = "LatentBKI_default" +# MODEL_NAME = "LatentBKI_realworld" +# MODEL_NAME = "LatentBKI_vlmap" +# MODEL_NAME = "LatentBKI_kitti" +RESULT_SAVE = 'Results/LatentBKI_default_mp3d_3_0.5_64_0.1_1' +# scenes = ['5LpN3gDmAk7_1' , 'gTV8FGcVJC9_1', ] +scenes = ['5LpN3gDmAk7_1' ] +# scenes = ['08'] +WIHT_VARIANCE = False +DISCRETE = True +BATCH_SIZE = 100000 + +print("Model is:", MODEL_NAME) + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +print("device is ", device) + +print("---------------") +print("with variance: ", WIHT_VARIANCE) +print("discrete_knn: ", DISCRETE) +print("result save:", RESULT_SAVE) +print("batch size:", BATCH_SIZE) + +# Model Parameters +model_params_file = os.path.join(os.getcwd(), "Config", MODEL_NAME + ".yaml") +with open(model_params_file, "r") as stream: + try: + model_params = yaml.safe_load(stream) + DATASET = model_params["dataset"] + MEAS_RESULT = model_params["meas_result"] + SAVE_MAP = model_params["save_map"] + ELL = model_params["ell"] + WITH_VARIANCE = model_params['with_variance'] + USE_RELATIVE_POSE = model_params['use_relative_pose'] + PSEDUO_DISCRETE = model_params['pseduo_discrete'] + FILTER_SIZE = model_params["filter_size"] + GRID_PARAMS = model_params["grid_params"] + except yaml.YAMLError as exc: + print(exc) + +# Data Parameters +data_params_file = os.path.join(os.getcwd(), "Config", DATASET + ".yaml") +with open(data_params_file, "r") as stream: + try: + data_params = yaml.safe_load(stream) + NUM_CLASSES = data_params["num_classes"] + DATA_DIR = data_params["data_dir"] + CATEGORY = data_params["category"] + FEATURE_DIR = data_params["feature_dir"] + FEATURE_SIZE = data_params["feature_size"] + RAW_DATA = data_params['raw_data'] + DOWN_SAMPLE_FEATURE = data_params["down_sample_feature"] + PCA_PATH = data_params['pca_path'] + GRID_MASK = data_params['grid_mask'] + 
SUBSAMPLE = data_params['subsample_points'] + SEQUENCES = data_params['sequences'] + INTRINSIC = data_params['intrinsic'] + except yaml.YAMLError as exc: + print(exc) + +# PCA feature reduction functions +down_sampling_fn = None +back_project_fn = None +CATEGORY_CLIP = None +# Create segmentation module +if DATASET != 'semantic_kitti': + # clip features + clip_model, preprocess = clip.load("ViT-B/32", device=device) + text = clip.tokenize(CATEGORY).to(device) + with torch.no_grad(): + text_features = clip_model.encode_text(text) + text_features = text_features.to(torch.float32) + CATEGORY_CLIP = text_features / text_features.norm(dim=-1, keepdim=True) + + print(f"category_clip size: {CATEGORY_CLIP.shape}") + + # lseg module + seg_module = Lseg_module(pca_path=PCA_PATH, device=device) + if DOWN_SAMPLE_FEATURE: + down_sampling_fn = seg_module.down_sampling + back_project_fn = seg_module.backproject_to_clip +else: + seg_module = SPVCNN_Module(device) + +# Create map object +map_object = GlobalMapContinuous( + torch.tensor([int(p) for p in GRID_PARAMS['grid_size']], dtype=torch.long).to(device), # Grid size + torch.tensor(GRID_PARAMS['min_bound']).to(device), # Lower bound + torch.tensor(GRID_PARAMS['max_bound']).to(device), # Upper bound + FILTER_SIZE, # Filter size + decode=seg_module.decoding_feature, + pca_upsample=back_project_fn, + ell=ELL, + category_feature=CATEGORY_CLIP, + num_classes=NUM_CLASSES, + latent_dim=FEATURE_SIZE, + device=device, # Device + use_relative_pose=USE_RELATIVE_POSE, + pseduo_discrete = PSEDUO_DISCRETE +) + +# result statistics +results = Results() +sequence_class = torch.zeros(map_object.num_classes, device=device) +sequence_int_bki = torch.zeros(map_object.num_classes, device=device) +sequence_int_seg = torch.zeros(map_object.num_classes, device=device) +sequence_un_bki = torch.zeros(map_object.num_classes, device=device) +sequence_un_seg = torch.zeros(map_object.num_classes, device=device) +total_t = 0.0 + +# Evaluation Loop +for current_scene in scenes: + # load data + print(f"Processing {current_scene}") + folder = os.path.join(RESULT_SAVE, current_scene) + unlabeld_pc_torch_list = torch.load(f"{folder}/unlabeld_pc_torch_list.pt") + pred_labels_list = torch.load(f"{folder}/pred_labels_list.pt") + gt_labels_list = torch.load(f"{folder}/gt_labels_list.pt") + map_object.global_map = torch.tensor(np.load(f"{folder}/global_map_latent.npy"), dtype=torch.float) + + print("map shape:", map_object.global_map.shape) + + SAVE_MAP_PATH = os.path.join(RESULT_SAVE, current_scene) + if not os.path.exists(SAVE_MAP_PATH): + print(SAVE_MAP_PATH) + os.makedirs(SAVE_MAP_PATH) + + inference(unlabeld_pc_torch_list, pred_labels_list, gt_labels_list, map_object, results, SAVE_MAP_PATH, WITH_VARIANCE) + +# Write result to file +with open(os.path.join(RESULT_SAVE, 'result_inference.txt'), 'w') as file: + file.write("Final results:\n") + file.write("Seg:\n") + for i in range(NUM_CLASSES): + file.write(f"{i}: {(sequence_int_seg[i] / sequence_un_seg[i] * 100).item()} ({sequence_int_seg[i]} / {sequence_un_seg[i]})\n") + file.write("BKI:\n") + for i in range(NUM_CLASSES): + file.write(f"{i}: {(sequence_int_bki[i] / sequence_un_bki[i] * 100).item()} ({sequence_int_bki[i]} / {sequence_un_bki[i]})\n") + + file.write("Map_update statistics:\n") + + all_intersections = results.all_intersections[results.all_unions > 0] + all_unions = results.all_unions[results.all_unions > 0] + all_miou = np.mean(all_intersections / all_unions) + file.write(f'Average map accuracy: 
{results.num_correct/results.num_total}\n') + file.write(f'Map miou: {all_miou}\n') + + all_intersections_seg = results.all_intersections_seg[results.all_unions_seg > 0] + all_unions_seg = results.all_unions_seg[results.all_unions_seg > 0] + all_miou_seg = np.mean(all_intersections_seg / all_unions_seg) + file.write(f'Average segmentation network accuracy: {results.num_correct_seg/results.num_total_seg}\n') + file.write(f'Segmentation network miou: {all_miou_seg}\n') \ No newline at end of file diff --git a/ipca.py b/ipca.py new file mode 100644 index 0000000..ee0318a --- /dev/null +++ b/ipca.py @@ -0,0 +1,195 @@ +import os +import sys +import pandas as pd +import numpy as np +import json +import time +import seaborn as sns +import clip +import torch +import torch.nn.functional as F +import matplotlib.pyplot as plt +import pickle +from mpl_toolkits.mplot3d import Axes3D +from sklearn.decomposition import PCA + +from sklearn.preprocessing import StandardScaler + + +from PCAonGPU.gpu_pca import IncrementalPCAonGPU + + +def mean_iou(ground_truth, prediction, num_classes): + iou_list = [] + + for cls in range(num_classes): + # Create binary masks for the current class + gt_mask = (ground_truth == cls) + pred_mask = (prediction == cls) + + # Calculate intersection and union + intersection = np.logical_and(gt_mask, pred_mask).sum() + union = np.logical_or(gt_mask, pred_mask).sum() + + if union == 0: + # Avoid division by zero, consider IoU for this class as 1 if both are empty + iou = 1 if intersection == 0 else 0 + else: + iou = intersection / union + + iou_list.append(iou) + + # Calculate mean IoU + mean_iou = np.mean(iou_list) + return mean_iou + +def write_miou(ground_truth, prediction, num_classes, file_path): + with open(file_path, mode='a') as file: + miou = mean_iou(ground_truth, prediction, num_classes) + file.write(f"{miou}\n") + + + +def get_sim_mat(batch_size, bp_data, label_features): + num_batches = (bp_data.size(0) + batch_size - 1) // batch_size + similarity_matrices = [] + for i in range(num_batches): + start_idx = i * batch_size + end_idx = min(start_idx + batch_size, bp_data.size(0)) + batch_bp_data = bp_data[start_idx:end_idx] + similarity_matrix = F.cosine_similarity(batch_bp_data.unsqueeze(1), label_features.unsqueeze(0), dim=2) + similarity_matrices.append(similarity_matrix) + + similarity_matrix = torch.cat(similarity_matrices, dim=0) + print("similarity_matrix.shape: ", similarity_matrix.shape) + return similarity_matrix + + +def get_flattened_data(lseg_dir, npy_file, device): + data = np.load(os.path.join(lseg_dir, npy_file)) + print("data.shape: ", data.shape) + + data = data.squeeze(0) + print("data.shape: ", data.shape) + + flattened_data = data.reshape(data.shape[0], -1) + print("flattened_data.shape: ", flattened_data.shape) + + flattened_data = flattened_data.T + print("flattened_data.shape: ", flattened_data.shape) + + flattened_data = torch.tensor(flattened_data).to(device) + + return flattened_data + + +def main(): + device = "cuda" if torch.cuda.is_available() else "cpu" + #device = "cpu" + print("device: ", device) + model, _ = clip.load("ViT-B/32", device) + + + # initialize ipca + target_dimension = 128 + if len(sys.argv) > 1: + target_dimension = int(sys.argv[1]) + print(f"Received argument: {target_dimension}") + else: + print("No arguments provided") + ipca = IncrementalPCAonGPU(n_components=target_dimension) + + # path of data + data_dir = '/workspace/sdh1/vlmaps_data_dir/vlmaps_dataset' + data_dir2 = '/workspace/sdb1/vlmaps_data_dir/vlmaps_dataset' + 
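+    # LSeg features are read from data_dir2; the fitted IPCA model and PCA outputs are written under data_dir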
sequence_name = 'gTV8FGcVJC9_1' + #sequence_dir = os.path.join(data_dir, sequence_name) + lseg_dir = os.path.join(data_dir2, f'{sequence_name}/lseg_feature') + #pred_dir = os.path.join(data_dir, f'{sequence_name}/lseg_pred') + npy_files = sorted([f for f in os.listdir(lseg_dir) if f.endswith('.npy')]) + + pca_feature_dir = os.path.join(data_dir, f'{sequence_name}/ipca_feature_{target_dimension}') + pca_pred_dir = os.path.join(data_dir, f'{sequence_name}/ipca_pred_{target_dimension}') + pca_miou_dir = os.path.join(data_dir, f'{sequence_name}/ipca_miou_{target_dimension}') + pca_save_dir = os.path.join(data_dir, f'{sequence_name}/ipca_{target_dimension}') + pca_save_path = os.path.join(pca_save_dir, 'ipca.pkl') + os.makedirs(pca_feature_dir, exist_ok=True) + os.makedirs(pca_pred_dir, exist_ok=True) + os.makedirs(pca_miou_dir, exist_ok=True) + os.makedirs(pca_save_dir, exist_ok=True) + # initialize labels + labels = ["void","wall","floor","chair", + "door","table","picture","cabinet", + "cushion","window","sofa","bed", + "curtain","chest_of_drawers","plant", + "sink","stairs","ceiling","toilet", + "stool","towel","mirror","tv_monitor", + "shower","column","bathtub","counter", + "fireplace","lighting","beam","railing", + "shelving","blinds","gym_equipment", + "seating","board_panel","furniture", + "appliances","clothes","objects",] + print("len(labels): ", len(labels)) + label_token = clip.tokenize(labels).to(device) + print("label_token.shape: ", label_token.shape) + with torch.no_grad(): + label_features = model.encode_text(label_token) + print("label_features.shape: ", label_features.shape) + h = 720 + w = 1080 + + for i, npy_file in enumerate(npy_files): + + print(i, " start") + flattened_data = get_flattened_data(lseg_dir, npy_file, device) + ipca.partial_fit(flattened_data) + print("partial fit ", i) + + with open(pca_save_path, 'wb') as file: + pickle.dump(ipca, file) + print("IPCA model saved.") + + for i, npy_file in enumerate(npy_files): + + feature_path = os.path.join(pca_feature_dir, f'{i:06d}.pt') + pred_path = os.path.join(pca_pred_dir, f'{i:06d}.pt') + + print(i, "start") + flattened_data = get_flattened_data(lseg_dir, npy_file, device) + pca_data = ipca.transform(flattened_data) + + # back project to 512 dimension + bp_data = ipca.inverse_transform(pca_data) + print('bp_data.shape: ', bp_data.shape) + + # save pca features + pca_data = pca_data.transpose(0, 1) + print('pca_data.shape: ', pca_data.shape) + pca_data = pca_data.reshape(pca_data.shape[0], h, w) + print('pca_data.shape: ', pca_data.shape) + pca_data = pca_data.unsqueeze(0) + print('pca_data.shape: ', pca_data.shape) + if not os.path.exists(feature_path): + torch.save(pca_data, feature_path) + print("save pca_data") + + similarity_matrix = get_sim_mat(1000, bp_data, label_features) + + print("similarity_matrix.shape: ", similarity_matrix.shape) + prediction_probs = F.softmax(similarity_matrix, dim=1) + print("prediction_probs.shape: ", prediction_probs.shape) + predictions = torch.argmax(prediction_probs, dim=1) + print("predictions.shape: ", predictions.shape) + predictions = predictions.reshape(h, w) + print("predictions.shape: ", predictions.shape) + + # save pca predictions + if not os.path.exists(pred_path): + torch.save(predictions, pred_path) + print("save predictions") + + print(i, "finished") + + +if __name__ == "__main__": + main() diff --git a/ipca_miou.py b/ipca_miou.py new file mode 100644 index 0000000..24ea212 --- /dev/null +++ b/ipca_miou.py @@ -0,0 +1,83 @@ +import os +import pandas as pd +import 
numpy as np +import sys +import json +import time +import seaborn as sns +import clip +import torch +import torch.nn.functional as F +import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D +from sklearn.decomposition import PCA +from sklearn.decomposition import IncrementalPCA +from sklearn.preprocessing import StandardScaler +from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity +from factor_analyzer.factor_analyzer import calculate_kmo + + +def main(): + device = "cuda" if torch.cuda.is_available() else "cpu" + print("device: ", device) + + # path of data + data_dir = '/workspace/sdb1/vlmaps_data_dir/vlmaps_dataset' + sequence_name = 'gTV8FGcVJC9_1' + gt_dir = os.path.join(data_dir, f'{sequence_name}/semantic') + gt_files = sorted([f for f in os.listdir(gt_dir) if f.endswith('.npy')]) + + target_dimension = 64 + if len(sys.argv) > 1: + target_dimension = int(sys.argv[1]) + print(f"Received argument: {target_dimension}") + else: + print("No arguments provided") + + num_classes = 40 + + pca_pred_dir = os.path.join(data_dir, f'{sequence_name}/ipca_pred_{target_dimension}') + pca_miou_dir = os.path.join(data_dir, f'{sequence_name}/ipca_miou_{target_dimension}/miou.txt') + #pca_pred_dir = os.path.join(data_dir, f'{sequence_name}/lseg_pred') + #pca_miou_dir = os.path.join(data_dir, f'{sequence_name}/ipca_miou_{target_dimension}/miou.txt') + + iou_list = [] + for cls in range(num_classes): + intersection = 0 + union = 0 + + for i, gt_file in enumerate(gt_files): + gd_pred = np.load(os.path.join(gt_dir, gt_file)) + root, _ = os.path.splitext(os.path.join(pca_pred_dir, gt_file)) + pca_pred = torch.load(root + '.pt') + + pca_pred = pca_pred.to('cpu').numpy() + + gt_mask = (gd_pred == cls) + pred_mask = (pca_pred == cls) + + + # Calculate intersection and union + intersection += np.logical_and(gt_mask, pred_mask).sum() + union += np.logical_or(gt_mask, pred_mask).sum() + #print("intersections: ", intersection) + #print("union: ", union) + #time.sleep(2) + + if union == 0: + # Avoid division by zero, consider IoU for this class as 1 if both are empty + iou = 1 if intersection == 0 else 0 + else: + iou = intersection / union + iou_list.append(iou) + print(cls) + + # Calculate mean IoU + mean_iou = np.mean(iou_list) + with open(pca_miou_dir, mode='a') as file: + file.write(f"{mean_iou}\n") + + + +if __name__ == "__main__": + main() diff --git a/open_vocabulary_search.py b/open_vocabulary_search.py new file mode 100644 index 0000000..04f53db --- /dev/null +++ b/open_vocabulary_search.py @@ -0,0 +1,230 @@ + +import clip +import os +import torch +import yaml +import rospy +import numpy as np +from tqdm import tqdm + +from Models.Lseg.Lseg_module import Lseg_module +from torch.distributions.studentT import StudentT +# from pyquaternion import Quaternion +from visualization_msgs.msg import Marker +from visualization_msgs.msg import MarkerArray +from geometry_msgs.msg import Point32 +from std_msgs.msg import ColorRGBA +from matplotlib import pyplot as plt +from matplotlib.colors import ListedColormap + +class OpenQuerier(): + def __init__(self, latent_map_path, latent_size, device, pca_path, grid_params, threshold) -> None: + self.device = device + self.seg_module = Lseg_module(pca_path=pca_path, device=self.device) + self.clip_model, _ = clip.load("ViT-B/32", device=self.device) + self.latent_map = np.load(latent_map_path) + + # Flip from OpenGL coordinate to x forward coordinate + # q_xforward = Quaternion([0.5, 0.5, -0.5, -0.5, ]) + # self.latent_map[:,:3] = 
(q_xforward.rotation_matrix @ self.latent_map[:,:3].T).T + self.latent_map[:,:3] = self.latent_map[:,:3] + self.latent_map = torch.tensor(self.latent_map) + + self.latent_size = latent_size + self.max_dim = grid_params["max_bound"] + self.min_dim = grid_params["min_bound"] + self.grid_dims = grid_params["grid_size"] + self.threshold = threshold + + self.heatmap_publisher = rospy.Publisher("/Open_Query/Heatmap",MarkerArray, queue_size=10) + self.uncertainty_publisher = rospy.Publisher("/Open_Query/Uncertainty",MarkerArray, queue_size=10) + + def sampling_for_variance(self, t_v, t_mean, t_variance, category_features, batch_size = 1000, sample_size = 30): + N = t_v.shape[0] + logits_variances_list = torch.empty(0,1) + + for start in tqdm(range(0, N, batch_size)): + end = min(start+batch_size, N) + + distribution = StudentT(df=t_v[start:end], loc=t_mean[start:end], scale=t_variance[start:end]) + sampled_features = distribution.sample(torch.zeros(sample_size).shape).permute(1,0,2) # (B, m, latent_size) + + # decode into 40 categories + category_logits = (self.seg_module.backproject_to_clip(sampled_features.to(self.device)) @ category_features.T).cpu() # (B, m, # of category) + + # calculate variance in logits space + difference_square = (category_logits - category_logits.mean(dim=1, keepdim=True)).pow(2) + logits_variance = (difference_square / (sample_size - 1)).sum(dim=1) # (B, # of category) + # print(logits_variance.shape) + + if difference_square.sum(dim=1).isinf().any(): + raise + + # add to list + logits_variances_list = torch.vstack((logits_variances_list, logits_variance.reshape(-1,1))) + + # clean cache if needed + torch.cuda.empty_cache() + + return logits_variances_list + + def sample_uncertainty(self, category_features): + t_v = self.latent_map[:,-1].reshape(-1,1) + wishart_variance = self.latent_map[:, 3+64:3+64*2] + t_variance = (t_v + 1) / (t_v * t_v) * wishart_variance + t_mean = self.latent_map[:, 3:3+64] + # take confidence > 2, t distribution variance will only be effective when > 2, otherwise undefined + mask = (t_v > 2).reshape(-1) + t_variance = t_variance[mask] + t_mean = t_mean[mask] + t_v = t_v[mask] + xyz = self.latent_map[:,:3][mask] + + logits_variances_list = self.sampling_for_variance(t_v, t_mean, t_variance, category_features, sample_size=30, batch_size=10000) + # global_map_variance = torch.hstack((xyz, per_voxel_logits_variance.reshape(-1,1))) + return xyz, logits_variances_list.reshape(-1,1) + + def heatmap_to_marker(self, xyz, score, ns): + + score -= torch.min(score) + score /= torch.max(score) + markerArray = MarkerArray() + + # only publish map that's greater than threshold + # score_mask = (score > 0.8).reshape(-1,) + # xyz = xyz[score_mask] + # score = score[score_mask] + + print("Creating ros message") + marker = Marker() + marker.id = 2 + marker.ns = ns + marker.header.frame_id = "map" # change this to match model + scene name LMSC_000001 + marker.type = marker.CUBE_LIST + marker.action = marker.ADD + marker.header.stamp = rospy.Time.now() + + marker.pose.orientation.x = 0.0 + marker.pose.orientation.y = 0.0 + marker.pose.orientation.z = 0.0 + marker.pose.orientation.w = 1 + + marker.scale.x = (self.max_dim[0] - self.min_dim[0]) / self.grid_dims[0] + marker.scale.y = (self.max_dim[1] - self.min_dim[1]) / self.grid_dims[1] + marker.scale.z = (self.max_dim[2] - self.min_dim[2]) / self.grid_dims[2] + + for i in range(xyz.shape[0]): + + point = Point32() + color = ColorRGBA() + point.x = xyz[i, 0] + point.y = xyz[i, 1] + point.z = xyz[i, 2] + var = 2 * 
score[i].squeeze() + color.r = max(0, var - 1) + color.b = max(0, 1 - var) + color.g = 1 - color.r - color.b + color.a = 1.0 + + if ns == "Open_Query_Heatmap": + cmap = plt.cm.get_cmap('plasma', 11) + else: + cmap = plt.cm.get_cmap('viridis', 11) + listed_cmap = ListedColormap(cmap(np.arange(11))) + + var = score[i].squeeze() + idx = int(var / 0.1) + color.r, color.g, color.b, color.a = listed_cmap(idx) + + marker.points.append(point) + marker.colors.append(color) + + markerArray.markers.append(marker) + return markerArray + + def query(self, str, with_uncertainty = False): + text = clip.tokenize(str).to(self.device) + with torch.no_grad(): + text_features = self.clip_model.encode_text(text) + clip_text = text_features / text_features.norm(dim=1, keepdim=True) + clip_text = clip_text.to(torch.float32) + + if self.latent_size < 512: + latent_feature = self.seg_module.backproject_to_clip(self.latent_map[:,3:3+self.latent_size]) # (n, 512) + else: + latent_feature = self.latent_map[:,3:3+self.latent_size] + + # compute similarity + latent_feature /= latent_feature.norm(dim=1, keepdim=True) + score = (latent_feature @ clip_text.T).cpu() # (-1 to 1) + print(score.min(), score.max()) + score[score <= self.threshold] = self.threshold # cut off value for better visual + + # publish heat map + query_result = self.heatmap_to_marker(self.latent_map[:,:3], score, "Open_Query_Heatmap") + print("Published heatmap!") + self.heatmap_publisher.publish(query_result) + + if with_uncertainty: + xyz, uncertainty = self.sample_uncertainty(clip_text) + # crop out too high uncertatinty for visualization + sorted_uncertainty = sorted(uncertainty) + value = sorted_uncertainty[int(len(sorted_uncertainty) * 0.95)] # ascending order + uncertainty[uncertainty > value] = value + # crop out too high uncertatinty for visualization + uncertainty_result = self.heatmap_to_marker(xyz, uncertainty.cpu(), "Open_Query_Uncertainty") + print("Published uncertainty!") + self.uncertainty_publisher.publish(uncertainty_result) + +def main(): + # TODO: modify the model and path to the map you want to query + MODEL_NAME = "LatentBKI_realworld" + latent_map_path = "/Users/multyxu/Desktop/Programming/LatentBKI/Results/real_world/my_house_long/global_map_latent.npy" + threshold = 0.8 + device = ("cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else 'cpu')) + + model_params_file = os.path.join(os.getcwd(), "Config", MODEL_NAME + ".yaml") + with open(model_params_file, "r") as stream: + try: + model_params = yaml.safe_load(stream) + dataset = model_params["dataset"] + GRID_PARAMS = model_params["grid_params"] + except yaml.YAMLError as exc: + print(exc) + + data_params_file = os.path.join(os.getcwd(), "Config", dataset + ".yaml") + with open(data_params_file, "r") as stream: + try: + data_params = yaml.safe_load(stream) + FEATURE_SIZE = data_params["feature_size"] + PCA_PATH = data_params['pca_path'] + except yaml.YAMLError as exc: + print(exc) + + # PCA_PATH = '/Users/multyxu/Desktop/Programming/LatentBKI/Results/real_world/64_state_dict.pt' # manually set on macbook + + print("Init querier...") + querier = OpenQuerier(latent_map_path, FEATURE_SIZE, device, PCA_PATH, GRID_PARAMS, threshold) + rospy.init_node('Open_vocabulary_demo', anonymous=True) + + while not rospy.is_shutdown(): + word = input("What's te word you want to query? (enter 'q' to quit) ") + if word == 'q': + print("Ending query session...") + break + with_uncertainty = input("With Uncertainty? 
(True or False, enter 'q' to quit)") + if with_uncertainty == "True": + with_uncertainty = True + else: + with_uncertainty = False + if with_uncertainty == 'q': + print("Ending query session...") + break + print("Querying for:", word, "With uncertainty = ",with_uncertainty) + querier.query(word, with_uncertainty) + rospy.sleep(1) + +if __name__ == '__main__': + main() + + \ No newline at end of file diff --git a/publish_map.py b/publish_map.py new file mode 100644 index 0000000..c2e6485 --- /dev/null +++ b/publish_map.py @@ -0,0 +1,187 @@ +import rospy +import yaml +import os +import numpy as np +import torch +import roslib; roslib.load_manifest('visualization_marker_tutorials') +from matplotlib import pyplot as plt +from matplotlib.colors import ListedColormap +from visualization_msgs.msg import Marker +from visualization_msgs.msg import MarkerArray +from geometry_msgs.msg import Point32 +from std_msgs.msg import ColorRGBA + +# view variance map +latent_map = np.load("/Users/multyxu/Desktop/Programming/LatentBKI/Results/5L/global_map_latent.npy") +category_map = np.load("/Users/multyxu/Desktop/Programming/LatentBKI/Results/5L/global_map.npy") + +# sampled variance map +# latent_map = torch.load("/Users/multyxu/Desktop/Programming/LatentBKI/Results/5L/global_map_variance.pt").numpy() +# category_map = torch.load("/Users/multyxu/Desktop/Programming/LatentBKI/Results/5L/global_map_category.pt").numpy() + +def create_marker(marker_ns, xyz, value, min_dim, max_dim, grid_dims, colors): + ''' + xyz: (N, 3) + value: (N, ) + ''' + marker = Marker() + marker.id = 1 + marker.ns = "Global_Semantic_Map" # + marker.ns = marker_ns + marker.header.frame_id = "map" # change this to match model + scene name LMSC_000001 + marker.type = marker.CUBE_LIST + marker.action = marker.ADD + marker.header.stamp = rospy.Time.now() + + marker.pose.orientation.x = 0.0 + marker.pose.orientation.y = 0.0 + marker.pose.orientation.z = 0.0 + marker.pose.orientation.w = 1 + + marker.scale.x = (max_dim[0] - min_dim[0]) / grid_dims[0] + marker.scale.y = (max_dim[1] - min_dim[1]) / grid_dims[1] + marker.scale.z = (max_dim[2] - min_dim[2]) / grid_dims[2] + + for i in range(xyz.shape[0]): + point = Point32() + color = ColorRGBA() + point.x = xyz[i, 0] + point.y = xyz[i, 1] + point.z = xyz[i, 2] + color.r, color.g, color.b, color.a = colors(value[i]) + + marker.points.append(point) + marker.colors.append(color) + + return marker + +def create_semantic_marker(global_map, num_class, min_dim, max_dim, grid_dims): + ''' + global_map: (N, 3+1+1) + ''' + markerArray = MarkerArray() + + cmap = plt.cm.get_cmap('jet', num_class) + listed_cmap = ListedColormap(cmap(np.arange(num_class))) + + print("Creating marker for semantic with map shape of", global_map.shape) + semantic_labels = global_map[:,3].astype(np.int32).reshape(-1,) + centroids = global_map[:, :3] + + marker = create_marker("Global_Semantic_Map", centroids, semantic_labels, min_dim, max_dim, grid_dims, listed_cmap) + markerArray.markers.append(marker) + + return markerArray + +def create_variance_marker(global_map, min_dim, max_dim, grid_dims): + markerArray = MarkerArray() + + cmap = plt.cm.get_cmap('viridis', 11) + listed_cmap = ListedColormap(cmap(np.arange(11))) + + print("Creating marker for variance with map shape of", global_map.shape) + + xyz = global_map[:,:3] + variance = global_map[:, 3] + variance -= np.min(variance) + variance /= np.max(variance) # make it between 0-1 + variance = (variance / 0.1).astype(np.int32) + + marker = create_marker("Global_Variance_Map", 
xyz, variance, min_dim, max_dim, grid_dims, listed_cmap) + markerArray.markers.append(marker) + + return markerArray + +############## main script ################ + +# TODO: change these +MODEL_NAME = "LatentBKI_default" +SAMPLEED = True # view sampled variance map +D_OP = True # view variance in latent space +MAP_DIR = "/Users/multyxu/Desktop/Programming/LatentBKI/Results/gT" +# TODO: change these + +publisher = rospy.Publisher('visualization_marker_array', MarkerArray, queue_size=10) +rospy.init_node('register') + +# load map from memory +if SAMPLEED: + # TODO: need to preprocess the variance map + latent_map = torch.load(os.path.join(MAP_DIR,"global_map_variance.pt")).numpy() + category_map = torch.load(os.path.join(MAP_DIR,"global_map_category.pt")).numpy() +else: + latent_map = np.load(os.path.join(MAP_DIR,"global_map_latent.npy")) + category_map = np.load(os.path.join(MAP_DIR,"global_map.npy")) + +model_params_file = os.path.join(os.getcwd(), "Config", MODEL_NAME + ".yaml") +with open(model_params_file, "r") as stream: + try: + model_params = yaml.safe_load(stream) + DATASET = model_params["dataset"] + GRID_PARAMS = model_params["grid_params"] + MIN_BOUND = np.array(GRID_PARAMS["min_bound"]) + MAX_BOUND = np.array(GRID_PARAMS["max_bound"]) + GRID_SIZE = np.array(GRID_PARAMS["grid_size"]) + except yaml.YAMLError as exc: + print(exc) + +data_params_file = os.path.join(os.getcwd(), "Config", DATASET + ".yaml") +with open(data_params_file, "r") as stream: + try: + data_params = yaml.safe_load(stream) + FEATURE_SIZE = data_params["feature_size"] + PCA_PATH = data_params['pca_path'] + NUM_CLASS = data_params['num_classes'] + except yaml.YAMLError as exc: + print(exc) + +# mask out ceiling +if DATASET == "mp3d": + mask = category_map[:,3] != 17 # in MP3D, 17 is ceiling + latent_map = latent_map[mask] + category_map = category_map[mask] + +# process variance +if SAMPLEED: + xyz = latent_map[:,:3] + variance = latent_map[:,4].reshape(-1,1) +else: + # convert into Multivariate Student-t Distribution per voxel + pred_confidence = latent_map[:,-1].reshape(-1,1) + pred_variance = latent_map[:,3+64: 3+64*2] + # mean remain the same + t_variance = (pred_confidence + 1) / (pred_confidence * pred_confidence) * pred_variance + t_mean = latent_map[:, 3:3+64] + t_v = pred_confidence + + confidence_mask = (t_v > 2).reshape(-1) # variance is effective when v > 2? + t_variance = t_variance[confidence_mask] + t_mean = t_mean[confidence_mask] + t_v = t_v[confidence_mask] + t_xyz = latent_map[:,:3][confidence_mask] + + if D_OP: + t_variance_norm = np.sum(np.log(t_variance), axis=-1, keepdims=True) / t_variance.shape[-1] + t_variance_norm = np.exp(t_variance_norm) + else: + # E-OP + t_variance_norm = np.max(np.abs(t_variance), axis=-1, keepdims=True) + + xyz = t_xyz + variance = t_variance_norm + +print(variance.max(), variance.min(), variance.mean()) + +# cut off points for better visuals +sorted_uncertainty = sorted(variance) +value = sorted_uncertainty[int(len(sorted_uncertainty) * 0.95)] # ascending order +variance[variance > value] = value +variance_map = np.hstack((xyz, variance)) + +semantic_marker = create_semantic_marker(category_map, NUM_CLASS, MIN_BOUND, MAX_BOUND, GRID_SIZE) +variance_marker = create_variance_marker(variance_map, MIN_BOUND, MAX_BOUND, GRID_SIZE) + +while not rospy.is_shutdown(): + publisher.publish(semantic_marker) + publisher.publish(variance_marker) + rospy.sleep(3) \ No newline at end of file
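
The evaluation code above (the inference result writer and ipca_miou.py) accumulates per-class intersection and union counts across frames and reports mIoU only over classes that actually appear (the `all_unions > 0` mask). A minimal, self-contained sketch of that bookkeeping, using synthetic label maps and hypothetical array names:

import numpy as np

def accumulate_iou(gt, pred, num_classes, intersections, unions):
    """Add one frame's per-class intersection/union counts to running totals."""
    for cls in range(num_classes):
        gt_mask = (gt == cls)
        pred_mask = (pred == cls)
        intersections[cls] += np.logical_and(gt_mask, pred_mask).sum()
        unions[cls] += np.logical_or(gt_mask, pred_mask).sum()

num_classes = 40
intersections = np.zeros(num_classes, dtype=np.int64)
unions = np.zeros(num_classes, dtype=np.int64)

rng = np.random.default_rng(0)
for _ in range(5):  # stand-in frames; real code loads semantic/*.npy and ipca_pred_*/NNNNNN.pt
    gt = rng.integers(0, num_classes, size=(720, 1080))
    pred = rng.integers(0, num_classes, size=(720, 1080))
    accumulate_iou(gt, pred, num_classes, intersections, unions)

# mIoU over observed classes only, mirroring the `all_unions > 0` masking
valid = unions > 0
miou = np.mean(intersections[valid] / unions[valid])
print(f"mIoU: {miou:.4f}")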
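
ipca.py fits an incremental PCA over flattened LSeg features in one pass, then projects each frame down, back-projects it to 512-D, and classifies pixels by cosine similarity against CLIP label embeddings. A sketch of the same round trip, assuming scikit-learn's IncrementalPCA in place of the repo's GPU variant; the feature arrays and label embeddings below are synthetic stand-ins:

import numpy as np
import torch
import torch.nn.functional as F
from sklearn.decomposition import IncrementalPCA

rng = np.random.default_rng(0)
# stand-ins for per-pixel LSeg features (N, 512) and CLIP text embeddings from encode_text
frames = [rng.standard_normal((4096, 512)).astype(np.float32) for _ in range(3)]
label_features = torch.randn(40, 512)

# pass 1: incrementally fit the reducer, one frame at a time
ipca = IncrementalPCA(n_components=64)
for feats in frames:
    ipca.partial_fit(feats)

# pass 2: reduce, back-project, and classify each pixel against the labels
for feats in frames:
    reduced = ipca.transform(feats)                    # (N, 64)
    back_projected = ipca.inverse_transform(reduced)   # (N, 512)
    bp = torch.from_numpy(back_projected).float()

    # batched cosine similarity keeps memory bounded, as in get_sim_mat
    sims = []
    for start in range(0, bp.shape[0], 1000):
        chunk = bp[start:start + 1000]
        sims.append(F.cosine_similarity(chunk.unsqueeze(1), label_features.unsqueeze(0), dim=2))
    sim = torch.cat(sims, dim=0)                       # (N, 40)
    pred = sim.argmax(dim=1)                           # per-pixel class index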
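
Both open_vocabulary_search.py and publish_map.py read the saved latent map as [xyz | 64-D mean | 64-D variance | confidence] and convert the stored variance into a per-voxel Student-t variance, keeping only voxels whose confidence exceeds 2. A numpy sketch of that conversion plus the D-OP / E-OP scalar summaries and the 95th-percentile cutoff used for visualization; the column layout follows the scripts above, the data is synthetic:

import numpy as np

latent_dim = 64
rng = np.random.default_rng(0)
num_voxels = 1000

# stand-in for global_map_latent.npy: [x y z | mean(64) | variance(64) | confidence]
latent_map = np.hstack([
    rng.uniform(-5, 5, (num_voxels, 3)),
    rng.standard_normal((num_voxels, latent_dim)),
    rng.uniform(0.1, 1.0, (num_voxels, latent_dim)),
    rng.uniform(0.5, 10.0, (num_voxels, 1)),
])

t_v = latent_map[:, -1:]                                        # confidence / degrees of freedom
wishart_variance = latent_map[:, 3 + latent_dim: 3 + 2 * latent_dim]
t_mean = latent_map[:, 3:3 + latent_dim]
t_variance = (t_v + 1) / (t_v * t_v) * wishart_variance

# Student-t variance is only well defined for v > 2
mask = (t_v > 2).reshape(-1)
xyz, t_mean, t_variance = latent_map[mask, :3], t_mean[mask], t_variance[mask]

# D-OP: geometric mean of the per-dimension variances (log-mean-exp)
d_op = np.exp(np.mean(np.log(t_variance), axis=-1, keepdims=True))
# E-OP: worst-case single dimension
e_op = np.max(np.abs(t_variance), axis=-1, keepdims=True)

# clip the top 5% for better visuals, as in publish_map.py
cutoff = np.sort(d_op, axis=0)[int(0.95 * d_op.shape[0])]
d_op = np.minimum(d_op, cutoff)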
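
The query() method in open_vocabulary_search.py embeds the prompt with CLIP, lifts the 64-D voxel features back to CLIP space when the latent size is below 512, and scores every voxel by cosine similarity, clamping low scores for a cleaner heatmap. A reduced sketch of that scoring path; the back-projection here is a fixed random matrix standing in for the PCA inverse transform, and the voxel features are synthetic:

import clip
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, _ = clip.load("ViT-B/32", device=device)

torch.manual_seed(0)
voxel_latent = torch.randn(5000, 64)      # hypothetical per-voxel latent means
fake_backproject = torch.randn(64, 512)   # stand-in for backproject_to_clip / ipca.inverse_transform

def query_scores(prompt: str, threshold: float = 0.8) -> torch.Tensor:
    """Return one cosine-similarity score per voxel for a free-form text prompt."""
    tokens = clip.tokenize([prompt]).to(device)
    with torch.no_grad():
        text = clip_model.encode_text(tokens).float()
    text = (text / text.norm(dim=1, keepdim=True)).cpu()   # (1, 512), unit norm

    feats = voxel_latent @ fake_backproject                 # lift 64 -> 512
    feats = feats / feats.norm(dim=1, keepdim=True)
    scores = (feats @ text.T).squeeze(1)                    # (N,) in [-1, 1]

    # clamp low scores so the heatmap emphasizes confident matches
    return scores.clamp(min=threshold)

scores = query_scores("sofa")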
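
The interactive loop at the end of open_vocabulary_search.py converts the uncertainty answer to a bool before comparing it to 'q', so that second quit path cannot trigger. A small sketch of one way to order those checks, with a hypothetical run_query callback standing in for OpenQuerier.query:

def query_loop(run_query):
    """Prompt for a word and an uncertainty flag until the user quits."""
    while True:
        word = input("What's the word you want to query? (enter 'q' to quit) ")
        if word == 'q':
            print("Ending query session...")
            break
        answer = input("With uncertainty? (True or False, enter 'q' to quit) ")
        if answer == 'q':                       # quit check happens on the raw string
            print("Ending query session...")
            break
        with_uncertainty = (answer == "True")   # only then convert to bool
        print("Querying for:", word, "with uncertainty =", with_uncertainty)
        run_query(word, with_uncertainty)

if __name__ == "__main__":
    # stand-in callback; the real script calls querier.query(word, with_uncertainty)
    query_loop(lambda word, unc: print(f"(would query '{word}', uncertainty={unc})"))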
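
publish_map.py renders the voxel map as a single CUBE_LIST marker whose cube size is (max_bound - min_bound) / grid_size per axis and whose colors come from a discretized matplotlib colormap. A trimmed sketch of building such a marker array; the bounds, labels, and node/topic names in the usage comment are illustrative only, and geometry_msgs Point is used for the marker points:

import numpy as np
import rospy
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
from visualization_msgs.msg import Marker, MarkerArray
from geometry_msgs.msg import Point
from std_msgs.msg import ColorRGBA

def voxels_to_marker_array(xyz, labels, num_classes, min_bound, max_bound, grid_size, ns):
    """Pack voxel centroids and integer labels into a single CUBE_LIST marker."""
    cmap = ListedColormap(plt.cm.get_cmap('jet', num_classes)(np.arange(num_classes)))

    marker = Marker()
    marker.id = 1
    marker.ns = ns
    marker.header.frame_id = "map"
    marker.header.stamp = rospy.Time.now()
    marker.type = Marker.CUBE_LIST
    marker.action = Marker.ADD
    marker.pose.orientation.w = 1.0

    # cube edge length = map extent divided by number of voxels along each axis
    marker.scale.x = (max_bound[0] - min_bound[0]) / grid_size[0]
    marker.scale.y = (max_bound[1] - min_bound[1]) / grid_size[1]
    marker.scale.z = (max_bound[2] - min_bound[2]) / grid_size[2]

    for i in range(xyz.shape[0]):
        marker.points.append(Point(x=float(xyz[i, 0]), y=float(xyz[i, 1]), z=float(xyz[i, 2])))
        r, g, b, a = [float(c) for c in cmap(int(labels[i]))]
        marker.colors.append(ColorRGBA(r=r, g=g, b=b, a=a))

    marker_array = MarkerArray()
    marker_array.markers.append(marker)
    return marker_array

# usage (needs a running ROS master; values are hypothetical):
# rospy.init_node('map_viewer')
# pub = rospy.Publisher('visualization_marker_array', MarkerArray, queue_size=10)
# pub.publish(voxels_to_marker_array(xyz, labels, 40, [-6, -6, -2], [6, 6, 2], [240, 240, 80], "Global_Semantic_Map"))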