# NOTE(review): removed non-Python scrape residue (GitHub UI text and a
# line-number gutter) that preceded the module and made the file unrunnable.
"""
Saves metadata for all WSIs in a dataset.
For DHMC dataset, the metadata is saved in a CSV file. The metadata includes `mpp` and `power` values for each slide.
"""
# built-in imports
import argparse
import os
import sys
import glob
import importlib
import json
from pprint import pprint
import concurrent.futures
from pathlib import Path
# standard imports
import numpy as np
import pandas as pd
# get tiatoolbox>=1.6
import tiatoolbox

# Require tiatoolbox >= 1.6. Compare numeric (major, minor) tuples:
# a plain string comparison ("1.10" >= "1.6") is lexicographic and would
# wrongly reject versions such as 1.10.
_tia_version = tuple(int(part) for part in tiatoolbox.__version__.split(".")[:2])
assert _tia_version >= (1, 6), (
    f"tiatoolbox version {tiatoolbox.__version__} is installed, but version >= 1.6 is required."
)
# tiatoolbox imports
from tiatoolbox.wsicore.wsireader import WSIReader

# DHMC slides carry no embedded resolution info; mpp/power come from this CSV.
DHMC_metadata_path = "/well/rittscher-dart/shared/datasets/lung/DHMC/MetaData_Release_1.0.csv"
class NumpyPosixEncoder(json.JSONEncoder):
    """JSON encoder that serializes numpy scalars/arrays and pathlib paths.

    numpy integers/floats become Python ints/floats, ndarrays become
    (nested) lists, and Path objects become their string form; anything
    else falls back to the base encoder (which raises TypeError).
    """

    # (type, converter) pairs checked in order for unsupported objects.
    _CONVERTERS = (
        (np.integer, int),
        (np.floating, float),
        (np.ndarray, lambda arr: arr.tolist()),
        (Path, str),
    )

    def default(self, obj):
        for kind, convert in self._CONVERTERS:
            if isinstance(obj, kind):
                return convert(obj)
        return super().default(obj)
def process_slide(slide_path, dataset, metadata_df, verbose):
    """Open one whole-slide image and collect its metadata.

    Parameters
    ----------
    slide_path : str
        Path to the slide file.
    dataset : str
        Dataset name; for datasets starting with "DHMC", resolution is
        looked up in ``metadata_df`` instead of the slide file itself.
    metadata_df : pandas.DataFrame or None
        DHMC metadata indexed by file name (``None`` for other datasets).
    verbose : bool
        If True, pretty-print the slide info dict.

    Returns
    -------
    tuple
        ``(slide_id, metadata_dict, error)`` where exactly one of
        ``metadata_dict`` / ``error`` is None.
    """
    path = Path(slide_path)  # construct once; reused for name and stem
    slide_id = path.stem
    if dataset.startswith("DHMC"):
        # DHMC slides do not embed mpp/power; take them from the CSV row.
        csv_row = metadata_df.loc[path.name]
        mpp = csv_row["Microns Per Pixel"]
        power = csv_row["Magnification"]
    else:
        mpp = None
        power = None
    # open the slide; report any reader failure back to the caller
    try:
        reader = WSIReader.open(slide_path, mpp=mpp, power=power)
    except Exception as e:
        return slide_id, None, str(e)
    if reader is None:
        return slide_id, None, "reader is None"
    info_dict = reader.info.as_dict()
    if verbose:
        pprint(info_dict)  # reuse the dict instead of recomputing as_dict()
    # Stain is encoded in the file name; reader info keys are merged on top
    # (same precedence as the original loop, which wrote them last).
    slide_metadata = {"stain": "EVG" if "_EVG" in slide_path else "H&E", **info_dict}
    return slide_id, slide_metadata, None
# python a_save_slide_metadata.py --dataset ouh_batch1_20x --slide_format ndpi
# python a_save_slide_metadata.py --dataset ouh_batch1_40x --slide_format tif
# python a_save_slide_metadata.py --dataset ouh_batch2_20x --slide_format tif
# python a_save_slide_metadata.py --dataset ouh_batch2_40x --slide_format tif
# python a_save_slide_metadata.py --dataset ouh_batch3_40x --slide_format tif
# python a_save_slide_metadata.py --dataset DHMC_20x --slide_format tif
# python a_save_slide_metadata.py --dataset DHMC_40x --slide_format tif
# python a_save_slide_metadata.py --dataset TCGA-lung --slide_format svs
# python a_save_slide_metadata.py --dataset TCIA_CPTAC_test --slide_format svs
# python a_save_slide_metadata.py --dataset TCIA-CPTAC --slide_format svs
# python a_save_slide_metadata.py --dataset DART_001 --slide_format tif
# python a_save_slide_metadata.py --dataset DART_002 --slide_format tif
# python a_save_slide_metadata.py --dataset DART_003 --slide_format tif
# python a_save_slide_metadata.py --dataset DART_004 --slide_format tif
# python a_save_slide_metadata.py --dataset CAMELYON16 --slide_format tif
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Saving metadata for all WSIs in a dataset")
    parser.add_argument("--dataset", type=str, default="TCIA-CPTAC_test", help="Dataset name")
    parser.add_argument("--slide_format", type=str, default="svs", help="Image format for tiles.", choices=["svs", "tif", "tiff", "ndpi"])
    parser.add_argument("--save_dir", default="wsi_metadata/", type=str, help="Directory to save metadata")
    # compute arguments
    parser.add_argument("--num_workers", type=int, default=8, help="Number of worker processes to use")
    # flags
    parser.add_argument("--verbose", action="store_true", help="Print verbose output", default=False)
    parser.add_argument("--overwrite", action="store_true", help="Overwrite existing metadata file", default=False)
    # all arguments
    args = parser.parse_args()
    # Slides are expected under WSI/<dataset>/ at one or two directory levels.
    path_base = os.path.join("WSI", args.dataset)
    dataset_metadata_save_path = os.path.join(args.save_dir, f"{args.dataset}.json")
    # BUGFIX: the save directory was never created, so the json.dump at the
    # end failed on a fresh checkout.
    os.makedirs(args.save_dir, exist_ok=True)
    if os.path.exists(dataset_metadata_save_path) and not args.overwrite:
        # BUGFIX: old message said "Skipping." although this raise aborts the run.
        raise FileExistsError(
            f"Metadata for {args.dataset} already exists at {dataset_metadata_save_path}; "
            "pass --overwrite to regenerate."
        )
    all_slide_paths = sorted(
        glob.glob(os.path.join(path_base, "*/*." + args.slide_format))
        + glob.glob(os.path.join(path_base, "*/*/*." + args.slide_format))
    )
    if args.dataset.startswith("DHMC"):
        # DHMC resolution metadata lives in an external CSV keyed by file name.
        metadata_df = pd.read_csv(DHMC_metadata_path, index_col="File Name")
    else:
        metadata_df = None
    total_failed = 0
    slide_2_metadata = {}
    slide_2_failure_details = {}
    # Fan slides out to worker processes and collect results as they complete.
    with concurrent.futures.ProcessPoolExecutor(max_workers=args.num_workers) as executor:
        futures = [
            executor.submit(process_slide, slide_path, args.dataset, metadata_df, args.verbose)
            for slide_path in all_slide_paths
        ]
        for future in concurrent.futures.as_completed(futures):
            slide_id, slide_metadata, error = future.result()
            if error:
                total_failed += 1
                slide_2_failure_details[slide_id] = error
            else:
                assert slide_id not in slide_2_metadata, f"Duplicate slide: {slide_id}"
                slide_2_metadata[slide_id] = slide_metadata
    print(f"\nDONE!!! Extracting metadata done for {len(all_slide_paths)-total_failed} / {len(all_slide_paths)} slides of {args.dataset} dataset.")
    # save metadata to a JSON file
    with open(dataset_metadata_save_path, "w") as f:
        json.dump(slide_2_metadata, f, cls=NumpyPosixEncoder, indent=2)
    # save details of failed slides to a file
    with open(os.path.join(args.save_dir, f"{args.dataset}_failed_slides.json"), "w") as f:
        json.dump(slide_2_failure_details, f, indent=2)