diff --git a/README.md b/README.md
index 51df1c1..64508ba 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,14 @@
-# inm-icf-utilities
+# Utilities for managing the INM-ICF DICOM data store at Research Center Jülich
 
 [![Documentation Status](https://readthedocs.org/projects/inm-icf-utilities/badge/?version=latest)](https://inm-icf-utilities.readthedocs.io/en/latest/?badge=latest)
-
 [![Build status](https://ci.appveyor.com/api/projects/status/jaife669slqyru52/branch/main?svg=true)](https://ci.appveyor.com/project/mih/inm-icf-utilities/branch/main)
+
+
+## Acknowledgements
+
+This software was developed with support from the German Federal Ministry of
+Education and Research (BMBF 01GQ1905), the US National Science Foundation (NSF
+1912266), the Helmholtz research center Jülich (RDM challenge 2022), and the
+Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) under grant
+SFB 1451 ([431549029](https://gepris.dfg.de/gepris/projekt/431549029), INF
+project).
diff --git a/bin/catalogify_studyvisit_from_meta b/bin/catalogify_studyvisit_from_meta
index 4e07101..5a33587 100755
--- a/bin/catalogify_studyvisit_from_meta
+++ b/bin/catalogify_studyvisit_from_meta
@@ -12,7 +12,6 @@ import sys
 import tempfile
 from uuid import uuid4
 
-import datalad.api as dl
 from datalad_catalog.catalog import Catalog
 from datalad_catalog.webcatalog import WebCatalog
 
@@ -95,7 +94,7 @@ def get_catalog(study_id, catalog_path):
     # 3. set catalog home page
     ctlg.main_id = study_entry.get('dataset_id')
     ctlg.main_version = study_entry.get('dataset_version')
-    ctlg.set_main_dataset() 
+    ctlg.set_main_dataset()
     return ctlg
 
 
@@ -109,7 +108,7 @@ def generate_study_entry(study_id):
         ds_version='latest',
         ds_name=study_id,
         ds_description=desc)
-    
+
 
 def update_entry(ds_id, ds_version, ds_name, key, value, study_catalog_path):
     meta_item = {
@@ -247,7 +246,6 @@ def format_bytes(bytes, decimals=2):
     return f"{round(bytes / math.pow(k, i), dm)} {sizes[i]}"
 
 
-
 if __name__ == '__main__':
     import argparse
     p = argparse.ArgumentParser(description=__doc__)
diff --git a/bin/dataladify_studyvisit_from_meta b/bin/deposit_visit_dataset
similarity index 73%
rename from bin/dataladify_studyvisit_from_meta
rename to bin/deposit_visit_dataset
index 6249169..a70ba38 100755
--- a/bin/dataladify_studyvisit_from_meta
+++ b/bin/deposit_visit_dataset
@@ -1,20 +1,27 @@
 #!/usr/bin/env python3
 """
-
+This command reads the metadata deposit from `deposit_visit_metadata` for a
+visit in a study (given by their respective identifiers) from the data store,
+and generates a DataLad dataset from it. This DataLad dataset provides
+versioned access to the visit's DICOM data, up to single-image granularity.
+Moreover, all DICOM files are annotated with basic DICOM tags that enable
+on-demand dataset views for particular applications (e.g., DICOMs sorted
+by image series and protocol name). The DataLad dataset is deposited in
+two files in the study directory:
+
+- `{visit_id}_XDLRA--refs`
+- `{visit_id}_XDLRA--repo-export`
+
+where the former enables `datalad/git clone` operations, and the latter
+represents the actual dataset as a compressed archive.
 """
 import json
 import os
 from pathlib import Path
-import sys
 import tempfile
 
 import datalad.api as dl
 
-# this points to the top of the ICF data store.
-# internally it will be amended with the missing components
-# for study and visit deposit locations
-icfstore_baseurl = 'https://data.inm-icf.de'
-
 # which DICOM tags to extract from DICOM files and store as
 # git-annex metadata (e.g., to enable metadata-driven views
 # of visit datasets)
@@ -28,9 +35,12 @@ dicom_metadata_keys = [
 ]
 
 
-def main(store_dir: str,
-         study_id: str,
-         visit_id: str):
+def main(
+    store_dir: str,
+    store_url: str,
+    study_id: str,
+    visit_id: str,
+):
     store_base_dir = Path(store_dir)
     # where to deposit the final datalad dataset
     repo_base_path = store_base_dir / study_id / f'{visit_id}_'
@@ -48,20 +58,27 @@ def main(store_dir: str,
         f'{visit_id}_metadata_dicoms.json'
 
     with tempfile.TemporaryDirectory(prefix='dataladify_visit_') as wdir:
-        runshit(
+        deposit_dataset(
            # workdir
            wdir,
            # path to deposited dataset metadata
            dataset_metadata_path.absolute(),
            # path to deposited file metadata
            file_metadata_path.absolute(),
+           # base URL of the store to complete access URLs
+           store_url,
            # path to deposit the repo at
            repo_base_path.absolute(),
         )
 
 
-def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
-
+def deposit_dataset(
+    wdir: Path,
+    metapath_dataset: Path,
+    metapath_files: Path,
+    store_url: str,
+    repobasepath: Path,
+):
     # read tar metadata dict
     tar_metadata = read_json_file(metapath_dataset)
     expected_keys = ('size', 'md5', 'dspath', 'storepath')
@@ -88,7 +105,7 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
     uncurl_uuid = repo.call_annex_records(['info', 'uncurl'])[0]['uuid']
     assert uncurl_uuid
     # register the URL of the tarball
-    tar_metadata['url'] = f"{icfstore_baseurl}/{tar_metadata['storepath']}"
+    tar_metadata['url'] = f"{store_url}/{tar_metadata['storepath']}"
     res = ds.addurls(
         [tar_metadata],
         '{url}',
@@ -98,9 +115,11 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
     # fish out annex key of tarball.
     # we could also construct that, but let's not duplicate the setup above
     tarpath = Path(tar_metadata.get('dspath'))
-    tarkey = [r.get('annexkey') for r in res
-              if r.get('action') == 'fromkey'
-              and r.get('path', '').endswith(tarpath.name)]
+    tarkey = [
+        r.get('annexkey') for r in res
+        if r.get('action') == 'fromkey'
+        and r.get('path', '').endswith(tarpath.name)
+    ]
     assert len(tarkey) == 1
     tarkey = tarkey[0]
     assert tarkey
@@ -123,7 +142,7 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
     assert archivist_uuid
 
     # load dicom metadata
-    dicoms = read_json_file(metapath_file)
+    dicoms = read_json_file(metapath_files)
     # add to dataset
     dicom_recs = ds.addurls(
         dicoms,
@@ -146,7 +165,10 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
         repo.call_annex(['setpresentkey', dicomkey, archivist_uuid, '1'])
 
     repo.call_git([
-        'remote', 'add', 'icfstore',
+        'remote', 'add',
+        # the remote name is arbitrary, it will not end up in the resulting
+        # deposit
+        'store',
         # this is a little twisted:
         # the first line is an f-string, because we need to get the base URL
         # pointing to the study directory into the remote URL
@@ -163,7 +185,7 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
     # to be able to actually push everything
     repo.call_annex(['whereis', '--key', dicomkeys[0]])
     ds.push(
-        to='icfstore',
+        to='store',
         # under no circumstances do we want to push annexed content.
         # and there also should be none
         data='nothing',
@@ -174,31 +196,36 @@ def read_json_file(file_path):
     """
     Load content from catalog metadata file for current node
     """
-    try:
-        with open(file_path) as f:
-            return json.load(f)
-    except OSError as err:
-        raise("OS error: {0}".format(err))
-    except:
-        raise("Unexpected error:", sys.exc_info()[0])
+    with open(file_path) as f:
+        return json.load(f)
 
 
 if __name__ == '__main__':
     import argparse
-    p = argparse.ArgumentParser(description=__doc__)
+    p = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    p.add_argument(
+        '--id', nargs=2, metavar=('STUDY-ID', 'VISIT-ID'), required=True,
+        help="study and visit identifiers, used to "
+        "locate the visit archive in the storage organization. "
+    )
     p.add_argument(
         "-o", "--store-dir", metavar='PATH', default=os.getcwd(),
-        help="Root directory of the ICF data store. "
+        help="root directory of the data store. "
        "Visit data will be read from it, and the DataLad dataset will be "
        "deposited into it."
     )
     p.add_argument(
-        '--id', nargs=2, metavar=('STUDY-ID', 'VISIT-ID'), required=True,
-        help="The study and visit identifiers, used to "
-        "locate the visit archive in the storage organization. "
+        '--store-url', metavar='URL', default='https://data.inm-icf.de',
+        help="base URL of the DICOM data store. This URL is used to "
+        "register TAR archive download URLs in the generated DataLad "
+        "dataset."
     )
     args = p.parse_args()
     main(store_dir=args.store_dir,
+         store_url=args.store_url,
         study_id=args.id[0],
         visit_id=args.id[1],
     )
diff --git a/bin/getmeta_studyvisit b/bin/deposit_visit_metadata
similarity index 76%
rename from bin/getmeta_studyvisit
rename to bin/deposit_visit_metadata
index 80b8e96..6371b26 100755
--- a/bin/getmeta_studyvisit
+++ b/bin/deposit_visit_metadata
@@ -1,6 +1,25 @@
 #!/usr/bin/env python3
 """
+This command locates the DICOM tarball for a particular visit in a study (given
+by their respective identifiers) in the data store, and extracts a minimal set
+of metadata tags for each DICOM image, and the TAR archive as a whole. These
+metadata are then deposited in two files, in JSON format, in the study
+directory:
 
+- `{visit_id}_metadata_tarball.json`
+
+  JSON object with basic properties of the archive, such as 'size', and
+  'md5'.
+
+- `{visit_id}_metadata_dicoms.json`
+
+  JSON array with essential properties for each DICOM image file, such as
+  'path' (relative path inside the TAR archive), 'md5' (MD5 checksum of
+  the DICOM file), 'size' (in bytes), and select standard DICOM tags,
+  such as "SeriesDescription", "SeriesNumber", "Modality",
+  "MRAcquisitionType", "ProtocolName", "PulseSequenceName". The latter
+  enable a rough, technical characterization of the images in the TAR
+  archive.
 """
 import logging
 import os
@@ -17,11 +36,6 @@ from datalad.utils import md5sum
 
 lgr = logging.getLogger('inm-icf-utilities')
 
-# this points to the top of the ICF data store.
-# internally it will be amended with the missing components
-# for study and visit deposit locations
-icfstore_baseurl = 'https://data.inm-icf.de'
-
 # which DICOM tags to extract from DICOM files and store as
 # git-annex metadata (e.g., to enable metadata-driven views
 # of visit datasets)
@@ -58,7 +72,7 @@ def main(store_dir: str,
     if not tar_path.exists():
         raise ValueError(f'no tarball at {tar_path}')
 
-    runshit(
+    describe_tarball(
         # source visit tarball
         tar_path.resolve(),
         # source visit tarball URL, relative to store
@@ -70,7 +84,7 @@ def main(store_dir: str,
     )
 
 
-def runshit(tarpath, tarurl, metapath_dataset, metapath_file):
+def describe_tarball(tarpath, tarurl, metapath_dataset, metapath_file):
     # construct and dump dataset metadata
     tar_meta = {
         'size': tarpath.stat().st_size,
@@ -115,10 +129,13 @@
 
 if __name__ == '__main__':
     import argparse
-    p = argparse.ArgumentParser(description=__doc__)
+    p = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
     p.add_argument(
         "-o", "--store-dir", metavar='PATH', default=os.getcwd(),
-        help="Root directory of the ICF data store. "
+        help="Root directory of the data store. "
        "Visit data will be read from it, and extracted metadata will be "
        "deposited into it."
     )
diff --git a/docs/source/admin.rst b/docs/source/admin.rst
new file mode 100644
index 0000000..ad2605d
--- /dev/null
+++ b/docs/source/admin.rst
@@ -0,0 +1,152 @@
+Administrator docs
+==================
+
+The INM-ICF Utilities `Github repository`_ provides a set of
+executable Python scripts which automate generation of deposits in the
+ICF archive. To simplify deployment, these scripts and all their
+dependencies are packaged as a `Singularity`_ v3 container
+(`download`_).
+
+.. _github repository: https://github.com/psychoinformatics-de/inm-icf-utilities
+.. _singularity: https://docs.sylabs.io/guides/main/user-guide/
+.. _download: https://ci.appveyor.com/api/projects/mih/inm-icf-utilities/artifacts/icf.sif
+
+Archive generation
+------------------
+
+Containerized execution
+^^^^^^^^^^^^^^^^^^^^^^^
+
+With the Singularity image, ``icf.sif``, all scripts are made directly
+available, either through ``singularity run``:
+
+.. code-block:: console
+
+   $ singularity run icf.sif