Merge branch 'main' into enh-ci-https
christian-monch committed May 31, 2023
2 parents dd5775b + 30960e4 commit ff65671
Showing 10 changed files with 329 additions and 66 deletions.
13 changes: 11 additions & 2 deletions README.md
@@ -1,5 +1,14 @@
# inm-icf-utilities
# Utilities for managing the INM-ICF DICOM data store at Research Center Jülich

[![Documentation Status](https://readthedocs.org/projects/inm-icf-utilities/badge/?version=latest)](https://inm-icf-utilities.readthedocs.io/en/latest/?badge=latest)

[![Build status](https://ci.appveyor.com/api/projects/status/jaife669slqyru52/branch/main?svg=true)](https://ci.appveyor.com/project/mih/inm-icf-utilities/branch/main)


## Acknowledgements

This software was developed with support from the German Federal Ministry of
Education and Research (BMBF 01GQ1905), the US National Science Foundation (NSF
1912266), the Helmholtz research center Jülich (RDM challenge 2022), and the
Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) under grant
SFB 1451 ([431549029](https://gepris.dfg.de/gepris/projekt/431549029), INF
project).
6 changes: 2 additions & 4 deletions bin/catalogify_studyvisit_from_meta
@@ -12,7 +12,6 @@ import sys
import tempfile
from uuid import uuid4

import datalad.api as dl
from datalad_catalog.catalog import Catalog
from datalad_catalog.webcatalog import WebCatalog

@@ -95,7 +94,7 @@ def get_catalog(study_id, catalog_path):
# 3. set catalog home page
ctlg.main_id = study_entry.get('dataset_id')
ctlg.main_version = study_entry.get('dataset_version')
ctlg.set_main_dataset()
ctlg.set_main_dataset()
return ctlg


@@ -109,7 +108,7 @@ def generate_study_entry(study_id):
ds_version='latest',
ds_name=study_id,
ds_description=desc)


def update_entry(ds_id, ds_version, ds_name, key, value, study_catalog_path):
meta_item = {
@@ -247,7 +246,6 @@ def format_bytes(bytes, decimals=2):
return f"{round(bytes / math.pow(k, i), dm)} {sizes[i]}"



if __name__ == '__main__':
import argparse
p = argparse.ArgumentParser(description=__doc__)
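Only the final return statement of `format_bytes` is visible in this hunk. For orientation, a hypothetical implementation consistent with that line (base-1024 units, rounded to `decimals` places) could look like this:

```python
import math

def format_bytes(bytes, decimals=2):
    """Render a byte count as a human-readable string (hypothetical sketch)."""
    if bytes == 0:
        return "0 Bytes"
    k = 1024
    dm = max(decimals, 0)
    sizes = ["Bytes", "KB", "MB", "GB", "TB", "PB"]
    i = int(math.floor(math.log(bytes, k)))
    return f"{round(bytes / math.pow(k, i), dm)} {sizes[i]}"
```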
91 changes: 59 additions & 32 deletions bin/dataladify_studyvisit_from_meta → bin/deposit_visit_dataset
@@ -1,20 +1,27 @@
#!/usr/bin/env python3
"""
This command reads the metadata deposit from `deposit_visit_metadata` for a
visit in a study (given by their respective identifiers) from the data store,
and generates a DataLad dataset from it. This DataLad dataset provides
versioned access to the visit's DICOM data, up to single-image granularity.
Moreover, all DICOM files are annotated with basic DICOM tags that enable
on-demand dataset views for particular applications (e.g., DICOMs sorted
by image series and protocol name). The DataLad dataset is deposited in
two files in the study directory:
- `{visit_id}_XDLRA--refs`
- `{visit_id}_XDLRA--repo-export`
where the former enables `datalad/git clone` operations, and the latter
represents the actual dataset as a compressed archive.
"""
import json
import os
from pathlib import Path
import sys
import tempfile

import datalad.api as dl

# this points to the top of the ICF data store.
# internally it will be amended with the missing components
# for study and visit deposit locations
icfstore_baseurl = 'https://data.inm-icf.de'

# which DICOM tags to extract from DICOM files and store as
# git-annex metadata (e.g., to enable metadata-driven views
# of visit datasets)
@@ -28,9 +35,12 @@ dicom_metadata_keys = [
]


def main(store_dir: str,
study_id: str,
visit_id: str):
def main(
store_dir: str,
store_url: str,
study_id: str,
visit_id: str,
):
store_base_dir = Path(store_dir)
# where to deposit the final datalad dataset
repo_base_path = store_base_dir / study_id / f'{visit_id}_'
@@ -48,20 +58,27 @@ def main(store_dir: str,
f'{visit_id}_metadata_dicoms.json'

with tempfile.TemporaryDirectory(prefix='dataladify_visit_') as wdir:
runshit(
deposit_dataset(
# workdir
wdir,
# path to deposited dataset metadata
dataset_metadata_path.absolute(),
# path to deposited file metadata
file_metadata_path.absolute(),
# base URL of the store to complete access URLs
store_url,
# path to deposit the repo at
repo_base_path.absolute(),
)


def runshit(wdir, metapath_dataset, metapath_file, repobasepath):

def deposit_dataset(
wdir: Path,
metapath_dataset: Path,
metapath_files: Path,
store_url: str,
repobasepath: Path,
):
# read tar metadata dict
tar_metadata = read_json_file(metapath_dataset)
expected_keys = ('size', 'md5', 'dspath', 'storepath')
@@ -88,7 +105,7 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
uncurl_uuid = repo.call_annex_records(['info', 'uncurl'])[0]['uuid']
assert uncurl_uuid
# register the URL of the tarball
tar_metadata['url'] = f"{icfstore_baseurl}/{tar_metadata['storepath']}"
tar_metadata['url'] = f"{store_url}/{tar_metadata['storepath']}"
res = ds.addurls(
[tar_metadata],
'{url}',
@@ -98,9 +115,11 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
# fish out annex key of tarball.
# we could also construct that, but let's not duplicate the setup above
tarpath = Path(tar_metadata.get('dspath'))
tarkey = [r.get('annexkey') for r in res
if r.get('action') == 'fromkey'
and r.get('path', '').endswith(tarpath.name)]
tarkey = [
r.get('annexkey') for r in res
if r.get('action') == 'fromkey'
and r.get('path', '').endswith(tarpath.name)
]
assert len(tarkey) == 1
tarkey = tarkey[0]
assert tarkey
@@ -123,7 +142,7 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
assert archivist_uuid

# load dicom metadata
dicoms = read_json_file(metapath_file)
dicoms = read_json_file(metapath_files)
# add to dataset
dicom_recs = ds.addurls(
dicoms,
@@ -146,7 +165,10 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
repo.call_annex(['setpresentkey', dicomkey, archivist_uuid, '1'])

repo.call_git([
'remote', 'add', 'icfstore',
'remote', 'add',
# the remote name is arbitrary, it will not end up in the resulting
# deposit
'store',
# this is a little twisted:
# the first line is an f-string, because we need to get the base URL
# pointing to the study directory into the remote URL
@@ -163,7 +185,7 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
# to be able to actually push everything
repo.call_annex(['whereis', '--key', dicomkeys[0]])
ds.push(
to='icfstore',
to='store',
# under no circumstances do we want to push annexed content.
# and there also should be none
data='nothing',
@@ -174,31 +196,36 @@ def read_json_file(file_path):
"""
Load content from catalog metadata file for current node
"""
try:
with open(file_path) as f:
return json.load(f)
except OSError as err:
raise("OS error: {0}".format(err))
except:
raise("Unexpected error:", sys.exc_info()[0])
with open(file_path) as f:
return json.load(f)


if __name__ == '__main__':
import argparse
p = argparse.ArgumentParser(description=__doc__)
p = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
p.add_argument(
'--id', nargs=2, metavar=('STUDY-ID', 'VISIT-ID'), required=True,
help="study and visit identifiers, used to "
"locate the visit archive in the storage organization. "
)
p.add_argument(
"-o", "--store-dir", metavar='PATH', default=os.getcwd(),
help="Root directory of the ICF data store. "
help="root directory of the data store. "
"Visit data will be read from it, and the DataLad dataset will be "
"deposited into it."
)
p.add_argument(
'--id', nargs=2, metavar=('STUDY-ID', 'VISIT-ID'), required=True,
help="The study and visit identifiers, used to "
"locate the visit archive in the storage organization. "
'--store-url', metavar='URL', default='https://data.inm-icf.de',
help="base URL of the DICOM data store. This URL is used to "
"register TAR archive download URLs in the generated DataLad "
"dataset."
)
args = p.parse_args()
main(store_dir=args.store_dir,
store_url=args.store_url,
study_id=args.id[0],
visit_id=args.id[1],
)
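For reference, the `{visit_id}_metadata_tarball.json` deposit consumed by this script is a small JSON object. A minimal sketch of its content, based on the keys the code checks (`size`, `md5`, `dspath`, `storepath`) and the URL-completion step above; all concrete values are made up:

```python
# illustrative (made-up) content of {visit_id}_metadata_tarball.json;
# the script asserts the presence of exactly these four keys
tar_metadata = {
    "size": 1536723968,                          # TAR archive size in bytes
    "md5": "3f2a9b0c7d1e5f4a8b6c0d9e2f1a3b4c",   # checksum of the archive
    "dspath": "visit01_dicom.tar",               # path of the archive within the dataset
    "storepath": "study-a/visit01_dicom.tar",    # path relative to the store base URL
}

# deposit_visit_dataset completes the download URL from --store-url
store_url = "https://data.inm-icf.de"
tar_metadata["url"] = f"{store_url}/{tar_metadata['storepath']}"
```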
35 changes: 26 additions & 9 deletions bin/getmeta_studyvisit → bin/deposit_visit_metadata
@@ -1,6 +1,25 @@
#!/usr/bin/env python3
"""
This command locates the DICOM tarball for a particular visit in a study (given
by their respective identifiers) in the data store, and extracts a minimal set
of metadata tags for each DICOM image, and the TAR archive as a whole. These
metadata are then deposited in two files, in JSON format, in the study
directory:
- `{visit_id}_metadata_tarball.json`
JSON object with basic properties of the archive, such as 'size', and
'md5'.
- `{visit_id}_metadata_dicoms.json`
JSON array with essential properties for each DICOM image file, such as
'path' (relative path inside the TAR archive), 'md5' (MD5 checksum of
the DICOM file), 'size' (in bytes), and select standard DICOM tags,
such as "SeriesDescription", "SeriesNumber", "Modality",
"MRAcquisitionType", "ProtocolName", "PulseSequenceName". The latter
enable a rough, technical characterization of the images in the TAR
archive.
"""
import logging
import os
@@ -17,11 +36,6 @@ from datalad.utils import md5sum

lgr = logging.getLogger('inm-icf-utilities')

# this points to the top of the ICF data store.
# internally it will be amended with the missing components
# for study and visit deposit locations
icfstore_baseurl = 'https://data.inm-icf.de'

# which DICOM tags to extract from DICOM files and store as
# git-annex metadata (e.g., to enable metadata-driven views
# of visit datasets)
@@ -58,7 +72,7 @@ def main(store_dir: str,
if not tar_path.exists():
raise ValueError(f'no tarball at {tar_path}')

runshit(
describe_tarball(
# source visit tarball
tar_path.resolve(),
# source visit tarball URL, relative to store
@@ -70,7 +84,7 @@ def runshit(tarpath, tarurl, metapath_dataset, metapath_file):
)


def runshit(tarpath, tarurl, metapath_dataset, metapath_file):
def describe_tarball(tarpath, tarurl, metapath_dataset, metapath_file):
# construct and dump dataset metadata
tar_meta = {
'size': tarpath.stat().st_size,
@@ -115,10 +129,13 @@ def runshit(tarpath, tarurl, metapath_dataset, metapath_file):

if __name__ == '__main__':
import argparse
p = argparse.ArgumentParser(description=__doc__)
p = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
p.add_argument(
"-o", "--store-dir", metavar='PATH', default=os.getcwd(),
help="Root directory of the ICF data store. "
help="Root directory of the data store. "
"Visit data will be read from it, and extracted metadata will be "
"deposited into it."
)
