diff --git a/references.bib b/references.bib index f05565f..312dee3 100644 --- a/references.bib +++ b/references.bib @@ -1,3 +1,77 @@ +@software{zarr, + author = {Alistair Miles and + jakirkham and + M Bussonnier and + Josh Moore and + {Papadopoulos Orfanos}, Dimitri and + Davis Bennett and + David Stansby and + Joe Hamman and + James Bourbeau and + Andrew Fulton and + Gregory Lee and + Ryan Abernathey and + Norman Rzepka and + Zain Patel and + Mads R. B. Kristensen and + Sanket Verma and + Saransh Chopra and + Matthew Rocklin and + AWA BRANDON AWA and + Max Jones and + Martin Durant and + {Sales de Andrade}, Elliott and + Vincent Schut and + Dussin, Raphael and + Shivank Chaudhary and + Chris Barnes and + Juan Nunez-Iglesias and + shikharsg}, + title = {zarr-developers/zarr-python: v3.0.0-alpha}, + month = jun, + year = 2024, + publisher = {Zenodo}, + version = {v3.0.0-alpha}, + doi = {10.5281/zenodo.11592827}, + url = {https://doi.org/10.5281/zenodo.11592827} +} + +@inproceedings{Norman2021CloudBank, +author = {Norman, Michael and Kellen, Vince and Smallen, Shava and DeMeulle, Brian and Strande, Shawn and Lazowska, Ed and Alterman, Naomi and Fatland, Rob and Stone, Sarah and Tan, Amanda and Yelick, Katherine and Van Dusen, Eric and Mitchell, James}, +title = {{CloudBank}: Managed Services to Simplify Cloud Access for Computer Science Research and Education}, +year = {2021}, +isbn = {9781450382922}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/3437359.3465586}, +doi = {10.1145/3437359.3465586}, +abstract = {CloudBank is a cloud access entity founded to enable the computer science research and education communities to harness the profound computational potential of public clouds. By delivering a set of managed services designed to alleviate common points of friction associated with cloud adoption, Cloudbank serves as an integrated service provider to the research and education community. 
These services include front-line help desk support, cloud solution consulting, training, account management, cost monitoring and optimization support, and automated billing. CloudBank has a multi-cloud pay-per-use billing model and aims to serve the spectrum of cloud users from novice to advanced.}, +booktitle = {Practice and Experience in Advanced Research Computing}, +articleno = {45}, +numpages = {4}, +keywords = {Cloud Computing}, +venue = {Boston, MA, USA}, +series = {PEARC '21} +} + +@article{Connolly2023Software, + author = {Connolly, Andrew and Hellerstein, Joseph and Alterman, Naomi and Beck, David and Fatland, Rob and Lazowska, Ed and Mandava, Vani and Stone, Sarah}, + journal = {Harvard Data Science Review}, + number = {2}, + year = {2023}, + month = apr, + url = {https://hdsr.mitpress.mit.edu/pub/f0f7h5cu}, + title = {{Software} {Engineering} {Practices} in {Academia}: Promoting the {3Rs}---{Readability}, {Resilience}, and {Reuse}}, + volume = {5}, +} + +@article{pestilli2021community, + title={A community-driven development of the {Brain} {Imaging} {Data} {Standard} ({BIDS}) to describe macroscopic brain connections}, + author={Pestilli, Franco and Poldrack, Russ and Rokem, Ariel and Satterthwaite, Theodore and Feingold, Franklin and Duff, Eugene and Pernet, Cyril and Smith, Robert and Esteban, Oscar and Cieslak, Matt}, + journal={OSF}, + year={2021} +} + @MISC{Nosek2019CultureChange, title = "Strategy for Culture Change", author = "Nosek, Brian", diff --git a/sections/02-use-cases.qmd b/sections/02-use-cases.qmd index 79d55b2..92768f9 100644 --- a/sections/02-use-cases.qmd +++ b/sections/02-use-cases.qmd @@ -1,4 +1,4 @@ -# Use cases +# Use cases {#sec-use-cases} To understand how OSS development practices affect the development of data and metadata standards, it is informative to demonstrate this cross-fertilization diff --git a/sections/03-challenges.qmd b/sections/03-challenges.qmd index eca357b..2164c1e 100644 --- 
a/sections/03-challenges.qmd +++ b/sections/03-challenges.qmd @@ -1,9 +1,9 @@ -# Opportunities and risks for open-source standards {#sec-opportunities} +# Opportunities and risks for open-source standards {#sec-challenges} At the same time, these tools and practices are associated with risks that need to be mitigated. -## Flexibility vs. stability +## Flexibility vs. Stability One of the defining characteristics of OSS is its dynamism and its rapid evolution. Because OSS can be used by anyone and, in most cases, contributions @@ -59,27 +59,60 @@ standardization lacks formal avenues for success and recognition, for example th Data standardization investment is justified if the standard is generalizable beyond any specific science domain. However while the use cases are domain sciences based, data standardization is seen as a data infrastructure and not a -science investment. Moreover due to how science research funding works, -scientists lack incentives to work across domains, or work on infrastructure +science investment. Moreover, due to how science research funding works, +scientists lack incentives to work across domains or to work on infrastructure problems. ## Data instrumentation issues Data for scientific observations are often generated by proprietary -instrumentation due to commercialization or other profit driven incentives. -There islack of regulatory oversight to adhere to available standards or evolve -Significant data transformation is required to get data to a state that is -amenable to standards, if available. If not available, there is lack of +instrumentation due to commercialization or other profit-driven incentives. +There is a lack of regulatory oversight to adhere to available standards or +evolve them. Significant data transformation is required to get data to a state that +is amenable to standards, if available. If not available, there is a lack of incentive to set aside investment or resources to invest in establishing data standards. 
+### Harnessing new computing paradigms and technologies + +Open-source standards development faces the challenges of adapting to new +computing paradigms and technologies. Cloud computing provides a particularly +stark set of opportunities and challenges. On the one hand, cloud computing +offers practical solutions for many challenges of contemporary data-driven +research. For example, the scalability of cloud resources addresses some of the +challenges of the scale of data that is produced by instruments in many fields. +The cloud also makes data access relatively straightforward, because of the +ability to determine data access permissions in a granular fashion. On the +other hand, cloud computing requires reinstrumenting many data formats. This is +because cloud data access patterns are fundamentally different from the ones +that are used in local POSIX-style file systems. Suspicion of cloud computing +comes in two different flavors: the first by researchers and administrators who +may be wary of costs associated with cloud computing, and especially with the +difficulty of predicting these costs. Projects such as NSF's CloudBank seek to +mitigate some of these concerns by providing an additional layer of +transparency into cloud costs [@Norman2021CloudBank]. The other type of +objection relates to the fact that cloud computing services, by their very +nature, are closed ecosystems that resist portability and interoperability. +Some aspects of the services are always going to remain hidden and privy only +to the cloud computing service provider. In this respect, cloud computing runs +afoul of some of the appealing aspects of OSS. That said, the development of +"cloud native" standards can provide significant benefits in terms of the +research that can be conducted. 
For example, NOAA plans to use cloud computing +for integration across the multiple disparate datasets that it collects to +build knowledge graphs that can be queried by researchers to answer questions +that can only be answered through this integration. Putting all the data "in +one place" should help with that. Adaptation to the cloud in terms of data +standards has driven development of new file formats. A salient example is the +Zarr format [@zarr], which supports random access into array-based datasets +stored in cloud object storage, facilitating scalable and parallelized +computing on these data. Indeed, data standards such as NWB (neuroscience) and +OME (microscopy) now use Zarr as a backend for cloud-based storage. In other +cases, file formats that were once not straightforward to use in the cloud, +such as HDF5 and TIFF, have been adapted to cloud use (e.g., through the +cloud-optimized GeoTIFF format). + ## Sustainability ## The importance of automated validation -## Harnessing new computing paradigms and technologies - -Open-source standards development faces the challenges of adapting to new -technologies The development of standards that are well-Cloud computing -provides diff --git a/sections/05-recommendations.qmd b/sections/05-recommendations.qmd index a3441c2..18920db 100644 --- a/sections/05-recommendations.qmd +++ b/sections/05-recommendations.qmd @@ -1,9 +1,9 @@ -# Recommendations for open-source data and metadata standards +# Recommendations for open-source data and metadata standards {#sec-recommendations} In conclusion of this report, we propose the following recommendations: -## Funding or Grantmaking entities: +## Policy-making and Funding entities: ### Fund Data Standards Development @@ -15,17 +15,26 @@ encourage the development and adoption of standards, and fund associated community efforts and tools for this. 
The OSS model is seen as a particularly promising avenue for an investment of resources, because it builds on previously-developed procedures and technical infrastructure and because it -provides avenues for community input along the way. The clarity offered by -procedures for enhancement proposals and semantic versioning schemes adopted in -standards development offer avenues for a range of stakeholders to propose to -funding bodies well-defined contributions to large and field-wide standards -efforts. - -### Invest in Data Stewards Recognize data stewards as a distinct role in -research and science investment. Set up programs for training for data stewards -and invest in career paths that encourage this role. Initial proposals for the -curriculum and scope of the role have already been proposed (e.g., in -[@Mons2018DataStewardshipBook]) +provides avenues for democratization of development processes and for community +input along the way. The clarity offered by procedures for enhancement +proposals and semantic versioning schemes adopted in standards development +offers avenues for a range of stakeholders to propose to funding bodies +well-defined contributions to large and field-wide standards efforts (e.g., [@pestilli2021community]). + +### Invest in Data Stewards + +Advancing the development and adoption of open-source standards requires the +dissemination of knowledge to researchers in a variety of fields, but this +dissemination itself may not be enough without the fostering of specialized +expertise. Therefore, it is important to recognize *data stewards* as a +distinct role in research. To truly support experts whose role will be to +develop, maintain, and facilitate the adoption and use of open-source +standards, it will be necessary to set up programs for training for data +stewards and invest in career paths that encourage this role. 
Initial proposals 
Initial proposals +for the curriculum and scope of the role have already been proposed (e.g., in +[@Mons2018DataStewardshipBook]). In addition, in order for these individuals to be able to make the best use of open-source standards, it will be important for these individuals to be facile in the methodology of OSS. This does not mean that they need to become software engineers -- though there may be some overlap with the role of research software engineers [@Connolly2023Software] -- but rather that they +need to become familiar with those parts of the OSS development life-cycle that +are useful for development of open-source standards. ### Review Data Standards Pathways @@ -50,18 +59,22 @@ metadata and descriptions of how to use it. ### Program Manage Cross Sector alliances -Encourage cross sector and cross domain alliances that can impact successful standards creation. Invest in robust program management of these alliances to align pace and create incentives (for instance via Open Source Program Office / OSPO efforts). Similar to program officers at funding agencies, standards evolution need sustained PM efforts. Multi company partnerships should include strategic initiatives for standard establishment e.g. [Pistoiaalliance](https://www.pistoiaalliance.org/news/press-release-pistoia-alliance-launches-idmp-1-0/). - +Encourage cross-sector and cross-domain alliances that can impact successful +standards creation. Invest in robust program management of these alliances to +align pace and create incentives (for instance via Open Source Program Office / +OSPO efforts). Similar to program officers at funding agencies, standards +evolution need sustained PM efforts. Multi company partnerships should include +strategic initiatives for standard establishment e.g. +[Pistoiaalliance](https://www.pistoiaalliance.org/news/press-release-pistoia-alliance-launches-idmp-1-0/). 
### Curriculum Development Stakeholder organizations should invest in training grants to establish curriculum for data and metadata standards education. - ## Science and Technology Communities: -### User Driven Development +### User-Driven Development Standards should be needs-driven and developed in close collaboration with users. Changes and enhancements should be in response to community feedback.