huggingface · CloseChoice · Oct 28, 2025 · Oct 28, 2025 · Oct 28, 2025 · Oct 28, 2025
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
@@ -90,6 +90,8 @@
       title: Create a document dataset
     - local: nifti_dataset
       title: Create a medical imaging dataset
+    - local: dicom_dataset
+      title: Create a DICOM dataset
     title: "Vision"
   - sections:
     - local: nlp_load

diff --git a/docs/source/dicom_dataset.mdx b/docs/source/dicom_dataset.mdx
@@ -0,0 +1,121 @@
+# Create a DICOM dataset
+
+This page shows how to create and share a dataset of medical data in DICOM format (.dcm / .dicom) using the `datasets` library.
+
+You can share a dataset with your team or with anyone in the community by creating a dataset repository on the Hugging Face Hub. Once the dataset is uploaded, anyone with access can load it with:
+
+```py
+from datasets import load_dataset
+
+dataset = load_dataset("<username>/my_dicom_dataset")
+```
+
+There are two common ways to create a DICOM dataset:
+
+- Create a dataset from local DICOM files in Python and upload it with `Dataset.push_to_hub`.
+- Use a folder-based convention (one file per example) and a small helper to convert it into a `Dataset`.
+
+> [!TIP]
+> You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information.
+
+## Local files
+
+If you already have a list of file paths to DICOM files, the easiest workflow is to create a `Dataset` from that list and cast the column to the `Dicom` feature.
+
+```py
+from datasets import Dataset, Dicom
+
+# simple example: create a dataset from file paths
+files = ["/path/to/file_001.dcm", "/path/to/file_002.dcm"]
+ds = Dataset.from_dict({"dicom": files}).cast_column("dicom", Dicom())
+```
+
+Pydicom, the library used to handle DICOM files, can load DICOM files that are missing the File Meta Information header or the 'DICM' prefix when you pass `force=True`. The default is `force=False`, so enable it explicitly on the `Dicom` feature:
+
+```py
+from datasets import Dataset, Dicom
+
+ds = Dataset.from_dict({"dicom": ["/path/to/file_without_meta.dcm"]}).cast_column("dicom", Dicom(force=True))
+img = ds[0]["dicom"]
+arr = img.pixel_array
+```
+
+After preparing the dataset you can push it to the Hub:
+
+```py
+ds.push_to_hub("<username>/my_dicom_dataset")
+```
+
+This will create a dataset repository containing your DICOM dataset with a `data/` folder of parquet shards.
+
+## Folder conventions and metadata
+
+If you organize your dataset in folders you can create splits automatically (train/test/validation) by following a structure like:
+
+```
+dataset/train/scan_0001.dcm
+dataset/train/scan_0002.dcm
+dataset/validation/scan_1001.dcm
+dataset/test/scan_2001.dcm
+```
+
+If you have labels or other metadata, provide a `metadata.csv`, `metadata.jsonl`, or `metadata.parquet` file in the folder so that files can be linked to metadata rows. The metadata must contain a `file_name` (or `*_file_name`) field holding the path to each DICOM file, relative to the directory that contains the metadata file.
+
+Example `metadata.csv`:
+
+```csv
+file_name,patient_id,age,diagnosis
+scan_0001.dcm,P001,45,healthy
+scan_0002.dcm,P002,59,disease_x
+```
+
+## Converting to PyTorch tensors
+
+The numerical data (signals, images, or videos) is available as `dicom_object.pixel_array`. Note that not every DICOM file contains pixel data. You can convert the pixel data to PyTorch tensors on the fly using a dataset transform.
+
+Use the [`~Dataset.set_transform`] function to apply the transformation on-the-fly to batches of the dataset:
+
+```py
+import torch 
+import pydicom
+import numpy as np
+
+def transform_to_pytorch(example):
+    example["dicom_torch"] = [torch.tensor(ex.pixel_array) for ex in example["dicom"]]
+    return example
+
+ds.set_transform(transform_to_pytorch)
+```
+
+Accessing elements now (e.g. `ds[0]`) will yield torch tensors in the `"dicom_torch"` key.
+
+
+## Usage of Pydicom
+
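+The DICOM files are loaded using the [pydicom](https://pydicom.github.io/) library, so you can use all of pydicom's functionality to access metadata and pixel data. Note that `load_dataset` called without a `split` argument returns a `DatasetDict`; to iterate over examples as in the snippets below, pass `split="train"` or select a split first (e.g. `dicom_ds["train"]`).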
+
+```python
+from datasets import load_dataset
+dicom_ds = load_dataset("<username>/my_dicom_dataset")
+for dicom_img in dicom_ds:
+    dicom_object = dicom_img["dicom"]
+    print(dicom_object.PatientID)
+    print(dicom_object.StudyDate)
+    pixel_array = dicom_object.pixel_array
+    print(pixel_array.shape)
+```
+
+You can visualize the DICOM images using matplotlib as follows:
+
+```python
+import matplotlib.pyplot as plt
+from datasets import load_dataset
+dicom_ds = load_dataset("<username>/my_dicom_dataset")
+for dicom_img in dicom_ds:
+    dicom_object = dicom_img["dicom"]
+    plt.imshow(dicom_object.pixel_array, cmap=plt.cm.gray)
+    plt.show()
+```
+
+For further reading, see the [pydicom documentation](https://pydicom.github.io/pydicom/stable/) and [tutorials](https://pydicom.github.io/pydicom/stable/tutorials/index.html).
+---
diff --git a/docs/source/package_reference/loading_methods.mdx b/docs/source/package_reference/loading_methods.mdx
@@ -109,6 +109,12 @@ load_dataset("csv", data_dir="path/to/data/dir", sep="\t")
 
 [[autodoc]] datasets.packaged_modules.niftifolder.NiftiFolder
 
+### Dicom
+
+[[autodoc]] datasets.packaged_modules.dicomfolder.DicomFolderConfig
+
+[[autodoc]] datasets.packaged_modules.dicomfolder.DicomFolder
+
 ### WebDataset
 
 [[autodoc]] datasets.packaged_modules.webdataset.WebDataset
diff --git a/docs/source/package_reference/main_classes.mdx b/docs/source/package_reference/main_classes.mdx
@@ -275,6 +275,10 @@ Dictionary with split names as keys ('train', 'test' for example), and `Iterable
 
 [[autodoc]] datasets.Nifti
 
+### Dicom
+
+[[autodoc]] datasets.Dicom
+
 ## Filesystems
 
 [[autodoc]] datasets.filesystems.is_remote_filesystem

diff --git a/setup.py b/setup.py
@@ -210,6 +210,8 @@
 
 NIBABEL_REQUIRE = ["nibabel>=5.3.2"]
 
+PYDICOM_REQUIRE = ["pydicom>=3.0.1"]
+
 EXTRAS_REQUIRE = {
     "audio": AUDIO_REQUIRE,
     "vision": VISION_REQUIRE,
@@ -228,6 +230,7 @@
     "docs": DOCS_REQUIRE,
     "pdfs": PDFS_REQUIRE,
     "nibabel": NIBABEL_REQUIRE,
+    "pydicom": PYDICOM_REQUIRE,
 }
 
 setup(

diff --git a/src/datasets/config.py b/src/datasets/config.py
@@ -140,6 +140,7 @@
 TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None
 PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None
 NIBABEL_AVAILABLE = importlib.util.find_spec("nibabel") is not None
+PYDICOM_AVAILABLE = importlib.util.find_spec("pydicom") is not None
 
 # Optional compression tools
 RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None

diff --git a/src/datasets/features/__init__.py b/src/datasets/features/__init__.py
@@ -16,8 +16,10 @@
     "Video",
     "Pdf",
     "Nifti",
+    "Dicom",
 ]
 from .audio import Audio
+from .dicom import Dicom
 from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, List, Sequence, Value
 from .image import Image
 from .nifti import Nifti