broadinstitute · kilonzi · Apr 24, 2025 · Apr 24, 2025 · Apr 25, 2025 · Apr 25, 2025
diff --git a/model_zoo/PCLR/.gitattributes b/model_zoo/PCLR/.gitattributes
@@ -0,0 +1 @@
+*.h5 filter=lfs diff=lfs merge=lfs -text
diff --git a/model_zoo/PCLR/README.md b/model_zoo/PCLR/README.md
@@ -22,7 +22,7 @@ python -i get_representations.py  # test the setup worked
 You can get ECG representations using [get_representations.py](./get_representations.py).
 `get_representations.get_representations` builds `N x 320` ECG representations from `N` ECGs.
 
-The model expects 10s 12-lead ECGs with a specific lead order and interpolated to be 4,096 samples long.
+The model expects 10s 12-lead ECGs measured in milli-volts with a specific lead order and interpolated to be 4,096 samples long.
 [preprocess_ecg.py](./preprocess_ecg.py) shows how to do the pre-processing.
 
 ### Use git LFS to localize the model file
@@ -103,6 +103,24 @@ the model only takes lead I of the ECG as input.
 ## Lead II PCLR
 [Lead II PCLR](./PCLR_lead_II.h5) is like lead I PCLR except it was trained with all ECGs sampled to 250Hz.
 
+## C3PO PCLR and AUG C3PO PCLR
+We also provide PCLR models trained using subjects from the C3PO cohort, with and without augmentation.
+The model files are available via:
+
+`git lfs pull --include model_zoo/PCLR/c3po_pclr.h5`
+
+`git lfs pull --include model_zoo/PCLR/aug_c3po_pclr.h5`
+
+You can get ECG representations with [get_representations.py](./get_representations.py), for example by calling `get_representations(ecgs, model_name='c3po_pclr')`.
+`get_representations.get_representations` builds `N x 320` ECG representations from `N` ECGs.
+
+The model expects 10s 12-lead ECGs measured in milli-volts with a specific lead order and interpolated to be 2,500 samples long. Note that this interpolation is different from the standard PCLR model.
+[preprocess_ecg.py](./preprocess_ecg.py) shows how to do the pre-processing; when calling it remember to set `ecg_samples=2500`.
+
+The code snippet above showing example inference with UKB ECGs is also appropriate for these models. Remember to:
+1. Load `c3po_pclr.h5` or `aug_c3po_pclr.h5` instead of `PCLR.h5`.
+2. Interpolate to 2500 instead of 4096.
+
 ## Alternative save format
 The newer keras saved model format is available for the 12-lead and single lead models at [PCLR](./PCLR)
 and [PCLR_lead_I](./PCLR_lead_I) and [PCLR_lead_II](./PCLR_lead_II).
diff --git a/model_zoo/PCLR/aug_c3po_pclr.h5 b/model_zoo/PCLR/aug_c3po_pclr.h5
diff --git a/model_zoo/PCLR/c3po_pclr.h5 b/model_zoo/PCLR/c3po_pclr.h5
diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/__init__.py b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/__init__.py
diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/c3po_pclr_model_schema.json b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/c3po_pclr_model_schema.json
@@ -0,0 +1,16 @@
+{
+  "inputs": [ 
+    {
+      "name": "ecg",
+      "shape": [2500, 12],
+      "dtype": "FP32"
+    }
+  ],
+  "outputs": [
+    {
+      "name": "output_0",
+      "shape": [320],
+      "dtype": "FP32"
+    }
+  ]
+}
diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/Dockerfile b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:3.9-slim
+WORKDIR /app
+# Install dependencies before copying the scripts so this layer is reused when only the scripts change.
+COPY requirements.txt /app/
+RUN pip install --no-cache-dir -r /app/requirements.txt
+COPY prepare.py finalize.py /app/
+ENTRYPOINT ["python"]
diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/finalize.py b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/finalize.py
@@ -0,0 +1,32 @@
+import argparse
+import json
+import pandas as pd
+
+latent_dimensions = 320  # number of pclr_<i> columns; equals the "output_0" shape in the model schema
+
+def finalize(input_csv, predictions_json, output_csv):
+    """Append `pclr_<i>` embedding columns from `predictions_json` to `input_csv`, saved as `output_csv`.
+
+    Raises ValueError when the number of predictions differs from the number of CSV rows.
+    """
+    with open(predictions_json, "r") as handle:
+        predictions = json.load(handle)
+    df = pd.read_csv(input_csv, dtype={"file_id": str})
+    embedding = predictions["output_0"]
+    if len(embedding) != len(df):
+        raise ValueError(f"Mismatch: {len(embedding)} predictions but {len(df)} rows in input CSV!")
+    # Name one column per latent dimension, then place the embeddings next to the input columns.
+    columns = [f'pclr_{i}' for i in range(latent_dimensions)]
+    df = pd.concat([df, pd.DataFrame(embedding, columns=columns)], axis=1)
+    df.to_csv(output_csv, index=False)
+    print(f"✅ Predictions written to {output_csv} ({len(df)} rows).")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: all three paths are required.
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--input", required=True, help="Path to input CSV")
+    cli.add_argument("--output", required=True, help="Path to final CSV with predictions")
+    cli.add_argument("--predictions", required=True, help="Path to predictions JSON")
+    options = cli.parse_args()
+    finalize(options.input, options.predictions, options.output)
diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/prepare.py b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/prepare.py
@@ -0,0 +1,56 @@
+import argparse
+
+import h5py
+import numpy as np
+import pandas as pd
+import smart_open
+
+leads = [  # lead order of the columns filled in by ecg_as_tensor
+    'I', 'II', 'III', 'aVR', 'aVL', 'aVF',
+    'V1', 'V2', 'V3', 'V4', 'V5', 'V6',
+]
+# Samples per lead after interpolation; matches the "ecg" input shape in c3po_pclr_model_schema.json.
+ECG_LENGTH = 2500
+ECG_SHAPE = (ECG_LENGTH, 12)  # (samples, leads)
+ECG_HD5_PATH = 'ukb_ecg_rest'  # HD5 group holding the strip_<lead> datasets
+
+def ecg_as_tensor(ecg_file):
+    """Read one ECG HD5 file (opened through smart_open) into a float32 array of shape ECG_SHAPE.
+
+    Column k holds lead `leads[k]`, linearly resampled to ECG_LENGTH samples and divided by 1000.
+    """
+    ecg = np.zeros(ECG_SHAPE, dtype=np.float32)
+    target_grid = np.linspace(0, 1, ECG_LENGTH)  # identical for every lead, so built once
+    with smart_open.open(ecg_file, 'rb') as f, h5py.File(f, 'r') as hd5:
+        for k, lead_name in enumerate(leads):
+            lead = np.array(hd5[f'{ECG_HD5_PATH}/strip_{lead_name}/instance_0'])
+            source_grid = np.linspace(0, 1, lead.shape[0])
+            # NOTE(review): / 1000 presumably converts micro-volts to the milli-volts the model expects; confirm.
+            ecg[:, k] = np.interp(target_grid, source_grid, lead) / 1000
+
+    return ecg
+
+def prepare(input_csv, output_h5):
+    """Processes ECG files into HDF5 tensor format from GCS/Azure/Local.
+
+    Each CSV row with a `file` path becomes dataset `tensors/<file_id>` in `output_h5`; other rows are skipped.
+    """
+    df = pd.read_csv(input_csv, dtype={"file": str}).dropna(subset=["file"])
+    df["file"] = df["file"].astype(str)
+    # The context manager closes the output file even when an ECG fails to load.
+    with h5py.File(output_h5, "w") as h5_file:
+        tensors_group = h5_file.create_group("tensors")
+        for _, row in df.iterrows():
+            sample_id, file_path = row["file_id"], row["file"]
+            print(f"Processing: sample_id={sample_id}, file_path={file_path}, type={type(file_path)}")
+            tensors_group.create_dataset(str(sample_id), data=ecg_as_tensor(file_path))
+    print(f"Processed ECG tensors saved to {output_h5}")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: both paths are required.
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--input", required=True, help="Path to input CSV")
+    cli.add_argument("--output", required=True, help="Path to output HDF5 file")
+    options = cli.parse_args()
+    prepare(options.input, options.output)
diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/requirements.txt b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/requirements.txt
@@ -0,0 +1,4 @@
+pandas
+numpy
+h5py
+smart-open[gcs]
diff --git a/model_zoo/PCLR/deployment/PCLR/v1/__init__.py b/model_zoo/PCLR/deployment/PCLR/v1/__init__.py
diff --git a/model_zoo/PCLR/deployment/PCLR/v1/pclr_model_schema.json b/model_zoo/PCLR/deployment/PCLR/v1/pclr_model_schema.json
@@ -0,0 +1,16 @@
+{
+  "inputs": [ 
+    {
+      "name": "ecg",
+      "shape": [4096, 12],
+      "dtype": "FP32"
+    }
+  ],
+  "outputs": [
+    {
+      "name": "output_0",
+      "shape": [320],
+      "dtype": "FP32"
+    }
+  ]
+}
diff --git a/model_zoo/PCLR/deployment/PCLR/v1/processing_image/Dockerfile b/model_zoo/PCLR/deployment/PCLR/v1/processing_image/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:3.9-slim
+WORKDIR /app
+# Install dependencies before copying the scripts so this layer is reused when only the scripts change.
+COPY requirements.txt /app/
+RUN pip install --no-cache-dir -r /app/requirements.txt
+COPY prepare.py finalize.py /app/
+ENTRYPOINT ["python"]
diff --git a/model_zoo/PCLR/deployment/PCLR/v1/processing_image/finalize.py b/model_zoo/PCLR/deployment/PCLR/v1/processing_image/finalize.py
@@ -0,0 +1,32 @@
+import argparse
+import json
+import pandas as pd
+
+latent_dimensions = 320  # number of pclr_<i> columns; equals the "output_0" shape in the model schema
+
+def finalize(input_csv, predictions_json, output_csv):
+    """Append `pclr_<i>` embedding columns from `predictions_json` to `input_csv`, saved as `output_csv`.
+
+    Raises ValueError when the number of predictions differs from the number of CSV rows.
+    """
+    with open(predictions_json, "r") as handle:
+        predictions = json.load(handle)
+    df = pd.read_csv(input_csv, dtype={"file_id": str})
+    embedding = predictions["output_0"]
+    if len(embedding) != len(df):
+        raise ValueError(f"Mismatch: {len(embedding)} predictions but {len(df)} rows in input CSV!")
+    # Name one column per latent dimension, then place the embeddings next to the input columns.
+    columns = [f'pclr_{i}' for i in range(latent_dimensions)]
+    df = pd.concat([df, pd.DataFrame(embedding, columns=columns)], axis=1)
+    df.to_csv(output_csv, index=False)
+    print(f"✅ Predictions written to {output_csv} ({len(df)} rows).")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: all three paths are required.
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--input", required=True, help="Path to input CSV")
+    cli.add_argument("--output", required=True, help="Path to final CSV with predictions")
+    cli.add_argument("--predictions", required=True, help="Path to predictions JSON")
+    options = cli.parse_args()
+    finalize(options.input, options.predictions, options.output)
diff --git a/model_zoo/PCLR/deployment/PCLR/v1/processing_image/prepare.py b/model_zoo/PCLR/deployment/PCLR/v1/processing_image/prepare.py
@@ -0,0 +1,56 @@
+import argparse
+
+import h5py
+import numpy as np
+import pandas as pd
+import smart_open
+
+leads = [  # lead order of the columns filled in by ecg_as_tensor
+    'I', 'II', 'III', 'aVR', 'aVL', 'aVF',
+    'V1', 'V2', 'V3', 'V4', 'V5', 'V6',
+]
+# Samples per lead after interpolation; matches the "ecg" input shape in pclr_model_schema.json.
+ECG_LENGTH = 4096
+ECG_SHAPE = (ECG_LENGTH, 12)  # (samples, leads)
+ECG_HD5_PATH = 'ukb_ecg_rest'  # HD5 group holding the strip_<lead> datasets
+
+def ecg_as_tensor(ecg_file):
+    """Read one ECG HD5 file (opened through smart_open) into a float32 array of shape ECG_SHAPE.
+
+    Column k holds lead `leads[k]`, linearly resampled to ECG_LENGTH samples and divided by 1000.
+    """
+    ecg = np.zeros(ECG_SHAPE, dtype=np.float32)
+    target_grid = np.linspace(0, 1, ECG_LENGTH)  # identical for every lead, so built once
+    with smart_open.open(ecg_file, 'rb') as f, h5py.File(f, 'r') as hd5:
+        for k, lead_name in enumerate(leads):
+            lead = np.array(hd5[f'{ECG_HD5_PATH}/strip_{lead_name}/instance_0'])
+            source_grid = np.linspace(0, 1, lead.shape[0])
+            # NOTE(review): / 1000 presumably converts micro-volts to the milli-volts the model expects; confirm.
+            ecg[:, k] = np.interp(target_grid, source_grid, lead) / 1000
+
+    return ecg
+
+def prepare(input_csv, output_h5):
+    """Processes ECG files into HDF5 tensor format from GCS/Azure/Local.
+
+    Each CSV row with a `file` path becomes dataset `tensors/<file_id>` in `output_h5`; other rows are skipped.
+    """
+    df = pd.read_csv(input_csv, dtype={"file": str}).dropna(subset=["file"])
+    df["file"] = df["file"].astype(str)
+    # The context manager closes the output file even when an ECG fails to load.
+    with h5py.File(output_h5, "w") as h5_file:
+        tensors_group = h5_file.create_group("tensors")
+        for _, row in df.iterrows():
+            sample_id, file_path = row["file_id"], row["file"]
+            print(f"Processing: sample_id={sample_id}, file_path={file_path}, type={type(file_path)}")
+            tensors_group.create_dataset(str(sample_id), data=ecg_as_tensor(file_path))
+    print(f"Processed ECG tensors saved to {output_h5}")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: both paths are required.
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--input", required=True, help="Path to input CSV")
+    cli.add_argument("--output", required=True, help="Path to output HDF5 file")
+    options = cli.parse_args()
+    prepare(options.input, options.output)
diff --git a/model_zoo/PCLR/deployment/PCLR/v1/processing_image/requirements.txt b/model_zoo/PCLR/deployment/PCLR/v1/processing_image/requirements.txt
@@ -0,0 +1,4 @@
+pandas
+numpy
+h5py
+smart-open[gcs]
diff --git a/model_zoo/PCLR/deployment/__init__.py b/model_zoo/PCLR/deployment/__init__.py
diff --git a/model_zoo/PCLR/get_representations.py b/model_zoo/PCLR/get_representations.py
@@ -6,20 +6,27 @@
 from preprocess_ecg import process_ecg, LEADS
 
 
-def get_model() -> Model:
-    """Get PCLR embedding model"""
-    return load_model("./PCLR.h5")
+def get_model(model_name: str = 'pclr') -> Model:
+    """Get a PCLR embedding model ('pclr', 'c3po_pclr' or 'aug_c3po_pclr'); raise ValueError otherwise."""
+    # Weights files are looked up relative to the current working directory.
+    model_files = {'pclr': "./PCLR.h5", 'c3po_pclr': "./c3po_pclr.h5", 'aug_c3po_pclr': "./aug_c3po_pclr.h5"}
+    if model_name not in model_files:
+        # Fail loudly instead of silently returning None for an unsupported name.
+        raise ValueError(f"Unknown model_name {model_name!r}; expected one of {sorted(model_files)}")
+    return load_model(model_files[model_name])
 
 
-def get_representations(ecgs: List[Dict[str, np.ndarray]]) -> np.ndarray:
+def get_representations(ecgs: List[Dict[str, np.ndarray]], model_name: str = 'pclr') -> np.ndarray:
     """
     Uses PCLR trained model to build representations of ECGs
     :param ecgs: A list of dictionaries mapping lead name to lead values.
                  The lead values should be measured in milli-volts.
                  Each lead should represent 10s of samples.
-    :return:
+    :param model_name: Specifies the model to use: 'pclr' (default), 'c3po_pclr' or 'aug_c3po_pclr'.
+    :return: An N x 320 array holding one representation per input ECG.
     """
-    model = get_model()
-    ecgs = np.stack(list(map(process_ecg, ecgs)))
+    model = get_model(model_name)
+    kwargs = {} if model_name == 'pclr' else {'ecg_samples': 2500}  # README: C3PO models need ecg_samples=2500; confirm process_ecg accepts it
+    ecgs = np.stack([process_ecg(ecg, **kwargs) for ecg in ecgs])
     return model.predict(ecgs)