
Commit b219760

Merge pull request #510 from OpenCOMPES/sxp_fixes-for-new-daq
Sxp fixes for new daq
2 parents 8f5e392 + 52a06c1 commit b219760

10 files changed: +326 −94 lines changed


.cspell/custom-dictionary.txt

Lines changed: 2 additions & 0 deletions
@@ -84,6 +84,7 @@ dfpart
 dfpid
 dictionarized
 dictmerge
+DOOCS
 dpkg
 dropna
 dset
@@ -383,6 +384,7 @@ xarray
 xaxis
 xcirc
 xdata
+XFEL
 xind
 Xinstrument
 xlabel

.github/workflows/documentation.yml

Lines changed: 2 additions & 2 deletions
@@ -52,7 +52,6 @@ jobs:
         run: |
           cp -r $GITHUB_WORKSPACE/tutorial $GITHUB_WORKSPACE/docs/
           cp -r $GITHUB_WORKSPACE/sed/config $GITHUB_WORKSPACE/docs/sed
-          rm $GITHUB_WORKSPACE/docs/tutorial/5_sxp_workflow.ipynb


       - name: download RAW data
@@ -61,10 +60,11 @@ jobs:
           cd $GITHUB_WORKSPACE/docs
           poetry run python scripts/download_data.py

-      - name: build Flash parquet files
+      - name: build parquet files
         run: |
           cd $GITHUB_WORKSPACE/docs
           poetry run python scripts/build_flash_parquets.py
+          poetry run python scripts/build_sxp_parquets.py

       # to be removed later. This theme doesn't support <3.9 python and our lock file contains 3.8
       - name: install pydata-sphinx-theme

docs/scripts/build_sxp_parquets.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+from pathlib import Path
+
+import sed
+from sed import SedProcessor
+from sed.dataset import dataset
+
+config_file = Path(sed.__file__).parent / "config/sxp_example_config.yaml"
+
+dataset.get("Au_Mica", root_dir="./tutorial")
+data_path = dataset.dir
+
+
+config_override = {
+    "core": {
+        "paths": {
+            "data_raw_dir": data_path,
+            "data_parquet_dir": data_path + "/processed/",
+        },
+    },
+}
+
+runs = [
+    "0058",
+    "0059",
+    "0060",
+    "0061",
+    "0064",
+    "0065",
+    "0066",
+    "0067",
+    "0068",
+    "0069",
+    "0070",
+    "0071",
+    "0072",
+    "0073",
+    "0074",
+]
+for run in runs:
+    sp = SedProcessor(
+        runs=run,
+        config=config_override,
+        system_config=config_file,
+        collect_metadata=False,
+    )
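Usage note: in the documentation workflow above, this script is run from the docs directory right after build_flash_parquets.py (`poetry run python scripts/build_sxp_parquets.py`). Instantiating SedProcessor once per run is presumably enough to create the per-run parquet buffers under the configured data_parquet_dir, so the SXP tutorial notebook can be executed during the docs build without re-reading the raw DAQ files.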

docs/scripts/download_data.py

Lines changed: 1 addition & 0 deletions
@@ -5,3 +5,4 @@
 dataset.get("WSe2", remove_zip=True, root_dir=root_dir)
 dataset.get("Gd_W110", remove_zip=True, root_dir=root_dir)
 dataset.get("TaS2", remove_zip=True, root_dir=root_dir)
+dataset.get("Au_Mica", remove_zip=True, root_dir=root_dir)

docs/workflows/index.md

Lines changed: 1 addition & 0 deletions
@@ -8,4 +8,5 @@ myst:

 ```{toctree}
 ../tutorial/4_hextof_workflow
+../tutorial/5_sxp_workflow.ipynb
 ```

poetry.lock

Lines changed: 12 additions & 9 deletions
Generated file; diff not rendered.

tutorial/sxp_config.yaml renamed to sed/config/sxp_example_config.yaml

Lines changed: 16 additions & 0 deletions
@@ -17,6 +17,7 @@ dataframe:
   daq: DA03
   forward_fill_iterations: 2
   num_trains: 10
+  # num_pulses: 400 # only needed for data from new DAQ
   x_column: dldPosX
   corrected_x_column: "X"
   kx_column: "kx"
@@ -27,6 +28,7 @@ dataframe:
   tof_ns_column: dldTime
   corrected_tof_column: "tm"
   bias_column: "sampleBias"
+  delay_column: "delayStage"
   tof_binwidth: 6.875E-12 # in seconds
   tof_binning: 0
   jitter_cols: ["dldPosX", "dldPosY", "dldTimeSteps"]
@@ -84,6 +86,11 @@ dataframe:
       format: per_train
       dataset_key: "/CONTROL/SCS_ILH_LAS/MDL/OPTICALDELAY_PP800/actualPosition/value"
       index_key: "/INDEX/trainId"
+    # test:
+    #   daq: DA02 # change DAQ for a channel
+    #   format: per_pulse
+    #   dataset_key: "/INSTRUMENT/SA3_XTD10_XGM/XGM/DOOCS:output/data/intensitySa3TD"
+    #   index_key: "/INSTRUMENT/SA3_XTD10_XGM/XGM/DOOCS:output/data/trainId"

   stream_name_prefixes:
     DA03: "RAW-R"
@@ -92,3 +99,12 @@ dataframe:

   beamtime_dir:
     sxp: "/gpfs/exfel/exp/SXP/"
+
+histogram:
+  # number of bins used for histogram visualization
+  bins: [80, 80, 80, 80]
+  # default axes to use for histogram visualization.
+  # Axes names starting with "@" refer to keys in the "dataframe" section
+  axes: ["@x_column", "@y_column", "@tof_column", "@delay_column"]
+  # default ranges to use for histogram visualization (in unbinned detector coordinates)
+  ranges: [[0, 4000], [0, 4000], [1000, 28000], [-1000, 1000]]
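As the comments in the new histogram block state, axis names starting with "@" refer to keys of the dataframe section. A minimal sketch of that indirection (not the library's actual resolution code; the y_column and tof_column values are assumptions, the other two are taken from this config):

# Sketch of the "@"-indirection used for the default histogram axes.
dataframe_cfg = {
    "x_column": "dldPosX",         # from this config
    "y_column": "dldPosY",         # assumed
    "tof_column": "dldTimeSteps",  # assumed (matches the loader's dropna default)
    "delay_column": "delayStage",  # from this config
}
axes = ["@x_column", "@y_column", "@tof_column", "@delay_column"]
resolved = [dataframe_cfg[name[1:]] if name.startswith("@") else name for name in axes]
print(resolved)  # ['dldPosX', 'dldPosY', 'dldTimeSteps', 'delayStage']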

sed/dataset/datasets.json

Lines changed: 3 additions & 0 deletions
@@ -21,6 +21,9 @@
       "energycal_2020_07_20"
     ]
   },
+  "Au_Mica": {
+    "url": "https://zenodo.org/records/13952965/files/Au_Mica_SXP.zip"
+  },
   "Test": {
     "url": "http://test.com/files/file.zip",
     "subdirs": [

sed/loader/sxp/loader.py

Lines changed: 27 additions & 17 deletions
@@ -256,6 +256,7 @@ def create_multi_index_per_electron(self, h5_file: h5py.File) -> None:
         for i in train_id.index:
             # removing broken trailing hit copies
             num_trains = self._config["dataframe"].get("num_trains", 0)
+            num_pulses = self._config["dataframe"].get("num_pulses", 0)
             if num_trains:
                 try:
                     num_valid_hits = np.where(np.diff(mib_array[i].astype(np.int32)) < 0)[0][
@@ -270,7 +271,10 @@ def create_multi_index_per_electron(self, h5_file: h5py.File) -> None:
             index = 0
             for train, train_end in enumerate(train_ends):
                 macrobunch_index.append(train_id[i] + np.uint(train))
-                microbunch_ids.append(mib_array[i, index:train_end])
+                if num_pulses:
+                    microbunch_ids.append(mib_array[i, index:train_end] % num_pulses)
+                else:
+                    microbunch_ids.append(mib_array[i, index:train_end])
                 indices.append(slice(index, train_end))
                 index = train_end + 1
             macrobunch_indices.append(indices)
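The effect of the new num_pulses option above, in isolation: with the new DAQ the recorded pulse IDs apparently count past the number of pulses in a train, and the modulo folds them back into a per-train pulse index. A minimal numpy sketch with invented numbers (num_pulses = 400 follows the commented entry in the example config):

import numpy as np

# Hypothetical pulse IDs for the hits of one train, counting past 400,
# as the new DAQ presumably delivers them.
mib = np.array([397, 398, 399, 400, 401, 402])
num_pulses = 400

print(mib % num_pulses)  # [397 398 399   0   1   2] -> wrapped per-train pulse index
# With num_pulses unset (0), the IDs are kept as-is, preserving the old behaviour.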
@@ -509,7 +513,7 @@ def create_dataframe_per_train(

     def create_dataframe_per_channel(
         self,
-        h5_file: h5py.File,
+        file_path: Path,
         channel: str,
     ) -> Union[Series, DataFrame]:
         """
@@ -520,7 +524,7 @@ def create_dataframe_per_channel(
         DataFrame depends on the channel's format specified in the configuration.

         Args:
-            h5_file (h5py.File): The h5py.File object representing the HDF5 file.
+            file_path (Path): The path to the main HDF5 file.
             channel (str): The name of the channel.

         Returns:
@@ -530,11 +534,16 @@ def create_dataframe_per_channel(
             ValueError: If the channel has an undefined format.

         """
+        channel_dict = self._config["dataframe"]["channels"][channel]  # channel parameters
+        main_daq = self._config["dataframe"]["daq"]
+        channel_daq = self._config["dataframe"]["channels"][channel].get("daq", main_daq)
+        # load file corresponding to daq
+        h5_file = h5py.File(Path(str(file_path).replace(main_daq, channel_daq)))
+
         [train_id, np_array] = self.create_numpy_array_per_channel(
             h5_file,
             channel,
         )  # numpy Array created
-        channel_dict = self._config["dataframe"]["channels"][channel]  # channel parameters

         # If np_array is size zero, fill with NaNs
         if np_array.size == 0:
@@ -585,7 +594,7 @@ def create_dataframe_per_channel(

     def concatenate_channels(
         self,
-        h5_file: h5py.File,
+        file_path: Path,
     ) -> DataFrame:
         """
         Concatenates the channels from the provided h5py.File into a pandas DataFrame.
@@ -595,7 +604,7 @@ def concatenate_channels(
         available channels specified in the configuration.

         Args:
-            h5_file (h5py.File): The h5py.File object representing the HDF5 file.
+            file_path (Path): The path to the main HDF5 file.

         Returns:
             DataFrame: A concatenated pandas DataFrame containing the channels.
@@ -604,11 +613,13 @@ def concatenate_channels(
             ValueError: If the group_name for any channel does not exist in the file.

         """
-        all_keys = parse_h5_keys(h5_file)  # Parses all channels present
-
         # Check for if the provided dataset_keys and index_keys actually exists in the file
         for channel in self._config["dataframe"]["channels"]:
             dataset_key = self._config["dataframe"]["channels"][channel]["dataset_key"]
+            daq = self._config["dataframe"]["channels"][channel].get("daq", "DA03")
+            # load file corresponding to daq
+            h5_file = h5py.File(Path(str(file_path).replace("DA03", daq)))
+            all_keys = parse_h5_keys(h5_file)  # Parses all channels present
             if dataset_key not in all_keys:
                 raise ValueError(
                     f"The dataset_key for channel {channel} does not exist.",
@@ -621,7 +632,7 @@ def concatenate_channels(

         # Create a generator expression to generate data frames for each channel
         data_frames = (
-            self.create_dataframe_per_channel(h5_file, each) for each in self.available_channels
+            self.create_dataframe_per_channel(file_path, each) for each in self.available_channels
         )

         # Use the reduce function to join the data frames into a single DataFrame
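The common thread of the hunks above: a channel may now live in a different DAQ aggregator file, so the loader hands around the path of the main-DAQ file and derives the sibling file by substituting the DAQ name in that path. A minimal sketch of the substitution, using a hypothetical EuXFEL-style file name (the real names come from stream_name_prefixes in the config; only the replace() call mirrors the loader code):

from pathlib import Path

# Hypothetical raw-file path for the main DAQ (DA03).
file_path = Path("/gpfs/exfel/exp/SXP/r0058/RAW-R0058-DA03-S00000.h5")

main_daq = "DA03"
channel_daq = "DA02"  # e.g. the commented "test" channel in the example config

sibling_file = Path(str(file_path).replace(main_daq, channel_daq))
print(sibling_file)  # /gpfs/exfel/exp/SXP/r0058/RAW-R0058-DA02-S00000.h5

Because the file to open now depends on the channel, create_dataframe_per_file also drops its single with h5py.File(...) context manager, as the last hunk below shows.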
@@ -649,14 +660,13 @@ def create_dataframe_per_file(

         """
         # Loads h5 file and creates a dataframe
-        with h5py.File(file_path, "r") as h5_file:
-            self.reset_multi_index()  # Reset MultiIndexes for next file
-            df = self.concatenate_channels(h5_file)
-            df = df.dropna(subset=self._config["dataframe"].get("tof_column", "dldTimeSteps"))
-            # correct the 3 bit shift which encodes the detector ID in the 8s time
-            if self._config["dataframe"].get("split_sector_id_from_dld_time", False):
-                df = split_dld_time_from_sector_id(df, config=self._config)
-            return df
+        self.reset_multi_index()  # Reset MultiIndexes for next file
+        df = self.concatenate_channels(file_path)
+        df = df.dropna(subset=self._config["dataframe"].get("tof_column", "dldTimeSteps"))
+        # correct the 3 bit shift which encodes the detector ID in the 8s time
+        if self._config["dataframe"].get("split_sector_id_from_dld_time", False):
+            df = split_dld_time_from_sector_id(df, config=self._config)
+        return df

     def create_buffer_file(self, h5_path: Path, parquet_path: Path) -> Union[bool, Exception]:
         """
