
Commit b219760

Merge pull request #510 from OpenCOMPES/sxp_fixes-for-new-daq
Sxp fixes for new daq
2 parents 8f5e392 + 52a06c1 commit b219760

10 files changed: +326 −94 lines changed


.cspell/custom-dictionary.txt

Lines changed: 2 additions & 0 deletions
@@ -84,6 +84,7 @@ dfpart
 dfpid
 dictionarized
 dictmerge
+DOOCS
 dpkg
 dropna
 dset
@@ -383,6 +384,7 @@ xarray
 xaxis
 xcirc
 xdata
+XFEL
 xind
 Xinstrument
 xlabel

.github/workflows/documentation.yml

Lines changed: 2 additions & 2 deletions
@@ -52,7 +52,6 @@ jobs:
         run: |
           cp -r $GITHUB_WORKSPACE/tutorial $GITHUB_WORKSPACE/docs/
           cp -r $GITHUB_WORKSPACE/sed/config $GITHUB_WORKSPACE/docs/sed
-          rm $GITHUB_WORKSPACE/docs/tutorial/5_sxp_workflow.ipynb


       - name: download RAW data
@@ -61,10 +60,11 @@ jobs:
           cd $GITHUB_WORKSPACE/docs
           poetry run python scripts/download_data.py

-      - name: build Flash parquet files
+      - name: build parquet files
         run: |
           cd $GITHUB_WORKSPACE/docs
           poetry run python scripts/build_flash_parquets.py
+          poetry run python scripts/build_sxp_parquets.py

       # to be removed later. This theme doesn't support <3.9 python and our lock file contains 3.8
       - name: install pydata-sphinx-theme

docs/scripts/build_sxp_parquets.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+from pathlib import Path
+
+import sed
+from sed import SedProcessor
+from sed.dataset import dataset
+
+config_file = Path(sed.__file__).parent / "config/sxp_example_config.yaml"
+
+dataset.get("Au_Mica", root_dir="./tutorial")
+data_path = dataset.dir
+
+
+config_override = {
+    "core": {
+        "paths": {
+            "data_raw_dir": data_path,
+            "data_parquet_dir": data_path + "/processed/",
+        },
+    },
+}
+
+runs = [
+    "0058",
+    "0059",
+    "0060",
+    "0061",
+    "0064",
+    "0065",
+    "0066",
+    "0067",
+    "0068",
+    "0069",
+    "0070",
+    "0071",
+    "0072",
+    "0073",
+    "0074",
+]
+for run in runs:
+    sp = SedProcessor(
+        runs=run,
+        config=config_override,
+        system_config=config_file,
+        collect_metadata=False,
+    )
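Usage note: in the documentation workflow above, this script is run from the docs directory right after build_flash_parquets.py (`poetry run python scripts/build_sxp_parquets.py`). Instantiating SedProcessor once per run is presumably enough to create the per-run parquet buffers under the configured data_parquet_dir, so the SXP tutorial notebook can be executed during the docs build without re-reading the raw DAQ files.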

docs/scripts/download_data.py

Lines changed: 1 addition & 0 deletions
@@ -5,3 +5,4 @@
 dataset.get("WSe2", remove_zip=True, root_dir=root_dir)
 dataset.get("Gd_W110", remove_zip=True, root_dir=root_dir)
 dataset.get("TaS2", remove_zip=True, root_dir=root_dir)
+dataset.get("Au_Mica", remove_zip=True, root_dir=root_dir)

docs/workflows/index.md

Lines changed: 1 addition & 0 deletions
@@ -8,4 +8,5 @@ myst:

 ```{toctree}
 ../tutorial/4_hextof_workflow
+../tutorial/5_sxp_workflow.ipynb
 ```

poetry.lock

Lines changed: 12 additions & 9 deletions
Generated file; diff not rendered.

tutorial/sxp_config.yaml renamed to sed/config/sxp_example_config.yaml

Lines changed: 16 additions & 0 deletions
@@ -17,6 +17,7 @@ dataframe:
   daq: DA03
   forward_fill_iterations: 2
   num_trains: 10
+  # num_pulses: 400 # only needed for data from new DAQ
   x_column: dldPosX
   corrected_x_column: "X"
   kx_column: "kx"
@@ -27,6 +28,7 @@ dataframe:
   tof_ns_column: dldTime
   corrected_tof_column: "tm"
   bias_column: "sampleBias"
+  delay_column: "delayStage"
   tof_binwidth: 6.875E-12 # in seconds
   tof_binning: 0
   jitter_cols: ["dldPosX", "dldPosY", "dldTimeSteps"]
@@ -84,6 +86,11 @@ dataframe:
       format: per_train
       dataset_key: "/CONTROL/SCS_ILH_LAS/MDL/OPTICALDELAY_PP800/actualPosition/value"
       index_key: "/INDEX/trainId"
+    # test:
+    #   daq: DA02 # change DAQ for a channel
+    #   format: per_pulse
+    #   dataset_key: "/INSTRUMENT/SA3_XTD10_XGM/XGM/DOOCS:output/data/intensitySa3TD"
+    #   index_key: "/INSTRUMENT/SA3_XTD10_XGM/XGM/DOOCS:output/data/trainId"

   stream_name_prefixes:
     DA03: "RAW-R"
@@ -92,3 +99,12 @@ dataframe:

   beamtime_dir:
     sxp: "/gpfs/exfel/exp/SXP/"
+
+histogram:
+  # number of bins used for histogram visualization
+  bins: [80, 80, 80, 80]
+  # default axes to use for histogram visualization.
+  # Axes names starting with "@" refer to keys in the "dataframe" section
+  axes: ["@x_column", "@y_column", "@tof_column", "@delay_column"]
+  # default ranges to use for histogram visualization (in unbinned detector coordinates)
+  ranges: [[0, 4000], [0, 4000], [1000, 28000], [-1000, 1000]]
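As the comments in the new histogram block state, axis names starting with "@" refer to keys of the dataframe section. A minimal sketch of that indirection (not the library's actual resolution code; the y_column and tof_column values are assumptions, the other two are taken from this config):

# Sketch of the "@"-indirection used for the default histogram axes.
dataframe_cfg = {
    "x_column": "dldPosX",         # from this config
    "y_column": "dldPosY",         # assumed
    "tof_column": "dldTimeSteps",  # assumed (matches the loader's dropna default)
    "delay_column": "delayStage",  # from this config
}
axes = ["@x_column", "@y_column", "@tof_column", "@delay_column"]
resolved = [dataframe_cfg[name[1:]] if name.startswith("@") else name for name in axes]
print(resolved)  # ['dldPosX', 'dldPosY', 'dldTimeSteps', 'delayStage']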

sed/dataset/datasets.json

Lines changed: 3 additions & 0 deletions
@@ -21,6 +21,9 @@
       "energycal_2020_07_20"
     ]
   },
+  "Au_Mica": {
+    "url": "https://zenodo.org/records/13952965/files/Au_Mica_SXP.zip"
+  },
   "Test": {
     "url": "http://test.com/files/file.zip",
     "subdirs": [

sed/loader/sxp/loader.py

Lines changed: 27 additions & 17 deletions
@@ -256,6 +256,7 @@ def create_multi_index_per_electron(self, h5_file: h5py.File) -> None:
         for i in train_id.index:
             # removing broken trailing hit copies
             num_trains = self._config["dataframe"].get("num_trains", 0)
+            num_pulses = self._config["dataframe"].get("num_pulses", 0)
             if num_trains:
                 try:
                     num_valid_hits = np.where(np.diff(mib_array[i].astype(np.int32)) < 0)[0][
@@ -270,7 +271,10 @@ def create_multi_index_per_electron(self, h5_file: h5py.File) -> None:
             index = 0
             for train, train_end in enumerate(train_ends):
                 macrobunch_index.append(train_id[i] + np.uint(train))
-                microbunch_ids.append(mib_array[i, index:train_end])
+                if num_pulses:
+                    microbunch_ids.append(mib_array[i, index:train_end] % num_pulses)
+                else:
+                    microbunch_ids.append(mib_array[i, index:train_end])
                 indices.append(slice(index, train_end))
                 index = train_end + 1
             macrobunch_indices.append(indices)
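The effect of the new num_pulses option above, in isolation: with the new DAQ the recorded pulse IDs apparently count past the number of pulses in a train, and the modulo folds them back into a per-train pulse index. A minimal numpy sketch with invented numbers (num_pulses = 400 follows the commented entry in the example config):

import numpy as np

# Hypothetical pulse IDs for the hits of one train, counting past 400,
# as the new DAQ presumably delivers them.
mib = np.array([397, 398, 399, 400, 401, 402])
num_pulses = 400

print(mib % num_pulses)  # [397 398 399   0   1   2] -> wrapped per-train pulse index
# With num_pulses unset (0), the IDs are kept as-is, preserving the old behaviour.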
@@ -509,7 +513,7 @@ def create_dataframe_per_train(

     def create_dataframe_per_channel(
         self,
-        h5_file: h5py.File,
+        file_path: Path,
         channel: str,
     ) -> Union[Series, DataFrame]:
         """
@@ -520,7 +524,7 @@ def create_dataframe_per_channel(
         DataFrame depends on the channel's format specified in the configuration.

         Args:
-            h5_file (h5py.File): The h5py.File object representing the HDF5 file.
+            file_path (Path): The path to the main HDF5 file.
             channel (str): The name of the channel.

         Returns:
@@ -530,11 +534,16 @@ def create_dataframe_per_channel(
             ValueError: If the channel has an undefined format.

         """
+        channel_dict = self._config["dataframe"]["channels"][channel]  # channel parameters
+        main_daq = self._config["dataframe"]["daq"]
+        channel_daq = self._config["dataframe"]["channels"][channel].get("daq", main_daq)
+        # load file corresponding to daq
+        h5_file = h5py.File(Path(str(file_path).replace(main_daq, channel_daq)))
+
         [train_id, np_array] = self.create_numpy_array_per_channel(
             h5_file,
             channel,
         )  # numpy Array created
-        channel_dict = self._config["dataframe"]["channels"][channel]  # channel parameters

         # If np_array is size zero, fill with NaNs
         if np_array.size == 0:
@@ -585,7 +594,7 @@ def create_dataframe_per_channel(

     def concatenate_channels(
         self,
-        h5_file: h5py.File,
+        file_path: Path,
     ) -> DataFrame:
         """
         Concatenates the channels from the provided h5py.File into a pandas DataFrame.
@@ -595,7 +604,7 @@ def concatenate_channels(
         available channels specified in the configuration.

         Args:
-            h5_file (h5py.File): The h5py.File object representing the HDF5 file.
+            file_path (Path): The path to the main HDF5 file.

         Returns:
             DataFrame: A concatenated pandas DataFrame containing the channels.
@@ -604,11 +613,13 @@ def concatenate_channels(
             ValueError: If the group_name for any channel does not exist in the file.

         """
-        all_keys = parse_h5_keys(h5_file)  # Parses all channels present
-
         # Check for if the provided dataset_keys and index_keys actually exists in the file
         for channel in self._config["dataframe"]["channels"]:
             dataset_key = self._config["dataframe"]["channels"][channel]["dataset_key"]
+            daq = self._config["dataframe"]["channels"][channel].get("daq", "DA03")
+            # load file corresponding to daq
+            h5_file = h5py.File(Path(str(file_path).replace("DA03", daq)))
+            all_keys = parse_h5_keys(h5_file)  # Parses all channels present
             if dataset_key not in all_keys:
                 raise ValueError(
                     f"The dataset_key for channel {channel} does not exist.",
@@ -621,7 +632,7 @@ def concatenate_channels(

         # Create a generator expression to generate data frames for each channel
         data_frames = (
-            self.create_dataframe_per_channel(h5_file, each) for each in self.available_channels
+            self.create_dataframe_per_channel(file_path, each) for each in self.available_channels
         )

         # Use the reduce function to join the data frames into a single DataFrame
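The common thread of the hunks above: a channel may now live in a different DAQ aggregator file, so the loader hands around the path of the main-DAQ file and derives the sibling file by substituting the DAQ name in that path. A minimal sketch of the substitution, using a hypothetical EuXFEL-style file name (the real names come from stream_name_prefixes in the config; only the replace() call mirrors the loader code):

from pathlib import Path

# Hypothetical raw-file path for the main DAQ (DA03).
file_path = Path("/gpfs/exfel/exp/SXP/r0058/RAW-R0058-DA03-S00000.h5")

main_daq = "DA03"
channel_daq = "DA02"  # e.g. the commented "test" channel in the example config

sibling_file = Path(str(file_path).replace(main_daq, channel_daq))
print(sibling_file)  # /gpfs/exfel/exp/SXP/r0058/RAW-R0058-DA02-S00000.h5

Because the file to open now depends on the channel, create_dataframe_per_file also drops its single with h5py.File(...) context manager, as the last hunk below shows.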
@@ -649,14 +660,13 @@ def create_dataframe_per_file(

         """
         # Loads h5 file and creates a dataframe
-        with h5py.File(file_path, "r") as h5_file:
-            self.reset_multi_index()  # Reset MultiIndexes for next file
-            df = self.concatenate_channels(h5_file)
-            df = df.dropna(subset=self._config["dataframe"].get("tof_column", "dldTimeSteps"))
-            # correct the 3 bit shift which encodes the detector ID in the 8s time
-            if self._config["dataframe"].get("split_sector_id_from_dld_time", False):
-                df = split_dld_time_from_sector_id(df, config=self._config)
-            return df
+        self.reset_multi_index()  # Reset MultiIndexes for next file
+        df = self.concatenate_channels(file_path)
+        df = df.dropna(subset=self._config["dataframe"].get("tof_column", "dldTimeSteps"))
+        # correct the 3 bit shift which encodes the detector ID in the 8s time
+        if self._config["dataframe"].get("split_sector_id_from_dld_time", False):
+            df = split_dld_time_from_sector_id(df, config=self._config)
+        return df

     def create_buffer_file(self, h5_path: Path, parquet_path: Path) -> Union[bool, Exception]:
         """
