From 72687d47f9f22506d7864c1e646311f9085e3fe0 Mon Sep 17 00:00:00 2001 From: rettigl Date: Sat, 6 Apr 2024 23:06:52 +0200 Subject: [PATCH 01/10] add option for selecting daq per channel --- sed/loader/sxp/loader.py | 37 ++++++++++++++++++++---------------- tutorial/sxp_config.yaml | 41 +++++++++++++++++++++++++--------------- 2 files changed, 47 insertions(+), 31 deletions(-) diff --git a/sed/loader/sxp/loader.py b/sed/loader/sxp/loader.py index 878ad3fa..30a3f40b 100644 --- a/sed/loader/sxp/loader.py +++ b/sed/loader/sxp/loader.py @@ -509,7 +509,7 @@ def create_dataframe_per_train( def create_dataframe_per_channel( self, - h5_file: h5py.File, + file_path: Path, channel: str, ) -> Union[Series, DataFrame]: """ @@ -520,7 +520,7 @@ def create_dataframe_per_channel( DataFrame depends on the channel's format specified in the configuration. Args: - h5_file (h5py.File): The h5py.File object representing the HDF5 file. + file_path (Path): The path to the main HDF5 file. channel (str): The name of the channel. Returns: @@ -530,11 +530,15 @@ def create_dataframe_per_channel( ValueError: If the channel has an undefined format. """ + channel_dict = self._config["dataframe"]["channels"][channel] # channel parameters + daq = self._config["dataframe"]["channels"][channel].get("daq", "DA03") + # load file corresponding to daq + h5_file = h5py.File(Path(str(file_path).replace("DA03", daq))) + [train_id, np_array] = self.create_numpy_array_per_channel( h5_file, channel, ) # numpy Array created - channel_dict = self._config["dataframe"]["channels"][channel] # channel parameters # If np_array is size zero, fill with NaNs if np_array.size == 0: @@ -585,7 +589,7 @@ def create_dataframe_per_channel( def concatenate_channels( self, - h5_file: h5py.File, + file_path: Path, ) -> DataFrame: """ Concatenates the channels from the provided h5py.File into a pandas DataFrame. @@ -595,7 +599,7 @@ def concatenate_channels( available channels specified in the configuration. 
Args: - h5_file (h5py.File): The h5py.File object representing the HDF5 file. + file_path (Path): The path to the main HDF5 file. Returns: DataFrame: A concatenated pandas DataFrame containing the channels. @@ -604,11 +608,13 @@ def concatenate_channels( ValueError: If the group_name for any channel does not exist in the file. """ - all_keys = parse_h5_keys(h5_file) # Parses all channels present - # Check for if the provided dataset_keys and index_keys actually exists in the file for channel in self._config["dataframe"]["channels"]: dataset_key = self._config["dataframe"]["channels"][channel]["dataset_key"] + daq = self._config["dataframe"]["channels"][channel].get("daq", "DA03") + # load file corresponding to daq + h5_file = h5py.File(Path(str(file_path).replace("DA03", daq))) + all_keys = parse_h5_keys(h5_file) # Parses all channels present if dataset_key not in all_keys: raise ValueError( f"The dataset_key for channel {channel} does not exist.", @@ -621,7 +627,7 @@ def concatenate_channels( # Create a generator expression to generate data frames for each channel data_frames = ( - self.create_dataframe_per_channel(h5_file, each) for each in self.available_channels + self.create_dataframe_per_channel(file_path, each) for each in self.available_channels ) # Use the reduce function to join the data frames into a single DataFrame @@ -649,14 +655,13 @@ def create_dataframe_per_file( """ # Loads h5 file and creates a dataframe - with h5py.File(file_path, "r") as h5_file: - self.reset_multi_index() # Reset MultiIndexes for next file - df = self.concatenate_channels(h5_file) - df = df.dropna(subset=self._config["dataframe"].get("tof_column", "dldTimeSteps")) - # correct the 3 bit shift which encodes the detector ID in the 8s time - if self._config["dataframe"].get("split_sector_id_from_dld_time", False): - df = split_dld_time_from_sector_id(df, config=self._config) - return df + self.reset_multi_index() # Reset MultiIndexes for next file + df = 
self.concatenate_channels(file_path) + df = df.dropna(subset=self._config["dataframe"].get("tof_column", "dldTimeSteps")) + # correct the 3 bit shift which encodes the detector ID in the 8s time + if self._config["dataframe"].get("split_sector_id_from_dld_time", False): + df = split_dld_time_from_sector_id(df, config=self._config) + return df def create_buffer_file(self, h5_path: Path, parquet_path: Path) -> Union[bool, Exception]: """ diff --git a/tutorial/sxp_config.yaml b/tutorial/sxp_config.yaml index 70e7d163..dc855f47 100644 --- a/tutorial/sxp_config.yaml +++ b/tutorial/sxp_config.yaml @@ -16,7 +16,7 @@ dataframe: ubid_offset: 0 daq: DA03 forward_fill_iterations: 2 - num_trains: 10 + num_trains: 1 x_column: dldPosX corrected_x_column: "X" kx_column: "kx" @@ -51,39 +51,50 @@ dataframe: channels: timeStamp: + daq: "DA03" format: per_train dataset_key: "/INDEX/timestamp" index_key: "/INDEX/trainId" scale: 1000000000 pulseId: + daq: "DA03" format: per_electron - dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/starterCounter" - index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/trainId" + dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/starterCounter" + index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/trainId" max_hits: 10000 trainId: + daq: "DA03" format: per_electron - dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/masterCounter" - index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/trainId" + dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/masterCounter" + index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/trainId" max_hits: 10000 dldPosX: + daq: "DA03" format: per_electron - dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/x" - index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/trainId" + dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/x" 
+ index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/trainId" max_hits: 10000 dldPosY: + daq: "DA03" format: per_electron - dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/y" - index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/trainId" + dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/y" + index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/trainId" max_hits: 10000 dldTimeSteps: + daq: "DA03" format: per_electron - dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/t" - index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/trainId" + dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/t" + index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/trainId" max_hits: 10000 - delayStage: - format: per_train - dataset_key: "/CONTROL/SCS_ILH_LAS/MDL/OPTICALDELAY_PP800/actualPosition/value" - index_key: "/INDEX/trainId" +# delayStage: +# format: per_train +# dataset_key: "/CONTROL/SCS_ILH_LAS/MDL/OPTICALDELAY_PP800/actualPosition/value" +# index_key: "/INDEX/trainId" + test: + daq: DA02 + format: per_pulse + dataset_key: "/INSTRUMENT/SA3_XTD10_XGM/XGM/DOOCS:output/data/intensitySa3TD" + index_key: "/INSTRUMENT/SA3_XTD10_XGM/XGM/DOOCS:output/data/trainId" stream_name_prefixes: DA03: "RAW-R" From 61b8364922511bd0fe4598e3414ad6601e06c062 Mon Sep 17 00:00:00 2001 From: rettigl Date: Sat, 6 Apr 2024 23:07:55 +0200 Subject: [PATCH 02/10] add quickfix for buggy daq: take pulseId mod. 
num_pulses --- sed/loader/sxp/loader.py | 6 +++++- tutorial/sxp_config.yaml | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/sed/loader/sxp/loader.py b/sed/loader/sxp/loader.py index 30a3f40b..6c86887f 100644 --- a/sed/loader/sxp/loader.py +++ b/sed/loader/sxp/loader.py @@ -256,6 +256,7 @@ def create_multi_index_per_electron(self, h5_file: h5py.File) -> None: for i in train_id.index: # removing broken trailing hit copies num_trains = self._config["dataframe"].get("num_trains", 0) + num_pulses = self._config["dataframe"].get("num_pulses", 0) if num_trains: try: num_valid_hits = np.where(np.diff(mib_array[i].astype(np.int32)) < 0)[0][ @@ -270,7 +271,10 @@ def create_multi_index_per_electron(self, h5_file: h5py.File) -> None: index = 0 for train, train_end in enumerate(train_ends): macrobunch_index.append(train_id[i] + np.uint(train)) - microbunch_ids.append(mib_array[i, index:train_end]) + if num_pulses: + microbunch_ids.append(mib_array[i, index:train_end] % num_pulses) + else: + microbunch_ids.append(mib_array[i, index:train_end]) indices.append(slice(index, train_end)) index = train_end + 1 macrobunch_indices.append(indices) diff --git a/tutorial/sxp_config.yaml b/tutorial/sxp_config.yaml index dc855f47..2eb6e2de 100644 --- a/tutorial/sxp_config.yaml +++ b/tutorial/sxp_config.yaml @@ -17,6 +17,7 @@ dataframe: daq: DA03 forward_fill_iterations: 2 num_trains: 1 + num_pulses: 400 x_column: dldPosX corrected_x_column: "X" kx_column: "kx" From eea0b5854b036508d564ab220423d70cd9a074a2 Mon Sep 17 00:00:00 2001 From: rettigl Date: Fri, 18 Oct 2024 00:00:24 +0200 Subject: [PATCH 03/10] use main DAQ setting as default --- sed/loader/sxp/loader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sed/loader/sxp/loader.py b/sed/loader/sxp/loader.py index 6c86887f..f66ff90e 100644 --- a/sed/loader/sxp/loader.py +++ b/sed/loader/sxp/loader.py @@ -535,9 +535,10 @@ def create_dataframe_per_channel( """ channel_dict = 
self._config["dataframe"]["channels"][channel] # channel parameters - daq = self._config["dataframe"]["channels"][channel].get("daq", "DA03") + main_daq = self._config["dataframe"]["daq"] + channel_daq = self._config["dataframe"]["channels"][channel].get("daq", main_daq) # load file corresponding to daq - h5_file = h5py.File(Path(str(file_path).replace("DA03", daq))) + h5_file = h5py.File(Path(str(file_path).replace(main_daq, channel_daq))) [train_id, np_array] = self.create_numpy_array_per_channel( h5_file, From 5dab55e5e725ed3e8b7cf4fc57c3f9e0bc050532 Mon Sep 17 00:00:00 2001 From: rettigl Date: Fri, 18 Oct 2024 00:01:16 +0200 Subject: [PATCH 04/10] change config back for tutorial data --- .cspell/custom-dictionary.txt | 1 + tutorial/sxp_config.yaml | 48 +++++++++++++++-------------------- 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/.cspell/custom-dictionary.txt b/.cspell/custom-dictionary.txt index 128efab7..8dbebcb2 100644 --- a/.cspell/custom-dictionary.txt +++ b/.cspell/custom-dictionary.txt @@ -84,6 +84,7 @@ dfpart dfpid dictionarized dictmerge +DOOCS dpkg dropna dset diff --git a/tutorial/sxp_config.yaml b/tutorial/sxp_config.yaml index 2eb6e2de..922c8ef6 100644 --- a/tutorial/sxp_config.yaml +++ b/tutorial/sxp_config.yaml @@ -16,8 +16,8 @@ dataframe: ubid_offset: 0 daq: DA03 forward_fill_iterations: 2 - num_trains: 1 - num_pulses: 400 + num_trains: 10 + # num_pulses: 400 # only needed for data from new DAQ x_column: dldPosX corrected_x_column: "X" kx_column: "kx" @@ -52,50 +52,44 @@ dataframe: channels: timeStamp: - daq: "DA03" format: per_train dataset_key: "/INDEX/timestamp" index_key: "/INDEX/trainId" scale: 1000000000 pulseId: - daq: "DA03" format: per_electron - dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/starterCounter" - index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/trainId" + dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/starterCounter" + index_key: 
"/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/trainId" max_hits: 10000 trainId: - daq: "DA03" format: per_electron - dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/masterCounter" - index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/trainId" + dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/masterCounter" + index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/trainId" max_hits: 10000 dldPosX: - daq: "DA03" format: per_electron - dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/x" - index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/trainId" + dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/x" + index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/trainId" max_hits: 10000 dldPosY: - daq: "DA03" format: per_electron - dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/y" - index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/trainId" + dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/y" + index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/trainId" max_hits: 10000 dldTimeSteps: - daq: "DA03" format: per_electron - dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/t" - index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_CPP_TEST:daqOutput/daqOutput/trainId" + dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/t" + index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/trainId" max_hits: 10000 -# delayStage: -# format: per_train -# dataset_key: "/CONTROL/SCS_ILH_LAS/MDL/OPTICALDELAY_PP800/actualPosition/value" -# index_key: "/INDEX/trainId" - test: - daq: DA02 - format: per_pulse - dataset_key: "/INSTRUMENT/SA3_XTD10_XGM/XGM/DOOCS:output/data/intensitySa3TD" - index_key: "/INSTRUMENT/SA3_XTD10_XGM/XGM/DOOCS:output/data/trainId" + delayStage: + format: per_train + dataset_key: 
"/CONTROL/SCS_ILH_LAS/MDL/OPTICALDELAY_PP800/actualPosition/value" + index_key: "/INDEX/trainId" +# test: +# daq: DA02 # change DAQ for a channel +# format: per_pulse +# dataset_key: "/INSTRUMENT/SA3_XTD10_XGM/XGM/DOOCS:output/data/intensitySa3TD" +# index_key: "/INSTRUMENT/SA3_XTD10_XGM/XGM/DOOCS:output/data/trainId" stream_name_prefixes: DA03: "RAW-R" From ad37fa975c8c7d43a708be7aaaacdf01e1751c25 Mon Sep 17 00:00:00 2001 From: rettigl Date: Fri, 18 Oct 2024 00:17:03 +0200 Subject: [PATCH 05/10] move sxp_config to config --- {tutorial => sed/config}/sxp_config.yaml | 0 tutorial/5_sxp_workflow.ipynb | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename {tutorial => sed/config}/sxp_config.yaml (100%) diff --git a/tutorial/sxp_config.yaml b/sed/config/sxp_config.yaml similarity index 100% rename from tutorial/sxp_config.yaml rename to sed/config/sxp_config.yaml diff --git a/tutorial/5_sxp_workflow.ipynb b/tutorial/5_sxp_workflow.ipynb index f7a89220..baf8b585 100644 --- a/tutorial/5_sxp_workflow.ipynb +++ b/tutorial/5_sxp_workflow.ipynb @@ -32,7 +32,7 @@ }, "outputs": [], "source": [ - "local_path = Path(sed.__file__).parent.parent / \"tutorial/\"\n", + "local_path = Path(sed.__file__).parent / \"config/\"\n", "config_file = local_path / \"sxp_config.yaml\"\n", "assert config_file.exists()" ] @@ -394,7 +394,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.9.19" } }, "nbformat": 4, From 3ca66a37e59f3ad4ae4cc6d94c394e531ac6535f Mon Sep 17 00:00:00 2001 From: rettigl Date: Fri, 18 Oct 2024 23:03:19 +0200 Subject: [PATCH 06/10] add SXP example dataset and update notebook --- .cspell/custom-dictionary.txt | 1 + ...xp_config.yaml => sxp_example_config.yaml} | 11 + sed/dataset/datasets.json | 3 + tutorial/5_sxp_workflow.ipynb | 272 ++++++++++++++---- 4 files changed, 235 insertions(+), 52 deletions(-) rename sed/config/{sxp_config.yaml => sxp_example_config.yaml} (86%) diff --git 
a/.cspell/custom-dictionary.txt b/.cspell/custom-dictionary.txt index 8dbebcb2..747cf180 100644 --- a/.cspell/custom-dictionary.txt +++ b/.cspell/custom-dictionary.txt @@ -382,6 +382,7 @@ xarray xaxis xcirc xdata +XFEL xind Xinstrument xlabel diff --git a/sed/config/sxp_config.yaml b/sed/config/sxp_example_config.yaml similarity index 86% rename from sed/config/sxp_config.yaml rename to sed/config/sxp_example_config.yaml index 922c8ef6..716afc9b 100644 --- a/sed/config/sxp_config.yaml +++ b/sed/config/sxp_example_config.yaml @@ -28,6 +28,7 @@ dataframe: tof_ns_column: dldTime corrected_tof_column: "tm" bias_column: "sampleBias" + delay_column: "delayStage" tof_binwidth: 6.875E-12 # in seconds tof_binning: 0 jitter_cols: ["dldPosX", "dldPosY", "dldTimeSteps"] @@ -98,3 +99,13 @@ dataframe: beamtime_dir: sxp: "/gpfs/exfel/exp/SXP/" + +histogram: + # number of bins used for histogram visualization + bins: [80, 80, 80, 80] + # default axes to use for histogram visualization. + # Axes names starting with "@" refer to keys in the "dataframe" section + axes: ["@x_column", "@y_column", "@tof_column", "@delay_column"] + # default ranges to use for histogram visualization (in unbinned detector coordinates) + ranges: [[0, 4000], [0, 4000], [1000, 28000], [-1000, 1000]] + diff --git a/sed/dataset/datasets.json b/sed/dataset/datasets.json index 3a2c1289..47bb6708 100644 --- a/sed/dataset/datasets.json +++ b/sed/dataset/datasets.json @@ -21,6 +21,9 @@ "energycal_2020_07_20" ] }, + "Au_Mica": { + "url": "https://cloud.fhi-berlin.mpg.de:8443/dl/fiTjjP8DcLUXN84myzTXFHjt/sxp_example_data.zip" + }, "Test": { "url": "http://test.com/files/file.zip", "subdirs": [ diff --git a/tutorial/5_sxp_workflow.ipynb b/tutorial/5_sxp_workflow.ipynb index baf8b585..341a607c 100644 --- a/tutorial/5_sxp_workflow.ipynb +++ b/tutorial/5_sxp_workflow.ipynb @@ -1,5 +1,20 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tutorial for binning data from the SXP instrument at 
the European XFEL" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparation\n", + "### Import necessary libraries" + ] + }, { "cell_type": "code", "execution_count": null, @@ -10,16 +25,28 @@ }, "outputs": [], "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", "from pathlib import Path\n", + "import os\n", + "import xarray as xr\n", + "import numpy as np\n", + "\n", + "from sed import SedProcessor\n", + "from sed.dataset import dataset\n", "\n", - "#%matplotlib inline\n", "%matplotlib widget\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import os\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get data paths\n", + "The paths are such that if you are on Maxwell, it uses those. Otherwise data is downloaded in current directory from Zenodo.\n", "\n", - "import sed\n", - "from sed import SedProcessor" + "Generally, if it is your beamtime, you can both read the raw data and write to processed directory. However, for the public data, you can not write to processed directory." 
] }, { @@ -32,16 +59,23 @@ }, "outputs": [], "source": [ - "local_path = Path(sed.__file__).parent / \"config/\"\n", - "config_file = local_path / \"sxp_config.yaml\"\n", - "assert config_file.exists()" + "beamtime_dir = \"/gpfs/exfel/exp/SXP/202302/p004316/\" # on Maxwell\n", + "if os.path.exists(beamtime_dir) and os.access(beamtime_dir, os.R_OK):\n", + " path = beamtime_dir + \"/raw/\"\n", + " buffer_path = \"Au_Mica/processed/\"\n", + "else:\n", + " # data_path can be defined and used to store the data in a specific location\n", + " dataset.get(\"Au_Mica\") # Put in Path to a storage of at least 10 GByte free space.\n", + " path = dataset.dir\n", + " buffer_path = path + \"/processed/\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Loading Data" + "### Config setup\n", + "Here we get the path to the config file and setup the relevant directories. This can also be done directly in the config file." ] }, { @@ -50,36 +84,57 @@ "metadata": {}, "outputs": [], "source": [ - "config = {\n", + "# pick the default configuration file for SXP@XFEL\n", + "config_file = Path('../sed/config/sxp_example_config.yaml')\n", + "assert config_file.exists()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# here we setup a dictionary that will be used to override the path configuration\n", + "config_override = {\n", " \"core\": {\n", " \"paths\": {\n", - " \"data_raw_dir\": \"/gpfs/exfel/exp/SXP/202302/p004316/raw/\",\n", - " \"data_parquet_dir\": os.path.expanduser(\"~\") + \"/sxp_parquet/\",\n", - " }\n", - " }\n", - "}\n", - "sp = SedProcessor(\n", - " runs=[\"0058\", \"0059\", \"0060\", \"0061\"],\n", - " config=config,\n", - " user_config=config_file,\n", - " system_config={},\n", - " collect_metadata=False,\n", - ")\n", - "sp.add_jitter()" + " \"data_raw_dir\": path,\n", + " \"data_parquet_dir\": buffer_path,\n", + " },\n", + " },\n", + "}" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ - "## Inspect dataframe" + "### cleanup previous config files\n", + "In this notebook, we will show how calibration parameters can be generated. Therefore we want to clean the local directory of previously generated files.\n", + "\n", + "**WARNING** running the cell below will delete the \"sed_config.yaml\" file in the local directory. If these contain precious calibration parameters, **DO NOT RUN THIS CELL**." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "local_folder_config = Path('./sed_config.yaml')\n", + "if local_folder_config.exists():\n", + " os.remove(local_folder_config)\n", + " print(f'deleted local config file {local_folder_config}')\n", + "assert not local_folder_config.exists()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Train IDs in scans " + "## Load Au/Mica data\n", + "Now we load a couple of scans from Au 4f core levels. Data will be processed to parquet format first, if not existing yet, and then loaded into the processor." ] }, { @@ -88,17 +143,20 @@ "metadata": {}, "outputs": [], "source": [ - "plt.figure()\n", - "ids=sp.dataframe.trainId.compute().values\n", - "plt.plot(ids)\n", - "plt.show()" + "sp = SedProcessor(\n", + " runs=[\"0058\", \"0059\", \"0060\", \"0061\"],\n", + " config=config_override,\n", + " system_config=config_file,\n", + " collect_metadata=False,\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Dataframe columns" + "## Inspect the dataframe\n", + "We first try to get an overview of the structure of the data. 
For this, we look at the loaded dataframe:" ] }, { @@ -114,7 +172,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Histograms" + "### Train IDs in scans \n", + "Next, let's look at the trainIDs contained in these runs" ] }, { @@ -123,10 +182,27 @@ "metadata": {}, "outputs": [], "source": [ - "axes = [\"dldPosX\", \"dldPosY\", \"dldTimeSteps\", \"delayStage\"]\n", - "bins = [100, 100, 100, 100]\n", - "ranges = [(0, 4000), (0, 4000), (1000, 28000), (-1000, 1000)]\n", - "sp.view_event_histogram(dfpid=3, axes=axes, bins=bins, ranges=ranges)" + "plt.figure()\n", + "ids=sp.dataframe.trainId.compute().values\n", + "plt.plot(ids)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Channel Histograms\n", + "Let's look at the single histograms of the main data channels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sp.view_event_histogram(dfpid=3)" ] }, { @@ -134,7 +210,7 @@ "metadata": {}, "source": [ "### PulseIds, ElectronIds\n", - "More hits at later microbunches" + "To get a better understanding of the structure of the data, lets look at the histograms of microbunches and electrons. We see that we have more hits at later microbunches, and only few multi-hits." ] }, { @@ -153,7 +229,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Spectrum vs. MicrobunchId" + "## Spectrum vs. MicrobunchId\n", + "Let's check the TOF spectrum as function of microbunch ID, to understand if the increasing hit probability has any influence on the spectrum." ] }, { @@ -174,8 +251,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Last bunch contains unusually many events \n", - "garbage events, filter away" + "### Filter events\n", + "We see that the last microbunch has unusually many hits. 
These are DAQ artifacts, so we filter them away" ] }, { @@ -187,6 +264,13 @@ "sp.filter_column(\"pulseId\", max_value=756)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now the histogram looks clean" + ] + }, { "cell_type": "code", "execution_count": null, @@ -205,7 +289,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Background changes with microbunchID / Intensity" + "We also see that the background below the Au 4f core levels slightly changes with microbunch ID. The origin of this is not quite clear yet." ] }, { @@ -224,7 +308,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Energy Calibration" + "## Energy Calibration\n", + "We now load a bias series, where the sample bias was varied, effectively shifting the energy spectra. This allows us to calibrate the conversion between the digital values of the dld and the energy." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### time-of-flight spectrum\n", + "to compare with what we see on the measurement computer, we might want to plot the time-of-flight spectrum. This is done here." 
] }, { @@ -236,6 +329,13 @@ "sp.append_tof_ns_axis()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, to determine proper binning ranges, let's have again a look at the event histograms:" + ] + }, { "cell_type": "code", "execution_count": null, @@ -248,13 +348,20 @@ "sp.view_event_histogram(dfpid=1, axes=axes, bins=bins, ranges=ranges)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load energy calibration files\n", + "We now load a range of runs sequentially, that were recorded with different sample bias values, and load them afterwards into an xarray" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "import xarray as xr\n", "runs = [\"0074\", \"0073\", \"0072\", \"0071\", \"0070\", \"0064\", \"0065\", \"0066\", \"0067\", \"0068\", \"0069\"]\n", "biases = np.arange(962, 951, -1)\n", "data = []\n", @@ -270,6 +377,14 @@ "biasSeries = xr.concat(data, dim=xr.DataArray(biases, dims=\"sampleBias\", name=\"sampleBias\"))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load bias series\n", + "Now we load the bias series xarray into the processor for calibration" + ] + }, { "cell_type": "code", "execution_count": null, @@ -279,6 +394,14 @@ "sp.load_bias_series(binned_data=biasSeries)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### find calibration parameters\n", + "We now will fit the tof-energy relation. This is done by finding the maxima of a peak in the tof spectrum, and then fitting the square root relation to obtain the calibration parameters. 
" + ] + }, { "cell_type": "code", "execution_count": null, @@ -308,12 +431,11 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "sp.save_energy_calibration()" + "### Save calibration\n", + "Now we save the calibration parameters into a local configuration file, that will be loaded in the next step" ] }, { @@ -322,14 +444,15 @@ "metadata": {}, "outputs": [], "source": [ - "sp.append_energy_axis()" + "sp.save_energy_calibration()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "W4f core levels, Energy vs. Delay" + "## Bin data with energy axis\n", + "Now that we have the calibration parameters, we can generate the energy axis for our dataset. We need to load it again, and apply the calibration" ] }, { @@ -344,6 +467,13 @@ "sp.append_energy_axis()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we can bin as function fo energy and delay stage position" + ] + }, { "cell_type": "code", "execution_count": null, @@ -362,12 +492,50 @@ "metadata": {}, "outputs": [], "source": [ - "res_sub = res - res.loc[{\"delayStage\": slice(-135, -133)}].mean(axis=1)\n", + "fig, axs = plt.subplots(1, 1, figsize=(4, 3), constrained_layout=True)\n", + "res.plot(ax=axs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Correct delay stage offset.\n", + "We can also offset the zero delay of the delay stage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sp.add_delay_offset(constant=126.9)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "axes = ['energy', \"delayStage\"]\n", + "bins = [200, 100]\n", + "ranges = [[-37,-31], [-8, 8]]\n", + "res = sp.compute(bins=bins, axes=axes, ranges=ranges, normalize_to_acquisition_time=\"delayStage\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "res_sub = res - res.loc[{\"delayStage\": slice(-8, -1)}].mean(axis=1)\n", "fig, axs = plt.subplots(3, 1, figsize=(4, 8), constrained_layout=True)\n", "res.plot(ax=axs[0])\n", "res_sub.plot(ax=axs[1])\n", - "res_sub.loc[{\"energy\":slice(-32.5,-32)}].sum(axis=0).plot(ax=axs[2])\n", - "plt.title(\"W4f core levels, energy vs. delayStage\")" + "res_sub.loc[{\"energy\":slice(-32.5,-32)}].sum(axis=0).plot(ax=axs[2])" ] }, { From e69e081f1278038a724b0b964d819f7ffe1e4ec7 Mon Sep 17 00:00:00 2001 From: rettigl Date: Fri, 18 Oct 2024 23:12:10 +0200 Subject: [PATCH 07/10] add sxp tutorial to documentation --- .github/workflows/documentation.yml | 4 +-- docs/index.rst | 1 + docs/scripts/build_sxp_parquets.py | 45 +++++++++++++++++++++++++++++ docs/scripts/download_data.py | 1 + sed/config/sxp_example_config.yaml | 1 - 5 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 docs/scripts/build_sxp_parquets.py diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 6c417d67..744a6d70 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -63,7 +63,6 @@ jobs: run: | cp -r $GITHUB_WORKSPACE/tutorial $GITHUB_WORKSPACE/docs/ cp -r $GITHUB_WORKSPACE/sed/config $GITHUB_WORKSPACE/docs/sed - rm $GITHUB_WORKSPACE/docs/tutorial/5_sxp_workflow.ipynb # To be included later # - name: Cache docs build @@ -79,10 +78,11 @@ jobs: cd $GITHUB_WORKSPACE/docs poetry run python scripts/download_data.py - - name: build Flash parquet files + - name: build parquet files run: | cd $GITHUB_WORKSPACE/docs poetry run python scripts/build_flash_parquets.py + poetry run python scripts/build_sxp_parquets.py - name: build Sphinx docs run: poetry run sphinx-build -b html $GITHUB_WORKSPACE/docs $GITHUB_WORKSPACE/_build diff --git a/docs/index.rst b/docs/index.rst index b61dcb31..7f51f4c8 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,6 +10,7 @@ Single-Event DataFrame 
(SED) documentation tutorial/2_conversion_pipeline_for_example_time-resolved_ARPES_data tutorial/3_metadata_collection_and_export_to_NeXus tutorial/4_hextof_workflow.ipynb + tutorial/5_sxp_workflow.ipynb tutorial/6_binning_with_time-stamped_data tutorial/7_correcting_orthorhombic_symmetry tutorial/8_jittering_tutorial diff --git a/docs/scripts/build_sxp_parquets.py b/docs/scripts/build_sxp_parquets.py new file mode 100644 index 00000000..bc1e8f97 --- /dev/null +++ b/docs/scripts/build_sxp_parquets.py @@ -0,0 +1,45 @@ +from pathlib import Path + +import sed +from sed import SedProcessor +from sed.dataset import dataset + +config_file = Path(sed.__file__).parent / "config/sxp_example_config.yaml" + +dataset.get("Au_Mica", root_dir="./tutorial") +data_path = dataset.dir + + +config_override = { + "core": { + "paths": { + "data_raw_dir": data_path, + "data_parquet_dir": data_path + "/processed/", + }, + }, +} + +runs = [ + "0058", + "0059", + "0060", + "0061", + "0074", + "0073", + "0072", + "0071", + "0070", + "0064", + "0065", + "0066", + "0067", + "0068", + "0069", +] +for run in runs: + sp = SedProcessor( + runs=run, + config=config_override, + system_config=config_file, + collect_metadata=False, + ) diff --git a/docs/scripts/download_data.py b/docs/scripts/download_data.py index 97b2a63c..cf1e042e 100644 --- a/docs/scripts/download_data.py +++ b/docs/scripts/download_data.py @@ -5,3 +5,4 @@ dataset.get("WSe2", remove_zip=True, root_dir=root_dir) dataset.get("Gd_W110", remove_zip=True, root_dir=root_dir) dataset.get("TaS2", remove_zip=True, root_dir=root_dir) +dataset.get("Au_Mica", remove_zip=True, root_dir=root_dir) diff --git a/sed/config/sxp_example_config.yaml b/sed/config/sxp_example_config.yaml index 716afc9b..3c7e0f90 100644 --- a/sed/config/sxp_example_config.yaml +++ b/sed/config/sxp_example_config.yaml @@ -108,4 +108,3 @@ histogram: axes: ["@x_column", "@y_column", "@tof_column", "@delay_column"] # default ranges to use for histogram visualization (in 
unbinned detector coordinates) ranges: [[0, 4000], [0, 4000], [1000, 28000], [-1000, 1000]] - From 39669d7b3f6c0e1115595ebdfd4d33cdf80de676 Mon Sep 17 00:00:00 2001 From: rettigl Date: Tue, 22 Oct 2024 19:45:50 +0200 Subject: [PATCH 08/10] update dataset URL --- sed/dataset/datasets.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sed/dataset/datasets.json b/sed/dataset/datasets.json index 47bb6708..914213a5 100644 --- a/sed/dataset/datasets.json +++ b/sed/dataset/datasets.json @@ -22,7 +22,7 @@ ] }, "Au_Mica": { - "url": "https://cloud.fhi-berlin.mpg.de:8443/dl/fiTjjP8DcLUXN84myzTXFHjt/sxp_example_data.zip" + "url": "https://zenodo.org/records/13952965/files/Au_Mica_SXP.zip" }, "Test": { "url": "http://test.com/files/file.zip", From e2b7c3124f68da4709af74945bbe62b0821b4155 Mon Sep 17 00:00:00 2001 From: rettigl Date: Mon, 11 Nov 2024 17:36:22 +0100 Subject: [PATCH 09/10] add review changes --- docs/scripts/build_sxp_parquets.py | 10 ++++---- tutorial/5_sxp_workflow.ipynb | 37 ++++++++---------------------- 2 files changed, 15 insertions(+), 32 deletions(-) diff --git a/docs/scripts/build_sxp_parquets.py b/docs/scripts/build_sxp_parquets.py index bc1e8f97..dd870148 100644 --- a/docs/scripts/build_sxp_parquets.py +++ b/docs/scripts/build_sxp_parquets.py @@ -24,17 +24,17 @@ "0059", "0060", "0061", - "0074", - "0073", - "0072", - "0071", - "0070", "0064", "0065", "0066", "0067", "0068", "0069", + "0070", + "0071", + "0072", + "0073", + "0074", ] for run in runs: sp = SedProcessor( diff --git a/tutorial/5_sxp_workflow.ipynb b/tutorial/5_sxp_workflow.ipynb index 341a607c..fb2a2f12 100644 --- a/tutorial/5_sxp_workflow.ipynb +++ b/tutorial/5_sxp_workflow.ipynb @@ -148,6 +148,7 @@ " config=config_override,\n", " system_config=config_file,\n", " collect_metadata=False,\n", + " verbose=True,\n", ")" ] }, @@ -229,8 +230,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Spectrum vs. 
MicrobunchId\n", - "Let's check the TOF spectrum as function of microbunch ID, to understand if the increasing hit probability has any influence on the spectrum." + "We can also inspect the counts per train as a function of the trainId and the pulseId, which gives us a good idea about the evolution of the count rate over the run(s)" ] }, { @@ -239,36 +239,20 @@ "metadata": {}, "outputs": [], "source": [ - "axes = [\"dldTimeSteps\", \"pulseId\"]\n", - "bins = [200, 800]\n", - "ranges = [(8000, 28000), (0, 800)]\n", - "res = sp.compute(bins=bins, axes=axes, ranges=ranges)\n", "plt.figure()\n", - "res.plot(robust=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Filter events\n", - "We see that the last microbunch has unusually many hits. These are DAQ artifacts, so we filter them away" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sp.filter_column(\"pulseId\", max_value=756)" + "axes = [\"trainId\", \"pulseId\"]\n", + "bins = [100, 100]\n", + "ranges = [(ids.min()+1, ids.max()), (0, 800)]\n", + "res = sp.compute(bins=bins, axes=axes, ranges=ranges)\n", + "res.plot()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now the histogram looks clean" + "## Spectrum vs. MicrobunchId\n", + "Let's check the TOF spectrum as a function of microbunch ID, to understand if the increasing hit probability has any influence on the spectrum." ] }, { @@ -289,7 +273,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We also see that the background below the Au 4f core levels slightly changes with microbunch ID. The origin of this is not quite clear yet." + "We see that the background below the Au 4f core levels slightly changes with microbunch ID. The origin of this is not quite clear yet."
] }, { @@ -372,7 +356,6 @@ " ranges = [(1000, 25000)]\n", " res = sp.compute(bins=bins, axes=axes, ranges=ranges)\n", " data.append(res)\n", - " res.plot()\n", "\n", "biasSeries = xr.concat(data, dim=xr.DataArray(biases, dims=\"sampleBias\", name=\"sampleBias\"))" ] From 52a06c16a997362ffb80bcd3210fb9520a9264eb Mon Sep 17 00:00:00 2001 From: rettigl Date: Mon, 11 Nov 2024 18:17:00 +0100 Subject: [PATCH 10/10] update lockfile --- poetry.lock | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/poetry.lock b/poetry.lock index fda5d737..f63c7c04 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1531,22 +1531,22 @@ arrow = ">=0.15.0" [[package]] name = "jedi" -version = "0.19.1" +version = "0.19.2" description = "An autocompletion tool for Python that can be used for text editors." optional = false python-versions = ">=3.6" files = [ - {file = "jedi-0.19.1-py2.py3-none-any.whl", hash = "sha256:e983c654fe5c02867aef4cdfce5a2fbb4a50adc0af145f70504238f18ef5e7e0"}, - {file = "jedi-0.19.1.tar.gz", hash = "sha256:cf0496f3651bc65d7174ac1b7d043eff454892c708a87d1b683e57b569927ffd"}, + {file = "jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9"}, + {file = "jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0"}, ] [package.dependencies] -parso = ">=0.8.3,<0.9.0" +parso = ">=0.8.4,<0.9.0" [package.extras] docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alabaster (==0.7.12)", "babel (==2.9.1)", "chardet (==4.0.0)", "commonmark (==0.8.1)", "docutils (==0.17.1)", "future (==0.18.2)", "idna (==2.10)", "imagesize (==1.2.0)", "mock (==1.0.1)", "packaging (==20.9)", "pyparsing (==2.4.7)", "pytz (==2021.1)", "readthedocs-sphinx-ext (==2.1.4)", "recommonmark (==0.5.0)", "requests (==2.25.1)", "six (==1.15.0)", "snowballstemmer (==2.1.0)", "sphinx (==1.8.5)", "sphinx-rtd-theme (==0.4.3)", "sphinxcontrib-serializinghtml 
(==1.1.4)", "sphinxcontrib-websupport (==1.2.4)", "urllib3 (==1.26.4)"] qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"] -testing = ["Django", "attrs", "colorama", "docopt", "pytest (<7.0.0)"] +testing = ["Django", "attrs", "colorama", "docopt", "pytest (<9.0.0)"] [[package]] name = "jinja2" @@ -1578,15 +1578,18 @@ files = [ [[package]] name = "json5" -version = "0.9.25" +version = "0.9.27" description = "A Python implementation of the JSON5 data format." optional = true -python-versions = ">=3.8" +python-versions = ">=3.8.0" files = [ - {file = "json5-0.9.25-py3-none-any.whl", hash = "sha256:34ed7d834b1341a86987ed52f3f76cd8ee184394906b6e22a1e0deb9ab294e8f"}, - {file = "json5-0.9.25.tar.gz", hash = "sha256:548e41b9be043f9426776f05df8635a00fe06104ea51ed24b67f908856e151ae"}, + {file = "json5-0.9.27-py3-none-any.whl", hash = "sha256:17b43d78d3a6daeca4d7030e9bf22092dba29b1282cc2d0cfa56f6febee8dc93"}, + {file = "json5-0.9.27.tar.gz", hash = "sha256:5a19de4a6ca24ba664dc7d50307eb73ba9a16dea5d6bde85677ae85d3ed2d8e0"}, ] +[package.extras] +dev = ["build (==1.2.1)", "coverage (==7.5.3)", "mypy (==1.10.0)", "pip (==24.1)", "pylint (==3.2.3)", "ruff (==0.5.1)", "twine (==5.1.1)", "uv (==0.2.13)"] + [[package]] name = "jsonpointer" version = "3.0.0"