From 4500e2398e58d2d6e3d27a3d71d2503948279df5 Mon Sep 17 00:00:00 2001
From: Sander Vandenhaute
Date: Fri, 20 Dec 2024 14:26:58 +0100
Subject: [PATCH] make cp2k interface more robust towards large input files

---
Review notes (placed after the "---" marker, so they are not part of the
commit message):

* Reference.compute(): the isinstance chain has no final `else`, so an
  argument matching none of the branches leaves `dataset` unassigned and
  the following compute_dataset(...) line fails on the unbound name
  (unless an outer type check rejects the argument first). Suggest a
  closing `else: raise TypeError(...)`.
* cp2k_singlepoint_pre(): inputs[0].filepath is interpolated into the
  `cp` command unquoted; a path containing whitespace or shell
  metacharacters would break the command. Suggest shlex.quote().
* _prepare_input(): assigns into cp2k_input_dict["force_eval"]["print"],
  i.e. it modifies the dict it was handed (wrapped_app_pre passes
  self.cp2k_input_dict). Confirm this in-place change is intended.
* The directory created by `mktemp -d` is not removed by the command
  string built here; confirm cleanup happens elsewhere.

 psiflow/reference/_cp2k.py     | 64 +++++++++++++++++++++++----------
 psiflow/reference/reference.py | 12 ++++++++++-
 tests/test_reference.py        | 12 +++++++
 3 files changed, 67 insertions(+), 21 deletions(-)

diff --git a/psiflow/reference/_cp2k.py b/psiflow/reference/_cp2k.py
index 4c128d4..cd84130 100644
--- a/psiflow/reference/_cp2k.py
+++ b/psiflow/reference/_cp2k.py
@@ -139,15 +139,11 @@ def parse_cp2k_output(
     return geometry
 
 
-# typeguarding for some reason incompatible with WQ
-def cp2k_singlepoint_pre(
+def _prepare_input(
     geometry: Geometry,
     cp2k_input_dict: dict = {},
     properties: tuple = (),
-    cp2k_command: str = "",
-    stdout: str = "",
-    stderr: str = "",
-    parsl_resource_specification: Optional[dict] = None,
+    outputs: list = [],
 ):
     from psiflow.reference._cp2k import (
         dict_to_str,
@@ -160,18 +156,33 @@ def cp2k_singlepoint_pre(
     if "forces" in properties:
         cp2k_input_dict["force_eval"]["print"] = {"FORCES": {}}
     cp2k_input_str = dict_to_str(cp2k_input_dict)
+    with open(outputs[0], 'w') as f:
+        f.write(cp2k_input_str)
+
+
+prepare_input = python_app(_prepare_input, executors=['default_threads'])
+
+
+# typeguarding for some reason incompatible with WQ
+def cp2k_singlepoint_pre(
+    cp2k_command: str = "",
+    stdout: str = "",
+    stderr: str = "",
+    inputs: list = [],
+    parsl_resource_specification: Optional[dict] = None,
+):
+    tmp_command = 'mytmpdir=$(mktemp -d 2>/dev/null || mktemp -d -t "mytmpdir")'
+    cd_command = "cd $mytmpdir"
+    cp_command = "cp {} cp2k.inp".format(inputs[0].filepath)
 
-    # see https://unix.stackexchange.com/questions/30091/fix-or-alternative-for-mktemp-in-os-x
-    tmp_command = 'mytmpdir=$(mktemp -d 2>/dev/null || mktemp -d -t "mytmpdir");'
-    cd_command = "cd $mytmpdir;"
-    write_command = 'echo "{}" > cp2k.inp;'.format(cp2k_input_str)
     command_list = [
         tmp_command,
         cd_command,
-        write_command,
-        cp2k_command,
+        cp_command,
+        cp2k_command
     ]
-    return " ".join(command_list)
+
+    return ' && '.join(command_list)
 
 
 @typeguard.typechecked
@@ -222,13 +233,26 @@ def _create_apps(self):
         app_pre = bash_app(cp2k_singlepoint_pre, executors=[self.executor])
         app_post = python_app(cp2k_singlepoint_post, executors=["default_threads"])
 
-        self.app_pre = partial(
-            app_pre,
-            cp2k_input_dict=self.cp2k_input_dict,
-            properties=tuple(self.outputs),
-            cp2k_command=cp2k_command,
-            parsl_resource_specification=wq_resources,
-        )
+        # create wrapped pre app which first parses the input file and writes it to
+        # disk, then call the actual bash app with the input file as a DataFuture dependency
+        # This is necessary because for very large structures, the size of the cp2k input
+        # file is too long to pass as an argument in a command line
+        def wrapped_app_pre(geometry, stdout: str, stderr: str):
+            future = prepare_input(
+                geometry,
+                cp2k_input_dict=self.cp2k_input_dict,
+                properties=tuple(self.outputs),
+                outputs=[psiflow.context().new_file('cp2k_', '.inp')],
+            )
+            return app_pre(
+                cp2k_command=cp2k_command,
+                stdout=stdout,
+                stderr=stderr,
+                inputs=[future.outputs[0]],
+                parsl_resource_specification=wq_resources,
+            )
+
+        self.app_pre = wrapped_app_pre
         self.app_post = partial(
             app_post,
             properties=tuple(self.outputs),
diff --git a/psiflow/reference/reference.py b/psiflow/reference/reference.py
index 8479bba..8776faf 100644
--- a/psiflow/reference/reference.py
+++ b/psiflow/reference/reference.py
@@ -105,7 +105,17 @@ class Reference(Computable):
     outputs: tuple
     batch_size: ClassVar[int] = 1  # not really used
 
-    def compute(self, dataset: Dataset, *outputs: Optional[Union[str, tuple]]):
+    def compute(
+        self,
+        arg: Union[Dataset, Geometry, AppFuture, list],
+        *outputs: Optional[Union[str, tuple]],
+    ):
+        if isinstance(arg, Dataset):
+            dataset = arg
+        elif isinstance(arg, list):
+            dataset = Dataset(arg)
+        elif isinstance(arg, AppFuture) or isinstance(arg, Geometry):
+            dataset = Dataset([arg])
         compute_outputs = compute_dataset(dataset, dataset.length(), self)
         if len(outputs) == 0:
             outputs_ = tuple(self.outputs)
diff --git a/tests/test_reference.py b/tests/test_reference.py
index 29b5eb9..2e1c162 100644
--- a/tests/test_reference.py
+++ b/tests/test_reference.py
@@ -330,6 +330,18 @@ def test_cp2k_failure(context, tmp_path):
     assert "ABORT" in log  # verify error is captured
 
 
+def test_cp2k_memory(context, simple_cp2k_input):
+    reference = CP2K(simple_cp2k_input)
+    geometry = Geometry.from_data(
+        numbers=np.ones(4000),
+        positions=np.random.uniform(0, 20, size=(4000, 3)),
+        cell=20 * np.eye(3),  # box way too large
+    )
+    energy, forces = reference.compute(geometry)
+    energy, forces = energy.result(), forces.result()
+    assert np.all(np.isnan(energy))
+
+
 @pytest.mark.filterwarnings("ignore:Original input file not found")
 def test_cp2k_timeout(context, simple_cp2k_input):
     reference = CP2K(simple_cp2k_input)