Skip to content

Commit 627e53f

Browse files
feat: add function that returns cif filenames based on chemical formula
1 parent a51b553 commit 627e53f

File tree

3 files changed

+121
-0
lines changed

3 files changed

+121
-0
lines changed

news/get-cif.rst

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
**Added:**
2+
3+
* ``fetch_cif_filenames`` function that returns a list of CIF filenames from the Crystallography Open Database (COD) for a given chemical formula in Hill notation.
4+
5+
**Changed:**
6+
7+
* <news item>
8+
9+
**Deprecated:**
10+
11+
* <news item>
12+
13+
**Removed:**
14+
15+
* <news item>
16+
17+
**Fixed:**
18+
19+
* <news item>
20+
21+
**Security:**
22+
23+
* <news item>

src/diffpy/utils/tools.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from pathlib import Path
55

66
import numpy as np
7+
import requests
8+
from bs4 import BeautifulSoup
79
from scipy.optimize import dual_annealing
810
from scipy.signal import convolve
911
from xraydb import material_mu
@@ -214,6 +216,61 @@ def get_package_info(package_names, metadata=None):
214216
return metadata
215217

216218

219+
def fetch_cif_filenames(hill_formula, timeout=30):
    """Fetch CIF filenames from the Crystallography Open Database (COD).

    The COD search page is queried for the given chemical formula in Hill
    notation, where elements are separated by whitespace and the count of 1
    is omitted (e.g., "Cl Na"). Every link on the result page that ends in
    ``.cif`` contributes its final path component to the returned list.

    Parameters
    ----------
    hill_formula : str
        The chemical formula in Hill notation.
    timeout : float or tuple, optional
        The timeout, in seconds, passed to ``requests.get`` so that an
        unresponsive server cannot block the caller forever. Default is 30.

    Returns
    -------
    list of str
        A list of CIF filenames (e.g., ["1000041.cif", "2104025.cif"]).

    Raises
    ------
    Exception
        If the COD search page responds with an HTTP status code
        other than 200.
    ValueError
        If no CIF files are found for the given formula.

    Notes
    -----
    Connection errors and timeouts raised by ``requests`` are not caught
    here and propagate to the caller.

    The data is retrieved from the Crystallography Open Database (COD).
    If you use COD data in your research,
    please acknowledge the COD project as described at
    https://www.crystallography.net/cod/acknowledgements.html.
    """
    # Pass the formula through ``params`` so that requests URL-encodes it;
    # interpolating it into the URL would break on characters such as
    # "&", "#" or "+".
    response = requests.get(
        "https://www.crystallography.net/cod/result.php",
        params={"formula": hill_formula},
        timeout=timeout,
    )
    if response.status_code != 200:
        raise Exception(
            f"Failed to retrieve search results. "
            f"HTTP status code: {response.status_code}."
        )
    soup = BeautifulSoup(response.text, "html.parser")
    # Anchors without an href yield "", which never ends with ".cif".
    hrefs = (link.get("href", "") for link in soup.find_all("a"))
    cif_filenames = [
        href.split("/")[-1] for href in hrefs if href.endswith(".cif")
    ]
    if not cif_filenames:
        raise ValueError(
            f"No CIF files found for the given formula: {hill_formula}. "
            "Please ensure it's in Hill notation (e.g., 'Cl Na'). "
            "You can use ``to_hill_notation`` for conversion. "
            "If the formula is correct, it is possible that "
            "no CIF files are available for this formula in the COD."
        )
    return cif_filenames
272+
273+
217274
def get_density_from_cloud(sample_composition, mp_token=""):
218275
"""Function to get material density from the MP or COD database.
219276

tests/test_tools.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import importlib.metadata
22
import json
33
import os
4+
import re
45
from pathlib import Path
56

67
import numpy as np
@@ -11,6 +12,7 @@
1112
check_and_build_global_config,
1213
compute_mu_using_xraydb,
1314
compute_mud,
15+
fetch_cif_filenames,
1416
get_package_info,
1517
get_user_info,
1618
)
@@ -270,6 +272,45 @@ def test_get_package_info(monkeypatch, inputs, expected):
270272
assert actual_metadata == expected
271273

272274

275+
def test_fetch_cif_filenames():
    """Check that the COD search for "Cl Na" returns the known CIF files.

    The comparison is order-independent: both lists are sorted first.
    """
    # NOTE(review): this calls the live COD server, so it needs network
    # access and the expected list must track the database contents.
    actual_cif_filenames = fetch_cif_filenames("Cl Na")
    expected_cif_filenames = [
        "1000041.cif",
        "2104025.cif",
        "2108652.cif",
        "2311042.cif",
        "4300180.cif",
        "4320809.cif",
        "7132177.cif",
        "9000629.cif",
        "9003308.cif",
        "9003309.cif",
        "9003310.cif",
        "9003311.cif",
        "9003312.cif",
        "9003313.cif",
        "9003314.cif",
        "9006369.cif",
        "9006370.cif",
        "9006371.cif",
        "9006372.cif",
        "9006373.cif",
    ]
    # This must be an ``assert``: returning the comparison result is ignored
    # by pytest, so the test would pass regardless of the outcome.
    assert sorted(actual_cif_filenames) == sorted(expected_cif_filenames)
300+
301+
302+
def test_fetch_cif_filenames_bad():
    """Check that a formula with no COD matches raises a ValueError.

    "NaCl" is not in Hill notation, so the search is expected to find no
    CIF files and to raise with the full explanatory message.
    """
    # Escape the literal message because it contains regex metacharacters
    # (parentheses, periods) and ``match`` is applied as a regex search.
    message_pattern = re.escape(
        "No CIF files found for the given formula: NaCl. "
        "Please ensure it's in Hill notation (e.g., 'Cl Na'). "
        "You can use ``to_hill_notation`` for conversion. "
        "If the formula is correct, it is possible that "
        "no CIF files are available for this formula in the COD."
    )
    with pytest.raises(ValueError, match=message_pattern):
        fetch_cif_filenames("NaCl")
312+
313+
273314
@pytest.mark.parametrize(
274315
"inputs",
275316
[

0 commit comments

Comments
 (0)