-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbuildpanexv1_compsep.py
106 lines (82 loc) · 3.84 KB
/
buildpanexv1_compsep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python
# Get a listing of the files in each dataset
# see get-panexv1-file-lists.sh
import json
from pytablewriter import MarkdownTableWriter
# NERSC CMB Globus Guest Collection
# ENDPOINT is inserted as the `origin_id` of the Globus file-manager link
# written on each dataset page.
ENDPOINT = "53b2a147-ae9d-4bbf-9d18-3b46d133d4bb"
# DOMAIN is the host prefix of the https://<DOMAIN>.data.globus.org URLs used
# for the per-file and manifest download links.
DOMAIN = "g-0a470a.6b7bd8.0ec8"
# FOLDER is the first path component of those URLs; each dataset name is
# appended to it as the next component.
FOLDER = "panexp_v1_compsep"
# RELEASE_NAME prefixes the input JSON listings that are read and every
# markdown file that is written.
RELEASE_NAME = "panexv1-compsep"
# Dataset names to process, as a list of whitespace-separated tokens.
dsets = "freefree_f1 ame_a1 galactic_foregrounds_d1s1 ame_a2 galactic_foregrounds_highcomplexity cib_cib1 galactic_foregrounds_lowcomplexity cmb_c3 galactic_foregrounds_mediumcomplexity cmb_c4 ksz_ksz1 co_co1 co_co3 synchrotron_s1 dust_d1 synchrotron_s4 dust_d10 synchrotron_s5 dust_d12 synchrotron_s7 dust_d9 tsz_tsz1".split()
# from https://stackoverflow.com/questions/1094841/get-human-readable-version-of-file-size
def sizeof_fmt(num, suffix="B"):
    """Format a size as a human-readable string using binary (1024) prefixes.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    known prefixes run out, then rendered with one decimal place, e.g.
    ``1536`` -> ``"1.5 KiB"``.

    Args:
        num: The size to format (typically a byte count).
        suffix: Unit symbol appended after the prefix.

    Returns:
        The formatted string, such as ``"0.0 B"`` or ``"5.0 GiB"``. Values of
        1024**8 and above are expressed in ``Yi`` units.
    """
    prefixes = ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi")
    step = 1024.0
    idx = 0
    # Written as a negated "<" (not ">=") so that a value which fails the
    # comparison either way, such as NaN, keeps being scaled to the last unit.
    while idx < len(prefixes) and not abs(num) < step:
        num /= step
        idx += 1
    if idx == len(prefixes):
        # Ran past "Zi": whatever is left is expressed in yobi-units.
        return f"{num:.1f} Yi{suffix}"
    return f"{num:3.1f} {prefixes[idx]}{suffix}"
def get_fileinfo(fname):
    """Pull the frequency and pixelization tokens out of a file name.

    Everything from the first ``.`` onward is discarded and the remainder is
    split on ``_``. The fifth field is returned as the frequency and the last
    field as the pixelization.

    Args:
        fname: File name made of underscore-separated fields.

    Returns:
        A ``(freq, pixelization)`` tuple of strings.

    Raises:
        IndexError: If the name has fewer than five underscore-separated
            fields before the first dot.
    """
    stem, _, _ = fname.partition(".")
    tokens = stem.split("_")
    return tokens[4], tokens[-1]
def write_dataset(dset, n_files, data_size, file_table_rows):
    """Write the markdown page for one dataset.

    The page is saved as ``{RELEASE_NAME}-{dset}.md`` in the current
    directory. It consists of a front-matter header, links back to the
    release page and to the data on Globus, a short "Files" summary, and a
    markdown table with one row per file.

    Args:
        dset: Dataset name; used in the output file name, the page title and
            description, and the Globus URLs.
        n_files: Number of files, shown in the "Files" summary.
        data_size: Total dataset size, already formatted for display.
        file_table_rows: Rows of the per-file table; each row supplies the
            four cells file name, frequency band, pixelization and size.
    """
    columns = ["File Name", "Frequency Band (GHz)", "Pixelization", "Size"]
    table = MarkdownTableWriter(
        headers=columns, value_matrix=file_table_rows, margin=1
    )

    # Title-friendly form of the dataset name: underscores become spaces.
    dsettext = " ".join(dset.split("_"))

    # NOTE(review): the Globus web-interface link in the template below has
    # empty link text (`[](...)`) - confirm that this is intended.
    page = f"""---
title: "Dataset: PanEx V1 Skies WMAP {dsettext}"
author: "Simons Observatory Collaboration"
description: "PanEx V1 Skies WMAP {dset}"
date_created: "2025-03-18"
seo:
type: Dataset
---
[Back to release](./{RELEASE_NAME}.html#datasets)
See [data access](./{RELEASE_NAME}.html#data-access) on the Data Release page.
Access the data through the Globus web interface: [](https://app.globus.org/file-manager?origin_id={ENDPOINT}&origin_path=%2F{FOLDER}%2F{dset}%2F)
Download the [file manifest](https://{DOMAIN}.data.globus.org/{FOLDER}/{dset}/manifest.json) for the exact file sizes and checksums.
## Files
- Number of files: {n_files}
- Total size: {data_size}
- [JSON format file manifest](https://{DOMAIN}.data.globus.org/{FOLDER}/{dset}/manifest.json)
"""

    out_path = f"{RELEASE_NAME}-{dset}.md"
    with open(out_path, "w") as out:
        # Header text first, then the per-file table directly after it.
        out.write(page)
        out.write(table.dumps())
# Driver: build one markdown page per dataset (via write_dataset) and a
# release-level summary table with one row per dataset.
dsets_table_header = ["Link", "Dataset", "Number of Files", "Total Size"]
dsets_table_data = []

for dset in dsets:
    # Load the file listing for this dataset; the entries are under "DATA".
    # JSON text is UTF-8 by specification, so do not rely on the locale.
    with open(f"{RELEASE_NAME}-{dset}.json", encoding="utf-8") as f:
        file_data = json.load(f)

    # Drop the manifest BEFORE counting. Counting first and subtracting one
    # is only right when the listing holds exactly one manifest.json; it
    # under-reports by one whenever the manifest is absent.
    file_list = [
        entry for entry in file_data["DATA"] if entry["name"] != "manifest.json"
    ]
    n_files = len(file_list)

    # Order rows by numeric frequency so that e.g. 93 sorts before 145.
    file_list.sort(key=lambda entry: int(get_fileinfo(entry["name"])[0]))

    # One table row per file, totalling the dataset size along the way.
    dset_table_data = []
    total_bytes = 0
    for file_entry in file_list:
        fname = file_entry["name"]
        total_bytes += file_entry["size"]
        fsize = sizeof_fmt(file_entry["size"])
        (freq, nside) = get_fileinfo(fname)
        flink = (
            f"[`{fname}`](https://{DOMAIN}.data.globus.org/{FOLDER}/{dset}/{fname})"
        )
        dset_table_data.append([flink, freq, nside, fsize])

    dset_size = sizeof_fmt(total_bytes)
    write_dataset(dset, n_files, dset_size, dset_table_data)

    # Summary row linking to the page that was just written for this dataset.
    dset_url = f"[Link]({RELEASE_NAME}-{dset}.html)"
    dsets_table_data.append([dset_url, f"{dset}", f"`{n_files}`", dset_size])

# Write the release-level table listing every dataset.
writer = MarkdownTableWriter(
    headers=dsets_table_header, value_matrix=dsets_table_data, margin=1
)
with open(RELEASE_NAME + "-dset-table.md", "w") as f:
    f.write(writer.dumps())