|
| 1 | +import os |
| 2 | +from functools import partial |
| 3 | + |
| 4 | +os.environ['JAX_PLATFORMS'] = 'cpu' |
| 5 | +os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION'] = '1.0' |
| 6 | +os.environ["XLA_FLAGS"] = f"--xla_force_host_platform_device_count={8}" |
| 7 | + |
| 8 | +from jax._src.partition_spec import PartitionSpec |
| 9 | +from jax.experimental.shard_map import shard_map |
| 10 | + |
| 11 | +from dsa2000_common.common.jax_utils import create_mesh |
| 12 | +from dsa2000_common.common.jvp_linear_op import JVPLinearOp |
| 13 | +from dsa2000_common.common.logging import dsa_logger |
| 14 | +from dsa2000_common.common.mixed_precision_utils import mp_policy |
| 15 | +import itertools |
| 16 | +import time |
| 17 | +from typing import Dict, Any |
| 18 | + |
| 19 | +import jax |
| 20 | +import numpy as np |
| 21 | + |
| 22 | +from dsa2000_cal.ops.residuals import compute_residual_TBC |
| 23 | + |
| 24 | + |
| 25 | +def prepare_data(D: int, Ts, Tm, Cs, Cm) -> Dict[str, Any]: |
| 26 | + num_antennas = 2048 |
| 27 | + baseline_pairs = np.asarray(list(itertools.combinations(range(num_antennas), 2)), |
| 28 | + dtype=np.int32) |
| 29 | + antenna1 = baseline_pairs[:, 0] |
| 30 | + antenna2 = baseline_pairs[:, 1] # [B] |
| 31 | + |
| 32 | + sort_idxs = np.lexsort((antenna1, antenna2)) |
| 33 | + antenna1 = antenna1[sort_idxs] |
| 34 | + antenna2 = antenna2[sort_idxs] |
| 35 | + |
| 36 | + B = antenna1.shape[0] |
| 37 | + vis_model = np.zeros((D, Tm, B, Cm, 2, 2), dtype=mp_policy.vis_dtype) |
| 38 | + vis_data = np.zeros((Ts, B, Cs, 2, 2), dtype=mp_policy.vis_dtype) |
| 39 | + gains = np.zeros((D, Tm, num_antennas, Cm, 2, 2), dtype=mp_policy.gain_dtype) |
| 40 | + return dict( |
| 41 | + vis_model=vis_model, |
| 42 | + vis_data=vis_data, |
| 43 | + gains=gains, |
| 44 | + antenna1=antenna1, |
| 45 | + antenna2=antenna2 |
| 46 | + ) |
| 47 | + |
| 48 | + |
| 49 | +def entry_point(data): |
| 50 | + vis_model = data['vis_model'] |
| 51 | + vis_data = data['vis_data'] |
| 52 | + gains = data['gains'] |
| 53 | + antenna1 = data['antenna1'] |
| 54 | + antenna2 = data['antenna2'] |
| 55 | + |
| 56 | + def fn(gains): |
| 57 | + res = compute_residual_TBC(vis_model=vis_model, vis_data=vis_data, gains=gains, |
| 58 | + antenna1=antenna1, antenna2=antenna2) |
| 59 | + return res |
| 60 | + |
| 61 | + J_bare = JVPLinearOp(fn, linearize=False) |
| 62 | + J = J_bare(gains) |
| 63 | + R = fn(gains) |
| 64 | + g = J.matvec(R, adjoint=True) |
| 65 | + return J.matvec(J.matvec(g), adjoint=True) |
| 66 | + |
| 67 | + |
| 68 | +def build_sharded_entry_point(devices): |
| 69 | + mesh = create_mesh((len(devices),), ('B',), devices) |
| 70 | + P = PartitionSpec |
| 71 | + in_specs = dict( |
| 72 | + vis_model=P(None, None, 'B'), |
| 73 | + vis_data=P(None, 'B'), |
| 74 | + gains=P(), |
| 75 | + antenna1=P('B'), |
| 76 | + antenna2=P('B') |
| 77 | + ) |
| 78 | + out_specs = P('B') |
| 79 | + |
| 80 | + @partial(shard_map, mesh=mesh, in_specs=(in_specs,), out_specs=out_specs) |
| 81 | + def entry_point_sharded(local_data): |
| 82 | + return entry_point(local_data) # [ Tm, _B, Cm, 2, 2] |
| 83 | + |
| 84 | + return entry_point_sharded, mesh |
| 85 | + |
| 86 | + |
| 87 | +def main(): |
| 88 | + cpus = jax.devices("cpu") |
| 89 | + # gpus = jax.devices("cuda") |
| 90 | + cpu = cpus[0] |
| 91 | + # gpu = gpus[0] |
| 92 | + |
| 93 | + entry_point_jit = jax.jit(entry_point) |
| 94 | + sharded_entry_point, mesh = build_sharded_entry_point(cpus) |
| 95 | + sharded_entry_point_jit = jax.jit(sharded_entry_point) |
| 96 | + # Run benchmarking over number of calibration directions |
| 97 | + time_array = [] |
| 98 | + shard_time_array = [] |
| 99 | + d_array = [] |
| 100 | + for D in range(1, 9): |
| 101 | + data = prepare_data(D, Ts=1, Tm=1, Cs=1, Cm=1) |
| 102 | + with jax.default_device(cpu): |
| 103 | + data = jax.device_put(data) |
| 104 | + entry_point_jit_compiled = entry_point_jit.lower(data).compile() |
| 105 | + t0 = time.time() |
| 106 | + for _ in range(3): |
| 107 | + jax.block_until_ready(entry_point_jit_compiled(data)) |
| 108 | + t1 = time.time() |
| 109 | + dt = (t1 - t0) / 3 |
| 110 | + dsa_logger.info(f"TBC: Residual: CPU D={D}: {dt}") |
| 111 | + time_array.append(dt) |
| 112 | + d_array.append(D) |
| 113 | + |
| 114 | + sharded_entry_point_jit_compiled = sharded_entry_point_jit.lower(data).compile() |
| 115 | + t0 = time.time() |
| 116 | + for _ in range(3): |
| 117 | + jax.block_until_ready(sharded_entry_point_jit_compiled(data)) |
| 118 | + t1 = time.time() |
| 119 | + dt = (t1 - t0) / 3 |
| 120 | + dsa_logger.info(f"TBC: Residual (sharded): CPU D={D}: {dt}") |
| 121 | + shard_time_array.append(dt) |
| 122 | + # |
| 123 | + # data = prepare_data(D, Ts=4, Tm=1, Cs=4, Cm=1) |
| 124 | + # with jax.default_device(cpu): |
| 125 | + # data = jax.device_put(data) |
| 126 | + # entry_point_jit_compiled = entry_point_jit.lower(data).compile() |
| 127 | + # t0 = time.time() |
| 128 | + # jax.block_until_ready(entry_point_jit_compiled(data)) |
| 129 | + # t1 = time.time() |
| 130 | + # dsa_logger.info(f"TBC: Subtract (per-GPU): CPU D={D}: {t1 - t0}") |
| 131 | + # |
| 132 | + # sharded_entry_point_jit_compiled = sharded_entry_point_jit.lower(data).compile() |
| 133 | + # t0 = time.time() |
| 134 | + # for _ in range(1): |
| 135 | + # jax.block_until_ready(sharded_entry_point_jit_compiled(data)) |
| 136 | + # t1 = time.time() |
| 137 | + # dt = (t1 - t0) / 1 |
| 138 | + # dsa_logger.info(f"TBC: Subtract (per-GPU sharded): CPU D={D}: {dt}") |
| 139 | + # |
| 140 | + # data = prepare_data(D, Ts=4, Tm=1, Cs=40, Cm=1) |
| 141 | + # with jax.default_device(cpu): |
| 142 | + # data = jax.device_put(data) |
| 143 | + # entry_point_jit_compiled = entry_point_jit.lower(data).compile() |
| 144 | + # t0 = time.time() |
| 145 | + # jax.block_until_ready(entry_point_jit_compiled(data)) |
| 146 | + # t1 = time.time() |
| 147 | + # dsa_logger.info(f"TBC: Subtract (all-GPU): CPU D={D}: {t1 - t0}") |
| 148 | + # |
| 149 | + # sharded_entry_point_jit_compiled = sharded_entry_point_jit.lower(data).compile() |
| 150 | + # t0 = time.time() |
| 151 | + # for _ in range(1): |
| 152 | + # jax.block_until_ready(sharded_entry_point_jit_compiled(data)) |
| 153 | + # t1 = time.time() |
| 154 | + # dt = (t1 - t0) / 1 |
| 155 | + # dsa_logger.info(f"TBC: Subtract (all-GPU sharded): CPU D={D}: {dt}") |
| 156 | + |
| 157 | + # Fit line to data using scipy |
| 158 | + time_array = np.array(time_array) |
| 159 | + d_array = np.array(d_array) |
| 160 | + from scipy.optimize import curve_fit |
| 161 | + |
| 162 | + popt, pcov = curve_fit(lambda x, a, b: a * x ** b, d_array, time_array) |
| 163 | + dsa_logger.info(f"TBC: Fit: {popt}") |
| 164 | + |
| 165 | + shard_time_array = np.array(shard_time_array) |
| 166 | + |
| 167 | + popt, pcov = curve_fit(lambda x, a, b: a * x ** b, d_array, shard_time_array) |
| 168 | + dsa_logger.info(f"TBC: Fit (sharded): {popt}") |
| 169 | + |
| 170 | + |
| 171 | +if __name__ == '__main__': |
| 172 | + main() |
0 commit comments