bring auto_checker into alignment with file_calibration
includes modifications to and reordering of xengine_diff_class, auto_rfi_class, and auto_shape_class
jsdillon committed May 27, 2024
1 parent 2e3b071 commit 23bff6b
Showing 2 changed files with 138 additions and 37 deletions.
2 changes: 1 addition & 1 deletion notebooks/file_calibration.ipynb
@@ -7,7 +7,7 @@
"source": [
"# Single File Calibration\n",
"\n",
"**by Josh Dillon, Aaron Parsons, Tyler Cox, and Zachary Martinot**, last updated May 25, 2024\n",
"**by Josh Dillon, Aaron Parsons, Tyler Cox, and Zachary Martinot**, last updated May 27, 2024\n",
"\n",
"This notebook is designed to infer as much information about the array from a single file, including pushing the calibration and RFI mitigation as far as possible. Calibration includes redundant-baseline calibration, RFI-based calibration of delay slopes, model-based calibration of overall amplitudes, and a full per-frequency phase gradient absolute calibration if abscal model files are available.\n",
"\n",
173 changes: 137 additions & 36 deletions notebooks/full_day_auto_checker.ipynb
@@ -7,7 +7,7 @@
"source": [
"# Full-Day Autocorrelation Checking\n",
"\n",
"**by Josh Dillon and Steven Murray**, last updated March 14, 2024\n",
"**by Josh Dillon and Steven Murray**, last updated May 27, 2024\n",
"\n",
"This notebook is designed to assess per-day data quality from just autocorrelations, enabling a quick assessment of whether the day is worth pushing through further analysis. In particular, it is designed to look for times that are particularly contaminated by broadband RFI (e.g. from lightning), picking out fraction of days worth analyzing. It's output is a an a priori flag yaml file readable by `hera_qm.metrics_io` functions `read_a_priori_chan_flags()`, `read_a_priori_int_flags()`, and `read_a_priori_ant_flags()`.\n",
"\n",
@@ -79,7 +79,7 @@
"source": [
"# get filenames\n",
"AUTO_FILE = os.environ.get(\"AUTO_FILE\", None)\n",
"# AUTO_FILE = '/lustre/aoc/projects/hera/h6c-analysis/IDR2/2459869/zen.2459869.25299.sum.autos.uvh5'\n",
"# AUTO_FILE = '/mnt/sn1/data1/2460457/zen.2460457.16886.sum.autos.uvh5'\n",
"\n",
"SUM_AUTOS_SUFFIX = os.environ.get(\"SUM_AUTOS_SUFFIX\", 'sum.autos.uvh5')\n",
"DIFF_AUTOS_SUFFIX = os.environ.get(\"DIFF_AUTOS_SUFFIX\", 'diff.autos.uvh5')\n",
@@ -137,8 +137,8 @@
"auto_slope_suspect = (float(os.environ.get(\"AUTO_SLOPE_SUSPECT_LOW\", -0.6)), float(os.environ.get(\"AUTO_SLOPE_SUSPECT_HIGH\", 0.6)))\n",
"\n",
"# bounds on autocorrelation RFI\n",
"auto_rfi_good = (0, float(os.environ.get(\"AUTO_RFI_GOOD\", 0.015)))\n",
"auto_rfi_suspect = (0, float(os.environ.get(\"AUTO_RFI_SUSPECT\", 0.03)))\n",
"auto_rfi_good = (0, float(os.environ.get(\"AUTO_RFI_GOOD\", 1.5)))\n",
"auto_rfi_suspect = (0, float(os.environ.get(\"AUTO_RFI_SUSPECT\", 2)))\n",
"\n",
"# bounds on autocorrelation shape\n",
"auto_shape_good = (0, float(os.environ.get(\"AUTO_SHAPE_GOOD\", 0.1)))\n",
@@ -190,6 +190,59 @@
"cache = {}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d0665db6",
"metadata": {},
"outputs": [],
"source": [
"def auto_bl_zscores(data, flag_array, cache={}):\n",
" '''This function computes z-score arrays for each delay-filtered autocorrelation, normalized by the expected noise. \n",
" Flagged times/channels for the whole array are given 0 weight in filtering and are np.nan in the z-score.'''\n",
" zscores = {}\n",
" for bl in auto_bls:\n",
" wgts = np.array(np.logical_not(flag_array), dtype=np.float64)\n",
" model, _, _ = dspec.fourier_filter(hd.freqs, data[bl], wgts, filter_centers=[0], filter_half_widths=[RFI_DPSS_HALFWIDTH], mode='dpss_solve',\n",
" suppression_factors=[1e-9], eigenval_cutoff=[1e-9], cache=cache)\n",
" res = data[bl] - model\n",
" int_time = 24 * 3600 * np.median(np.diff(data.times))\n",
" chan_res = np.median(np.diff(data.freqs))\n",
" int_count = int(int_time * chan_res)\n",
" sigma = np.abs(model) / np.sqrt(int_count / 2)\n",
" zscores[bl] = res / sigma \n",
" zscores[bl][flag_array] = np.nan\n",
"\n",
" return zscores"
]
},
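`auto_bl_zscores` above normalizes the residual after DPSS filtering by an expected noise level of |model| / sqrt(int_count / 2), where int_count is the integration time in seconds times the channel width in Hz. A self-contained toy check of that normalization with pure numpy (all numbers illustrative, and the DPSS filter is replaced by the known smooth model):

    import numpy as np

    rng = np.random.default_rng(0)
    int_count = int(9.66 * 122070)          # ~10 s integrations x ~122 kHz channels (illustrative)
    model = 1e6 * np.ones((60, 1536))       # stand-in for a smooth, filtered autocorrelation
    sigma = model / np.sqrt(int_count / 2)  # same noise model as in auto_bl_zscores
    data = model + rng.normal(scale=sigma)  # noise-like simulated autocorrelation
    zscores = (data - model) / sigma
    print(np.nanmean(zscores**2)**0.5)      # ~1 for noise-like data; RFI pushes this RMS up

The RMS of these z-scores is the same per-antenna statistic that the classification loop below compares against auto_rfi_good and auto_rfi_suspect.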
{
"cell_type": "code",
"execution_count": null,
"id": "1dd19643",
"metadata": {},
"outputs": [],
"source": [
"def rfi_from_avg_autos(data, auto_bls_to_use, prior_flags=None, nsig=RFI_NSIG):\n",
" '''Average together all baselines in auto_bls_to_use, then find an RFI mask by looking for outliers after DPSS filtering.'''\n",
" \n",
" # Compute int_count for all unflagged autocorrelations averaged together\n",
" int_time = 24 * 3600 * np.median(np.diff(data.times_by_bl[auto_bls[0][0:2]]))\n",
" chan_res = np.median(np.diff(data.freqs))\n",
" int_count = int(int_time * chan_res) * len(auto_bls_to_use)\n",
" avg_auto = {(-1, -1, 'ee'): np.mean([data[bl] for bl in auto_bls_to_use], axis=0)}\n",
" \n",
" # Flag RFI first with channel differences and then with DPSS\n",
" antenna_flags, _ = xrfi.flag_autos(avg_auto, int_count=int_count, nsig=(RFI_NSIG * 5))\n",
" if prior_flags is not None:\n",
" antenna_flags[(-1, -1, 'ee')] = prior_flags\n",
" _, rfi_flags = xrfi.flag_autos(avg_auto, int_count=int_count, flag_method='dpss_flagger',\n",
" flags=antenna_flags, freqs=data.freqs, filter_centers=[0],\n",
" filter_half_widths=[RFI_DPSS_HALFWIDTH], eigenval_cutoff=[1e-9], nsig=nsig)\n",
"\n",
" return rfi_flags"
]
},
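`rfi_from_avg_autos` gains sensitivity by averaging all the autocorrelations in auto_bls_to_use before flagging: int_count is multiplied by the number of averaged baselines, so the expected noise on the averaged spectrum drops roughly as 1/sqrt(N) and fainter RFI stands out. A small numpy sketch of that scaling (numbers arbitrary):

    import numpy as np

    rng = np.random.default_rng(1)
    n_autos, n_chans = 100, 8192
    autos = 1.0 + rng.normal(scale=0.01, size=(n_autos, n_chans))  # noise-like autos with 1% scatter
    print(np.std(autos[0]))                # ~0.01 for a single autocorrelation
    print(np.std(np.mean(autos, axis=0)))  # ~0.001 after averaging 100 of them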
{
"cell_type": "code",
"execution_count": null,
@@ -202,47 +255,95 @@
" hd_sum = io.HERADataFastReader(auto_sum_file)\n",
" sum_autos, _, _ = hd_sum.read(read_flags=False, read_nsamples=False)\n",
" hd_diff = io.HERADataFastReader(auto_diff_file)\n",
" diff_autos, _, _ = hd_diff.read(read_flags=False, read_nsamples=False) \n",
" diff_autos, _, _ = hd_diff.read(read_flags=False, read_nsamples=False, dtype=np.complex64, fix_autos_func=np.real)\n",
" ants = sorted(set([ant for bl in hd_sum.bls for ant in utils.split_bl(bl)]))\n",
" \n",
" # check for zeros in the evens or odds\n",
" zeros_class = ant_class.even_odd_zeros_checker(sum_autos, diff_autos, good=good_zeros_per_eo_spectrum, suspect=suspect_zeros_per_eo_spectrum)\n",
"\n",
" # check for problems in auto power or slope\n",
" auto_power_class = ant_class.auto_power_checker(sum_autos, good=auto_power_good, suspect=auto_power_suspect)\n",
" auto_slope_class = ant_class.auto_slope_checker(sum_autos, good=auto_slope_good, suspect=auto_slope_suspect, edge_cut=100, filt_size=17)\n",
" auto_rfi_class = ant_class.auto_rfi_checker(sum_autos, good=auto_rfi_good, suspect=auto_rfi_suspect, \n",
" filter_half_widths=[RFI_DPSS_HALFWIDTH], nsig=RFI_NSIG, cache=cache)\n",
" auto_class = auto_power_class + auto_slope_class + auto_rfi_class \n",
" auto_slope_class = ant_class.auto_slope_checker(sum_autos, good=auto_slope_good, suspect=auto_slope_suspect, edge_cut=100, filt_size=17) \n",
" \n",
" overall_class = zeros_class + auto_power_class + auto_slope_class\n",
" if len(overall_class.good_ants) + len(overall_class.suspect_ants) > 0:\n",
" return overall_class, np.ones((len(sum_autos.times), len(sum_autos.freqs)), dtype=bool)\n",
" \n",
" final_class = zeros_class + auto_class\n",
" # find initial set of flags\n",
" antenna_flags, array_flags = xrfi.flag_autos(sum_autos, flag_method=\"channel_diff_flagger\", nsig=RFI_NSIG * 5, \n",
" antenna_class=overall_class, flag_broadcast_thresh=.5)\n",
" for key in antenna_flags:\n",
" antenna_flags[key] = array_flags\n",
" cache = {}\n",
" _, array_flags = xrfi.flag_autos(sum_autos, freqs=sum_autos.freqs, flag_method=\"dpss_flagger\",\n",
" nsig=RFI_NSIG, antenna_class=overall_class,\n",
" filter_centers=[0], filter_half_widths=[RFI_DPSS_HALFWIDTH],\n",
" eigenval_cutoff=[1e-9], flags=antenna_flags, mode='dpss_matrix', \n",
" cache=cache, flag_broadcast_thresh=.5) \n",
"\n",
" # check for non-noiselike x-engine diffs\n",
" xengine_diff_class = ant_class.non_noiselike_diff_by_xengine_checker(sum_autos, diff_autos, flag_waterfall=array_flags, \n",
" antenna_class=overall_class, \n",
" xengine_chans=96, bad_xengine_zcut=bad_xengine_zcut)\n",
"\n",
" # update overall_class and return if all antennas are bad\n",
" overall_class += xengine_diff_class\n",
" if len(overall_class.good_ants) + len(overall_class.suspect_ants) > 0:\n",
" return overall_class, np.ones((len(sum_autos.times), len(sum_autos.freqs)), dtype=bool)\n",
" \n",
" if len(final_class.good_ants) + len(final_class.suspect_ants) > 0:\n",
" # Compute int_count for all unflagged autocorrelations averaged together\n",
" int_time = 24 * 3600 * np.median(np.diff(sum_autos.times_by_bl[hd_sum.bls[0][0:2]]))\n",
" chan_res = np.median(np.diff(sum_autos.freqs))\n",
" int_count = int(int_time * chan_res) * (len(final_class.good_ants) + len(final_class.suspect_ants))\n",
" \n",
" avg_auto = {(-1, -1, 'ee'): np.mean([sum_autos[bl] for bl in hd_sum.bls if final_class[utils.split_bl(bl)[0]] != 'bad'], axis=0)}\n",
" \n",
" # Flag RFI first with channel differences and then with DPSS\n",
" antenna_flags, _ = xrfi.flag_autos(avg_auto, int_count=int_count, nsig=(RFI_NSIG * 5))\n",
" _, rfi_flags = xrfi.flag_autos(avg_auto, int_count=int_count, flag_method='dpss_flagger',\n",
" flags=antenna_flags, freqs=sum_autos.freqs, filter_centers=[0],\n",
" filter_half_widths=[RFI_DPSS_HALFWIDTH], eigenval_cutoff=[1e-9], nsig=RFI_NSIG)\n",
" \n",
" # Classify on auto shapes\n",
" # Iteratively develop RFI mask, excess RFI classification, and autocorrelation shape classification\n",
" stage = 1\n",
" rfi_flags = np.array(array_flags)\n",
" prior_end_states = set()\n",
" while True:\n",
" # compute DPSS-filtered z-scores with current array-wide RFI mask\n",
" zscores = auto_bl_zscores(sum_autos, rfi_flags)\n",
" rms = {bl: np.nanmean(zscores[bl]**2)**.5 if np.any(np.isfinite(zscores[bl])) else np.inf for bl in zscores}\n",
"\n",
" # figure out which autos to use for finding new set of flags\n",
" candidate_autos = [bl for bl in auto_bls if overall_class[utils.split_bl(bl)[0]] != 'bad']\n",
" if stage == 1:\n",
" # use best half of the unflagged antennas\n",
" med_rms = np.nanmedian([rms[bl] for bl in candidate_autos])\n",
" autos_to_use = [bl for bl in candidate_autos if rms[bl] <= med_rms]\n",
" elif stage == 2:\n",
" # use all unflagged antennas which are auto RFI good, or the best half, whichever is larger\n",
" med_rms = np.nanmedian([rms[bl] for bl in candidate_autos])\n",
" best_half_autos = [bl for bl in candidate_autos if rms[bl] <= med_rms]\n",
" good_autos = [bl for bl in candidate_autos if (overall_class[utils.split_bl(bl)[0]] != 'bad')\n",
" and (auto_rfi_class[utils.split_bl(bl)[0]] == 'good')]\n",
" autos_to_use = (best_half_autos if len(best_half_autos) > len(good_autos) else good_autos)\n",
" elif stage == 3:\n",
" # use all unflagged antennas which are auto RFI good or suspect\n",
" autos_to_use = [bl for bl in candidate_autos if (overall_class[utils.split_bl(bl)[0]] != 'bad')]\n",
"\n",
" # compute new RFI flags\n",
" rfi_flags = rfi_from_avg_autos(sum_autos, autos_to_use)\n",
"\n",
" # perform auto shape and RFI classification\n",
" overall_class = auto_power_class + auto_slope_class + zeros_class + xengine_diff_class\n",
" auto_rfi_class = ant_class.antenna_bounds_checker(rms, good=auto_rfi_good, suspect=auto_rfi_suspect, bad=(0, np.inf))\n",
" overall_class += auto_rfi_class\n",
" auto_shape_class = ant_class.auto_shape_checker(sum_autos, good=auto_shape_good, suspect=auto_shape_suspect,\n",
" flag_spectrum=np.sum(rfi_flags, axis=0).astype(bool), antenna_class=final_class)\n",
" final_class += auto_shape_class\n",
" \n",
" # Classify on excess power in X-engine diffs\n",
" xengine_diff_class = ant_class.non_noiselike_diff_by_xengine_checker(sum_autos, diff_autos, flag_waterfall=rfi_flags, \n",
" antenna_class=final_class, \n",
" xengine_chans=96, bad_xengine_zcut=BAD_XENGINE_ZCUT)\n",
" final_class += xengine_diff_class\n",
"\n",
" flag_spectrum=np.sum(rfi_flags, axis=0).astype(bool), \n",
" antenna_class=overall_class)\n",
" overall_class += auto_shape_class\n",
"\n",
" # check for convergence by seeing whether we've previously gotten to this number of flagged antennas and channels\n",
" if stage == 3:\n",
" if (len(overall_class.bad_ants), np.sum(rfi_flags)) in prior_end_states:\n",
" break\n",
" prior_end_states.add((len(overall_class.bad_ants), np.sum(rfi_flags)))\n",
" else:\n",
" stage += 1 \n",
" \n",
" # return all flagged if all antennnas are bad, otherwise return overall class and rfi_flags\n",
" if len(overall_class.good_ants) + len(overall_class.suspect_ants) > 0:\n",
" return overall_class, np.ones((len(sum_autos.times), len(sum_autos.freqs)), dtype=bool)\n",
" else:\n",
" rfi_flags = np.ones((len(sum_autos.times), len(sum_autos.freqs)), dtype=bool)\n",
"\n",
" return final_class, rfi_flags"
" overall_class, rfi_flags\n",
" "
]
},
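The cell above builds the RFI mask and antenna classifications iteratively: stage 1 uses the best half of unflagged antennas (by z-score RMS), stage 2 uses the RFI-good antennas or the best half (whichever is larger), stage 3 uses all unflagged antennas, and stage 3 then repeats until its end state, summarized as (number of bad antennas, number of flagged pixels), recurs. A stripped-down sketch of just that convergence pattern, with the flagging and classification steps replaced by a hypothetical update(stage) callback:

    def iterate_until_converged(update):
        '''Run update(stage) through stages 1-3, then repeat stage 3 until its end state recurs.'''
        stage, prior_end_states = 1, set()
        while True:
            end_state = update(stage)              # e.g. (number of bad antennas, number of flagged pixels)
            if stage == 3:
                if end_state in prior_end_states:  # a repeated stage-3 end state means convergence
                    return end_state
                prior_end_states.add(end_state)
            else:
                stage += 1

    # trivial usage: an update that always returns the same state converges after two stage-3 passes
    print(iterate_until_converged(lambda stage: (10, 1200)))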
{
