implement new RFI finding and classification algorithm

HERA-Team · Apr 18, 2024 · db4ab86 · db4ab86
1 parent 80f2c3d
commit db4ab86
Showing 1 changed file with 146 additions and 72 deletions.
diff --git a/notebooks/file_calibration.ipynb b/notebooks/file_calibration.ipynb
@@ -57,6 +57,7 @@
     "from uvtools.plot import plot_antpos, plot_antclass\n",
     "from hera_qm import ant_metrics, ant_class, xrfi\n",
     "from hera_cal import io, utils, redcal, apply_cal, datacontainer, abscal\n",
+    "from hera_filters import dspec\n",
     "from hera_notebook_templates.data import DATA_PATH as HNBT_DATA\n",
     "from IPython.display import display, HTML\n",
     "import linsolve\n",
@@ -155,7 +156,7 @@
     "\n",
     "# parse RFI settings\n",
     "RFI_DPSS_HALFWIDTH = float(os.environ.get(\"RFI_DPSS_HALFWIDTH\", 300e-9))\n",
-    "RFI_NSIG = float(os.environ.get(\"RFI_NSIG\", 6))\n",
+    "RFI_NSIG = float(os.environ.get(\"RFI_NSIG\", 4))\n",
     "\n",
     "# parse abscal settings\n",
     "ABSCAL_MODEL_FILES_GLOB = os.environ.get(\"ABSCAL_MODEL_FILES_GLOB\", None)\n",
@@ -211,8 +212,8 @@
     "auto_slope_suspect = (float(os.environ.get(\"AUTO_SLOPE_SUSPECT_LOW\", -0.6)), float(os.environ.get(\"AUTO_SLOPE_SUSPECT_HIGH\", 0.6)))\n",
     "\n",
     "# bounds on autocorrelation RFI\n",
-    "auto_rfi_good = (0, float(os.environ.get(\"AUTO_RFI_GOOD\", 0.015)))\n",
-    "auto_rfi_suspect = (0, float(os.environ.get(\"AUTO_RFI_SUSPECT\", 0.03)))\n",
+    "auto_rfi_good = (0, float(os.environ.get(\"AUTO_RFI_GOOD\", 1.5)))\n",
+    "auto_rfi_suspect = (0, float(os.environ.get(\"AUTO_RFI_SUSPECT\", 2)))\n",
     "\n",
     "# bounds on autocorrelation shape\n",
     "auto_shape_good = (0, float(os.environ.get(\"AUTO_SHAPE_GOOD\", 0.1)))\n",
@@ -380,81 +381,186 @@
    "id": "102587ce",
    "metadata": {},
    "source": [
-    "### Examine and classify autocorrelation power, slope, and RFI occpancy\n",
+    "### Examine and classify autocorrelation power and slope\n",
     "\n",
-    "These classifiers look for antennas with too high or low power, to steep a slope, or too much excess RFI."
+    "These classifiers look for antennas with too high or low power or to steep a slope."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "bca07198",
+   "id": "ba1b4d4f-633e-46a8-a6f9-b7d600845ae0",
    "metadata": {},
    "outputs": [],
    "source": [
     "auto_power_class = ant_class.auto_power_checker(data, good=auto_power_good, suspect=auto_power_suspect)\n",
     "auto_slope_class = ant_class.auto_slope_checker(data, good=auto_slope_good, suspect=auto_slope_suspect, edge_cut=100, filt_size=17)\n",
-    "cache = {}\n",
-    "auto_rfi_class = ant_class.auto_rfi_checker(data, good=auto_rfi_good, suspect=auto_rfi_suspect, \n",
-    "                                            filter_half_widths=[RFI_DPSS_HALFWIDTH], nsig=RFI_NSIG, cache=cache)\n",
-    "auto_class = auto_power_class + auto_slope_class + auto_rfi_class\n",
-    "if np.all([auto_class[utils.split_bl(bl)[0]] == 'bad' for bl in auto_bls]):\n",
-    "    raise ValueError('All antennas are flagged for bad autocorrelation power/slope/RFI.')"
+    "if np.all([(auto_power_class + auto_slope_class)[utils.split_bl(bl)[0]] == 'bad' for bl in auto_bls]):\n",
+    "    raise ValueError('All antennas are flagged for bad autocorrelation power/slope.')\n",
+    "overall_class = auto_power_class + auto_slope_class + zeros_class + ant_metrics_class + solar_class"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fd79bfe5-8ca6-4d6d-87ff-740e4c3fa517",
+   "metadata": {},
+   "source": [
+    "### Examine and classify autocorrelation excess RFI and shape, finding consensus RFI mask along the way\n",
+    "\n",
+    "This classifier iteratively identifies antennas for excess RFI (characterized by RMS of DPSS-filtered autocorrelations after RFI flagging) and bad shape, as determined by a discrepancy with the mean good normalized autocorrelation's shape. Along the way, it iteratively discovers a conensus array-wide RFI mask."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b3e755bf",
+   "id": "6fdc4209-6fe0-4cb7-8564-dda570adecdf",
    "metadata": {},
    "outputs": [],
    "source": [
-    "del cache\n",
-    "malloc_trim()"
+    "def auto_bl_zscores(data, flag_array, cache={}):\n",
+    "    '''This function computes z-score arrays for each delay-filtered autocorrelation, normalized by the expected noise. \n",
+    "    Flagged times/channels for the whole array are given 0 weight in filtering and are np.nan in the z-score.'''\n",
+    "    zscores = {}\n",
+    "    for bl in auto_bls:\n",
+    "        wgts = np.array(np.logical_not(flag_array), dtype=np.float64)\n",
+    "        model, _, _ = dspec.fourier_filter(hd.freqs, data[bl], wgts, filter_centers=[0], filter_half_widths=[RFI_DPSS_HALFWIDTH], mode='dpss_solve',\n",
+    "                                            suppression_factors=[1e-9], eigenval_cutoff=[1e-9], cache=cache)\n",
+    "        res = data[bl] - model\n",
+    "        int_time = 24 * 3600 * np.median(np.diff(data.times))\n",
+    "        chan_res = np.median(np.diff(data.freqs))\n",
+    "        int_count = int(int_time * chan_res)\n",
+    "        sigma = np.abs(model) / np.sqrt(int_count / 2)\n",
+    "        zscores[bl] = res / sigma    \n",
+    "        zscores[bl][flag_array] = np.nan\n",
+    "\n",
+    "    return zscores"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "9464e41e",
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d7f374a-8936-43b2-b521-cc68a256a932",
    "metadata": {},
+   "outputs": [],
    "source": [
-    "### Find and flag RFI"
+    "def rfi_from_avg_autos(data, auto_bls_to_use, prior_flags=None, nsig=RFI_NSIG):\n",
+    "    '''Average together all baselines in auto_bls_to_use, then find an RFI mask by looking for outliers after DPSS filtering.'''\n",
+    "    \n",
+    "    # Compute int_count for all unflagged autocorrelations averaged together\n",
+    "    int_time = 24 * 3600 * np.median(np.diff(data.times_by_bl[auto_bls[0][0:2]]))\n",
+    "    chan_res = np.median(np.diff(data.freqs))\n",
+    "    int_count = int(int_time * chan_res) * len(auto_bls_to_use)\n",
+    "    avg_auto = {(-1, -1, 'ee'): np.mean([data[bl] for bl in auto_bls_to_use], axis=0)}\n",
+    "    \n",
+    "    # Flag RFI first with channel differences and then with DPSS\n",
+    "    antenna_flags, _ = xrfi.flag_autos(avg_auto, int_count=int_count, nsig=(RFI_NSIG * 5))\n",
+    "    if prior_flags is not None:\n",
+    "        antenna_flags[(-1, -1, 'ee')] = prior_flags\n",
+    "    _, rfi_flags = xrfi.flag_autos(avg_auto, int_count=int_count, flag_method='dpss_flagger',\n",
+    "                                   flags=antenna_flags, freqs=data.freqs, filter_centers=[0],\n",
+    "                                   filter_half_widths=[RFI_DPSS_HALFWIDTH], eigenval_cutoff=[1e-9], nsig=nsig)\n",
+    "\n",
+    "    return rfi_flags"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "c321e1d7",
+   "id": "3252ed36-41e8-4309-be23-80aeb8b0c234",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Compute int_count for all unflagged autocorrelations averaged together\n",
-    "int_time = 24 * 3600 * np.median(np.diff(data.times_by_bl[auto_bls[0][0:2]]))\n",
-    "chan_res = np.median(np.diff(data.freqs))\n",
-    "final_class = ant_metrics_class + zeros_class + auto_class\n",
-    "if np.all([final_class[utils.split_bl(bl)[0]] == 'bad' for bl in auto_bls]):\n",
-    "    raise ValueError('All antennas are flagged for ant_metrics, zeros, or autos.') \n",
-    "int_count = int(int_time * chan_res) * (len(final_class.good_ants) + len(final_class.suspect_ants))\n",
-    "avg_auto = {(-1, -1, 'ee'): np.mean([data[bl] for bl in auto_bls if final_class[utils.split_bl(bl)[0]] != 'bad'], axis=0)}\n",
-    "# Flag RFI first with channel differences and then with DPSS\n",
-    "antenna_flags, _ = xrfi.flag_autos(avg_auto, int_count=int_count, nsig=(RFI_NSIG * 5))\n",
-    "_, rfi_flags = xrfi.flag_autos(avg_auto, int_count=int_count, flag_method='dpss_flagger',\n",
-    "                               flags=antenna_flags, freqs=data.freqs, filter_centers=[0],\n",
-    "                               filter_half_widths=[RFI_DPSS_HALFWIDTH], eigenval_cutoff=[1e-9], nsig=RFI_NSIG)\n",
-    "malloc_trim()"
+    "# Find starting set of array flags\n",
+    "antenna_flags, array_flags = xrfi.flag_autos(data, flag_method=\"channel_diff_flagger\", nsig=RFI_NSIG * 5, \n",
+    "                                             antenna_class=overall_class, flag_broadcast_thresh=.5)\n",
+    "for key in antenna_flags:\n",
+    "    antenna_flags[key] = array_flags\n",
+    "cache = {}\n",
+    "_, array_flags = xrfi.flag_autos(data, freqs=data.freqs, flag_method=\"dpss_flagger\",\n",
+    "                                 nsig=RFI_NSIG, antenna_class=overall_class,\n",
+    "                                 filter_centers=[0], filter_half_widths=[RFI_DPSS_HALFWIDTH],\n",
+    "                                 eigenval_cutoff=[1e-9], flags=antenna_flags, mode='dpss_matrix', \n",
+    "                                 cache=cache, flag_broadcast_thresh=.5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "75531f17-befd-41f1-aaf5-20ee2a18cacb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Iteratively develop RFI mask, excess RFI classification, and autocorrelation shape classification\n",
+    "stage = 1\n",
+    "rfi_flags = np.array(array_flags)\n",
+    "while True:\n",
+    "    # compute DPSS-filtered z-scores with current array-wide RFI mask\n",
+    "    zscores = auto_bl_zscores(data, rfi_flags)\n",
+    "    rms = {bl: np.nanmean(zscores[bl]**2)**.5 if np.any(np.isfinite(zscores[bl])) else np.inf for bl in zscores}\n",
+    "    \n",
+    "    # figure out which autos to use for finding new set of flags\n",
+    "    candidate_autos = [bl for bl in auto_bls if overall_class[utils.split_bl(bl)[0]] != 'bad']\n",
+    "    if stage == 1:\n",
+    "        # use best half of the unflagged antennas\n",
+    "        med_rms = np.nanmedian([rms[bl] for bl in candidate_autos])\n",
+    "        autos_to_use = [bl for bl in candidate_autos if rms[bl] <= med_rms]\n",
+    "        stage += 1  # advance to stage 2\n",
+    "    elif stage == 2:\n",
+    "        # use all unflagged antennas which are auto RFI good, or the best half, whichever is larger\n",
+    "        med_rms = np.nanmedian([rms[bl] for bl in candidate_autos])\n",
+    "        best_half_autos = [bl for bl in candidate_autos if rms[bl] <= med_rms]\n",
+    "        good_autos = [bl for bl in candidate_autos if (overall_class[utils.split_bl(bl)[0]] != 'bad')\n",
+    "                      and (auto_rfi_class[utils.split_bl(bl)[0]] == 'good')]\n",
+    "        autos_to_use = (best_half_autos if len(best_half_autos) > len(good_autos) else good_autos)\n",
+    "        stage += 1  # advance to stage 3\n",
+    "    elif stage == 3:\n",
+    "        # use all unflagged antennas which are auto RFI good or suspect\n",
+    "        autos_to_use = [bl for bl in candidate_autos if (overall_class[utils.split_bl(bl)[0]] != 'bad')]\n",
+    "\n",
+    "    # compute new RFI flags\n",
+    "    rfi_flags = rfi_from_avg_autos(data, autos_to_use)\n",
+    "\n",
+    "    # perform auto shape and RFI classification\n",
+    "    overall_class = auto_power_class + auto_slope_class + zeros_class + ant_metrics_class + solar_class\n",
+    "    auto_rfi_class = ant_class.antenna_bounds_checker(rms, good=auto_rfi_good, suspect=auto_rfi_suspect, bad=(0, np.inf))\n",
+    "    overall_class += auto_rfi_class\n",
+    "    auto_shape_class = ant_class.auto_shape_checker(data, good=auto_shape_good, suspect=auto_shape_suspect,\n",
+    "                                                    flag_spectrum=np.sum(rfi_flags, axis=0).astype(bool), \n",
+    "                                                    antenna_class=overall_class)\n",
+    "    overall_class += auto_shape_class\n",
+    "    \n",
+    "    # check for convergence on flagged antennas and channels \n",
+    "    if stage == 3:\n",
+    "        if (len(overall_class.bad_ants) == n_current_bad_ants) and (np.sum(rfi_flags) == current_flagged_chan_times):\n",
+    "            break\n",
+    "    current_flagged_chan_times = np.sum(rfi_flags)\n",
+    "    n_current_bad_ants = len(overall_class.bad_ants)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "d8eae3de",
+   "id": "1b3f992f-402b-4fb8-aac9-4e82149cb4ad",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def rfi_plot():\n",
+    "auto_class = auto_power_class + auto_slope_class + auto_rfi_class + auto_shape_class\n",
+    "if np.all([auto_class[utils.split_bl(bl)[0]] == 'bad' for bl in auto_bls]):\n",
+    "    raise ValueError('All antennas are flagged for bad autos power/slope/rfi/shape.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b2e3af97-94bc-4ed1-abb4-1fbf03ab1b0d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def rfi_plot(cls, flags=rfi_flags):\n",
+    "    avg_auto = {(-1, -1, 'ee'): np.mean([data[bl] for bl in auto_bls if not cls[utils.split_bl(bl)[0]] == 'bad'], axis=0)}\n",
     "    plt.figure(figsize=(12, 5), dpi=100)\n",
-    "    plt.semilogy(hd.freqs / 1e6, np.where(rfi_flags, np.nan, avg_auto[(-1, -1, 'ee')])[1], label = 'Average Good or Suspect Autocorrelation', zorder=100)\n",
-    "    plt.semilogy(hd.freqs / 1e6, np.where(False, np.nan, avg_auto[(-1, -1, 'ee')])[1], 'r', lw=.5, label=f'{np.sum(rfi_flags[0])} Channels Flagged for RFI')\n",
+    "    plt.semilogy(hd.freqs / 1e6, np.where(flags, np.nan, avg_auto[(-1, -1, 'ee')])[0], label = 'Average Good or Suspect Autocorrelation', zorder=100)\n",
+    "    plt.semilogy(hd.freqs / 1e6, np.where(False, np.nan, avg_auto[(-1, -1, 'ee')])[0], 'r', lw=.5, label=f'{np.sum(flags[0])} Channels Flagged for RFI')\n",
     "    plt.legend()\n",
     "    plt.xlabel('Frequency (MHz)')\n",
     "    plt.ylabel('Uncalibrated Autocorrelation')\n",
@@ -478,7 +584,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "rfi_plot()"
+    "rfi_plot(overall_class)"
    ]
   },
   {
@@ -502,45 +608,13 @@
     "\n",
     "    axes[0].set_ylabel('Raw Autocorrelation', fontsize=12)\n",
     "    axes[1].legend([matplotlib.lines.Line2D([0], [0], color=color) for color in colors], \n",
-    "                   [cls.capitalize() for cls in auto_class.quality_classes], ncol=1, fontsize=12, loc='upper right', framealpha=1)\n",
+    "                   [cl.capitalize() for cl in cls.quality_classes], ncol=1, fontsize=12, loc='upper right', framealpha=1)\n",
     "    plt.tight_layout()"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "32f9153c",
-   "metadata": {},
-   "source": [
-    "### Classify antennas based on shapes, excluding RFI-contamined channels"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e005cb17",
-   "metadata": {
-    "code_folding": []
-   },
-   "outputs": [],
-   "source": [
-    "auto_shape_class = ant_class.auto_shape_checker(data, good=auto_shape_good, suspect=auto_shape_suspect,\n",
-    "                                                flag_spectrum=np.sum(rfi_flags, axis=0).astype(bool), antenna_class=final_class)\n",
-    "auto_class += auto_shape_class"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7a13a063",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "final_class = ant_metrics_class + solar_class + zeros_class + auto_class"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d99ad916",
+   "id": "5066f32a-5831-4dce-be8a-b28fad7584dc",
    "metadata": {},
    "source": [
     "# *Figure 2: Plot of autocorrelations with classifications*\n",