From e26d8f20a83f72412f7df194f37b8b5ed3b71842 Mon Sep 17 00:00:00 2001
From: Jai Ram Rideout <jai.rideout@gmail.com>
Date: Fri, 30 Jun 2017 16:49:18 -0700
Subject: [PATCH] ENH: experimental QIIME 2 mapping file support (#80)

Fixes #79.
---
 CHANGELOG.md                            |  7 +++
 src/App.gs                              | 13 +++--
 src/Benchmark.gs                        |  2 +-
 src/{QiimeFormat.gs => Qiime1Format.gs} | 16 +++---
 src/Qiime2Format.gs                     | 75 +++++++++++++++++++++++++
 src/SimulatedData.gs                    |  2 +-
 src/Validate.gs                         |  4 ++
 7 files changed, 105 insertions(+), 14 deletions(-)
 rename src/{QiimeFormat.gs => Qiime1Format.gs} (91%)
 create mode 100644 src/Qiime2Format.gs

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b48b46f..eff6669 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,13 @@
 
 **Note on versioning:** the version numbers used here match the version numbers displayed to users in the Chrome Web Store. Sometimes there are gaps between release versions (e.g., version 2 jumps to version 5). This happens because each separate upload of Keemei to the web store increments the version number, and sometimes multiple uploads are necessary before a release is finalized (e.g., if the release is reviewed by an add-ons advisor and updates are required before it can go public). Therefore, the version numbering used here in the changelog and tagged GitHub releases will match the public release version displayed in the web store.
 
+## Version 13 (2017-06-28)
+
+This release adds **experimental** support for validating [QIIME 2](https://qiime2.org) mapping files.
+
+### Features
+* Added **experimental** support for validating [QIIME 2 mapping files](https://docs.qiime2.org/2017.6/tutorials/metadata/#metadata-from-a-text-file) ([#79](https://github.com/biocore/Keemei/issues/79))
+
 ## Version 12 (2016-06-17)
 
 Minor beta release with [Keemei paper](http://dx.doi.org/10.1186/s13742-016-0133-6) citation information.
diff --git a/src/App.gs b/src/App.gs
index 7415462..93b0535 100644
--- a/src/App.gs
+++ b/src/App.gs
@@ -4,7 +4,8 @@ function onInstall(e) {
 
 function onOpen(e) {
   SpreadsheetApp.getUi().createAddonMenu()
-      .addItem("Validate QIIME mapping file", "validateQiime")
+      .addItem("Validate QIIME 1 mapping file", "validateQiime1")
+      .addItem("Validate QIIME 2 mapping file", "validateQiime2")
       .addItem("Validate SRGD file", "validateSrgd")
       .addItem("Validate Qiita sample template (experimental)", "validateQiitaSampleTemplate")
       .addSeparator()
@@ -12,14 +13,18 @@ function onOpen(e) {
       .addItem("About", "about")
       .addSeparator()
       .addSubMenu(SpreadsheetApp.getUi().createMenu("Developer tools")
-          .addItem("Create simulated QIIME mapping file dataset", "createSimulatedData")
+          .addItem("Create simulated QIIME 1 mapping file dataset", "createSimulatedData")
           .addItem("Run benchmarks: dataset size and error rate", "runDatasetSizeBenchmarks")
           .addItem("Run benchmarks: rule size", "runRuleSizeBenchmarks"))
       .addToUi();
 };
 
-function validateQiime() {
-  validate_(getQiimeFormatSpec_);
+function validateQiime1() {
+  validate_(getQiime1FormatSpec_);
+};
+
+function validateQiime2() {
+  validate_(getQiime2FormatSpec_);
 };
 
 function validateSrgd() {
diff --git a/src/Benchmark.gs b/src/Benchmark.gs
index bbd8567..f8ac6d9 100644
--- a/src/Benchmark.gs
+++ b/src/Benchmark.gs
@@ -83,7 +83,7 @@ function runDatasetSizeBenchmarks() {
       var runtime = row[j + 1];
       if (runtime === "") {
         // Depends on core Keemei API.
-        var report = validate_(getQiimeFormatSpec_, sheet);
+        var report = validate_(getQiime1FormatSpec_, sheet);
 
         var numInvalidCells = Object.keys(report.validationResults).length;
         var expectedNumInvalidCells = (parseInt(rowCount, 10) * numColumns) * errorProportion;
diff --git a/src/QiimeFormat.gs b/src/Qiime1Format.gs
similarity index 91%
rename from src/QiimeFormat.gs
rename to src/Qiime1Format.gs
index 686135d..34b10dc 100644
--- a/src/QiimeFormat.gs
+++ b/src/Qiime1Format.gs
@@ -1,4 +1,4 @@
-function getQiimeFormatSpec_(sheetData) {
+function getQiime1FormatSpec_(sheetData) {
   var requiredHeaders = {
     "#SampleID": [0, "first"],
     "BarcodeSequence": [1, "second"],
@@ -7,9 +7,9 @@ function getQiimeFormatSpec_(sheetData) {
   };
 
   return {
-    format: "QIIME mapping file",
+    format: "QIIME 1 mapping file",
     headerRowIdx: 0,
-    dataStartRowIdx: getQiimeDataStartRowIdx_(sheetData),
+    dataStartRowIdx: getQiime1DataStartRowIdx_(sheetData),
     headerValidation: [
       {
         validator: findMissingValues_,
@@ -22,11 +22,11 @@ function getQiimeFormatSpec_(sheetData) {
       {
         // #SampleID is an invalid column header name, so we'll only check header names
         // if they aren't required headers. Assume the required header names are valid.
-        validator: findInvalidQiimeColumns_,
+        validator: findInvalidQiime1Columns_,
         args: [requiredHeaders]
       },
       {
-        validator: findMisplacedQiimeColumns_,
+        validator: findMisplacedQiime1Columns_,
         args: [requiredHeaders]
       },
       {
@@ -121,7 +121,7 @@ function getPrimerValidators_() {
   ];
 };
 
-function getQiimeDataStartRowIdx_(sheetData) {
+function getQiime1DataStartRowIdx_(sheetData) {
   for (var i = 1; i < sheetData.length; i++) {
     if (!startsWith_(sheetData[i][0], "#")) {
       break;
@@ -130,7 +130,7 @@ function getQiimeDataStartRowIdx_(sheetData) {
   return i;
 };
 
-function findInvalidQiimeColumns_(valueToPositions, ignoredValues) {
+function findInvalidQiime1Columns_(valueToPositions, ignoredValues) {
   var invalidCells = {};
   var message = [
     Utilities.formatString("Invalid column header name. Only alphanumeric and underscore characters are allowed. The first character must be a letter.")
@@ -153,7 +153,7 @@ function findInvalidQiimeColumns_(valueToPositions, ignoredValues) {
   return invalidCells;
 };
 
-function findMisplacedQiimeColumns_(valueToPositions, requiredHeaders) {
+function findMisplacedQiime1Columns_(valueToPositions, requiredHeaders) {
   var invalidCells = {};
   for (var value in valueToPositions) {
     if (valueToPositions.hasOwnProperty(value)) {
diff --git a/src/Qiime2Format.gs b/src/Qiime2Format.gs
new file mode 100644
index 0000000..1c0857d
--- /dev/null
+++ b/src/Qiime2Format.gs
@@ -0,0 +1,75 @@
+function getQiime2FormatSpec_(sheetData) {
+  // Builds the "QIIME 2 mapping file" format spec from sheetData (rows of cells, header first).
+  // Alerts the user and returns null when there are fewer than two rows. TODO: this is an ad-hoc
+  // check; move it once there is a hook for erroring on missing data rows for a file format.
+  if (sheetData.length < 2) {
+    var ui = SpreadsheetApp.getUi();
+    ui.alert("Missing data",
+             "This sheet must have at least two rows in order to be validated. " +
+             "The first row contains the header and subsequent rows contain data.",
+             ui.ButtonSet.OK);
+    return null;  // validate_ returns an empty report when the spec is falsy
+  }
+
+  var axisLabelRegex = /[^\/\\*<>?|$]/ig;  // negated class: any char other than slash, backslash, *, <, >, ?, | or $
+
+  var formatSpec = {
+    format: "QIIME 2 mapping file",
+
+    // Header is fixed at row 0 and data at row 1. TODO: update when blank lines and comments are supported
+    headerRowIdx: 0,
+    dataStartRowIdx: 1,
+
+    headerValidation: [  // validator/args pairs for the header row; shared by labels and IDs via axisLabelRegex
+      {
+        validator: findDuplicates_,
+        args: ["Duplicate column label"]
+      },
+      {
+        validator: findInvalidCharacters_,
+        args: [axisLabelRegex, "errors", "column label"]  // NOTE(review): presumably the regex marks allowed characters -- confirm in findInvalidCharacters_
+      },
+      {
+        validator: findEmpty_,
+        args: ["errors"]
+      },
+      {
+        validator: findLeadingTrailingWhitespace_,
+        args: []
+      }
+    ],
+    columnValidation: {
+      "default": [  // NOTE(review): presumably used for columns without an entry in `columns` -- confirm in validate_
+        {
+          validator: findLeadingTrailingWhitespace_,
+          args: []
+        }
+      ],
+      columns: {}  // filled in below with the identifier column's validators
+    }
+  };
+
+  // The first header cell is taken as the ID column label. TODO: update when blank lines and comments are supported
+  var idColumnLabel = sheetData[0][0];
+
+  formatSpec.columnValidation.columns[idColumnLabel] = [  // keyed by the label text found in the sheet
+    {
+      validator: findDuplicates_,
+      args: ["Duplicate identifier"]
+    },
+    {
+      validator: findInvalidCharacters_,
+      args: [axisLabelRegex, "errors", "identifier"]  // same regex as the header check, different label
+    },
+    {
+      validator: findEmpty_,
+      args: ["errors"]
+    },
+    {
+      validator: findLeadingTrailingWhitespace_,
+      args: []
+    }
+  ];
+
+  return formatSpec;
+};
diff --git a/src/SimulatedData.gs b/src/SimulatedData.gs
index cb5e179..2565310 100644
--- a/src/SimulatedData.gs
+++ b/src/SimulatedData.gs
@@ -181,7 +181,7 @@ var ERROR_TRANSFORMS = [
 
   /*
    * Substitute first character with invalid character.
-   * $ is invalid across all QIIME mapping file columns.
+   * $ is invalid across all QIIME 1 mapping file columns.
    * We replace a character instead of appending to keep
    * barcodes the same length, otherwise additional cells
    * could be marked invalid in rare cases. Replacing a
diff --git a/src/Validate.gs b/src/Validate.gs
index 8ba5ebd..ceb2e35 100644
--- a/src/Validate.gs
+++ b/src/Validate.gs
@@ -7,6 +7,10 @@ function validate_(formatSpecFunction, sheet) {
 
   var formatSpec = formatSpecFunction(sheetData);
 
+  if (!formatSpec) {
+    return {};
+  }
+
   var report = {
     format: formatSpec.format,
     validationResults: mergeValidationResults_([