Allow strings in propertywise tests (#911)

eggrobin · web-flow · commit addf0c992050 · 2024-09-06T18:25:21.000+02:00
* Allow strings in Propertywise tests

* spots

* stringAt

* After macchiati’s review

* spots
diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java
@@ -525,6 +525,15 @@ private static void propertywiseAlikeLine(
         }
     }
 
+    private static String stringAt(UnicodeSet set, int i) {
+        // Indices cover the code points first (charAt), then strings() in iteration order.
+        final int codePointCount = set.size() - set.strings().size();
+        if (i >= codePointCount) {
+            return set.strings().stream().skip(i - codePointCount).findFirst().orElseThrow();
+        }
+        return Character.toString(set.charAt(i));
+    }
+
     private static void propertywiseCorrespondenceLine(
             Set<String> ignoredProperties,
             UnicodeSet firstSet,
@@ -538,13 +547,13 @@ private static void propertywiseCorrespondenceLine(
         final List<UnicodeSet> sets = new ArrayList<>();
         sets.add(firstSet);
         expectToken(":", pp, source);
+
+        // Index of the first set of multi-character strings (and of the first multi-character
+        // reference string).
+        // This is `m` in the documentation in UnicodeInvariantTest.txt.
+        int firstMultiCharacterIndex = -1;
         do {
             final var set = parseUnicodeSet(source, pp);
-            if (set.hasStrings()) {
-                throw new BackwardParseException(
-                        "Set should contain only single code points for property comparison",
-                        pp.getIndex());
-            }
             if (set.size() != firstSet.size()) {
                 throw new BackwardParseException(
                         "Sets should have the same size for property correspondence (got "
@@ -554,18 +563,41 @@ private static void propertywiseCorrespondenceLine(
                                 + ")",
                         pp.getIndex());
             }
+            if (set.hasStrings() && set.strings().size() != set.size()) {
+                throw new BackwardParseException(
+                        "Sets should be all strings or all code points for property correspondence",
+                        pp.getIndex());
+            }
+            if (firstMultiCharacterIndex == -1) {
+                if (set.hasStrings()) {
+                    firstMultiCharacterIndex = sets.size();
+                }
+            } else if (!set.hasStrings()) {
+                throw new BackwardParseException(
+                        "Code points should come before strings in property correspondence",
+                        pp.getIndex());
+            }
             sets.add(set);
         } while (Lookahead.oneToken(pp, source).accept(":"));
-        final List<Integer> referenceCodePoints = new ArrayList<>();
+        if (firstMultiCharacterIndex == -1) {
+            firstMultiCharacterIndex = sets.size();
+        }
+        final List<String> referenceCodePoints = new ArrayList<>();
         expectToken("CorrespondTo", pp, source);
         do {
             final var referenceSet = parseUnicodeSet(source, pp);
-            if (referenceSet.hasStrings() || referenceSet.size() != 1) {
+            if (referenceSet.size() != 1) {
+                throw new BackwardParseException(
+                        "reference should be a single code point or string for property correspondence",
+                        pp.getIndex());
+            }
+            if (referenceSet.hasStrings()
+                    != (referenceCodePoints.size() >= firstMultiCharacterIndex)) {
                 throw new BackwardParseException(
-                        "reference should be a single code point for property correspondence",
+                        "Strings should correspond to strings for property correspondence",
                         pp.getIndex());
             }
-            referenceCodePoints.add(referenceSet.charAt(0));
+            referenceCodePoints.add(referenceSet.iterator().next());
         } while (Lookahead.oneToken(pp, source).accept(":"));
         if (referenceCodePoints.size() != sets.size()) {
             throw new BackwardParseException(
@@ -608,8 +640,8 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue
                 expectedDifference = expectedPropertyDifferences.get(alias);
             }
             if (expectedDifference != null) {
-                for (int k = 0; k < sets.size(); ++k) {
-                    final int rk = referenceCodePoints.get(k);
+                for (int k = 0; k < firstMultiCharacterIndex; ++k) {
+                    final int rk = referenceCodePoints.get(k).codePointAt(0);
                     final String pRk = property.getValue(rk);
                     if (!Objects.equals(pRk, expectedDifference.referenceValueAlias)) {
                         errorMessageLines.add(
@@ -638,9 +670,9 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue
                     }
                 }
             } else {
-                for (int k = 0; k < sets.size(); ++k) {
+                for (int k = 0; k < firstMultiCharacterIndex; ++k) {
                     final UnicodeSet set = sets.get(k);
-                    final int rk = referenceCodePoints.get(k);
+                    final int rk = referenceCodePoints.get(k).codePointAt(0);
                     final String pRk = property.getValue(rk);
                     loop_over_set:
                     for (int i = 0; i < set.size(); ++i) {
@@ -652,10 +684,9 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue
                         Integer lMatchingForReference = null;
                         for (int l = 0; l < sets.size(); ++l) {
                             final boolean pCkEqualsCl =
-                                    Objects.equals(pCk, Character.toString(sets.get(l).charAt(i)));
+                                    Objects.equals(pCk, stringAt(sets.get(l), i));
                             final boolean pRkEqualsRl =
-                                    Objects.equals(
-                                            pRk, Character.toString(referenceCodePoints.get(l)));
+                                    Objects.equals(pRk, referenceCodePoints.get(l));
                             if (pRkEqualsRl) {
                                 lMatchingForReference = l;
                                 if (pCkEqualsCl) {
@@ -685,8 +716,7 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue
                                             + ")\t=\t"
                                             + pCk
                                             + "\t≠\t"
-                                            + Character.toString(
-                                                    sets.get(lMatchingForReference).charAt(i))
+                                            + stringAt(sets.get(lMatchingForReference), i)
                                             + "\twhereas\t"
                                             + property.getName()
                                             + "("
diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
@@ -147,16 +147,19 @@
 # CorrespondTo <R₁> : ... : <Rₙ>
 # [   UpTo: <Property> (<SValue> vs <RValue>) {, <Property> (<SValue> vs <RValue>) }]
 #
-# The Sₖ must be Unicode sets of equal size with no strings.  They are considered in code
-# point order for the correspondence check (item 2 below).
-# The references Rₖ must be Unicode sets each containing a single code point; by a slight abuse of
-# notation we refer to the code point as Rₖ in the explanation below.
+# The Sₖ must be Unicode sets of equal size, either with no strings or only strings.
+# They are considered in code point order for the correspondence check (item 2 below).
+# The references Rₖ must be Unicode sets each containing a single code point or a single string;
+# by a slight abuse of notation we refer to the code point or string as Rₖ in the explanation below.
+# For some m in 1 .. n, the following must hold:
+# a. Rₖ is a code point and Sₖ contains only code points for k ≤ m, and
+# b. Rₖ is a string and Sₖ contains only strings for m < k ≤ n.
 # For every non-ignored property P that does not appear in the optional UpTo clause,
-# checks that for each k in 1 .. n, for the ith character C in Sₖ, either:
+# checks that for each k in 1 .. m, for the ith character C in Sₖ, either:
 # 1. P(C) = P(Rₖ), or
 # 2. for some l in 1 .. n, both:
 #    — P(Rₖ) is equal to Rₗ, and
-#    — P(C) is equal to the ith character in Sₗ.
+#    — P(C) is equal to the ith character (or string, if l > m) in Sₗ.
 # For every non-ignored property P that appears in the UpTo clause, checks all characters in the
 # sets Sₖ have the SValue and all R characters have the RValue.
 #
@@ -174,9 +177,9 @@
                 Propertywise [[α-ω] - [ς]] : [[Α-Ω] - \p{gc=Cn}]
                 CorrespondTo [g]           :  [G]
                     UpTo: Block             (Greek_And_Coptic vs Basic_Latin),
-                        Script            (Greek            vs Latin),
-                        Script_Extensions (Greek            vs Latin),
-                        East_Asian_Width  (Ambiguous        vs Narrow)
+                          Script            (Greek            vs Latin),
+                          Script_Extensions (Greek            vs Latin),
+                          East_Asian_Width  (Ambiguous        vs Narrow)
 #       The modifier letters ʳʷʸ are related to their non-superscripted counterparts in the same way
 #       that ʰ is related to h.  The capitals must be part of the correspondence because they are
 #       property values of the lowercase letters.
@@ -1369,6 +1372,13 @@ Ignoring Unicode_1_Name Confusable_MA:
         CorrespondTo [ⁱ] : [i] : [I]
     end Ignoring;
 
+    Propertywise [ゟ] : [{より}]
+    CorrespondTo [ヿ] : [{コト}]
+        UpTo: Block             (Hiragana vs Katakana),
+              Script            (Hiragana vs Katakana),
+              Script_Extensions (Hiragana vs Katakana),
+              Word_Break        (Other    vs Katakana)
+
 end Ignoring;
 
 end Ignoring;