Skip to content

Commit addf0c9

Browse files
authored
Allow strings in propertywise tests (#911)
* Allow strings in Propertywise tests * spots * stringAt * After macchiati’s review * spots
1 parent 6a34bda commit addf0c9

File tree

2 files changed

+67
-27
lines changed

2 files changed

+67
-27
lines changed

Diff for: unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java

+48-18
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,15 @@ private static void propertywiseAlikeLine(
525525
}
526526
}
527527

528+
/**
 * Returns the element at position {@code i} of {@code set} as a {@code String}.
 *
 * <p>The number of single code points is taken to be {@code set.size() - set.strings().size()}.
 * Indices below that count are resolved with {@code set.charAt(i)} and converted to a string;
 * the remaining indices select from {@code set.strings()}, in its iteration order, by skipping
 * {@code i - codePointsSize} entries.
 *
 * @param set the set to index into
 * @param i zero-based index; the string branch calls {@code Optional.get()} on the result of
 *     {@code findFirst()}, so an index past the last string throws
 *     {@link java.util.NoSuchElementException}
 * @return the code point (as a string) or the multi-character string at index {@code i}
 */
private static String stringAt(UnicodeSet set, int i) {
529+
final int codePointsSize = set.size() - set.strings().size();
530+
if (i < codePointsSize) {
531+
return Character.toString(set.charAt(i));
532+
} else {
533+
return set.strings().stream().skip(i - codePointsSize).findFirst().get();
534+
}
535+
}
536+
528537
private static void propertywiseCorrespondenceLine(
529538
Set<String> ignoredProperties,
530539
UnicodeSet firstSet,
@@ -538,13 +547,13 @@ private static void propertywiseCorrespondenceLine(
538547
final List<UnicodeSet> sets = new ArrayList<>();
539548
sets.add(firstSet);
540549
expectToken(":", pp, source);
550+
551+
// Index of the first set of multi-character strings (and of the first multi-character
552+
// reference string).
553+
// This is `m` in the documentation in UnicodeInvariantTest.txt.
554+
int firstMultiCharacterIndex = -1;
541555
do {
542556
final var set = parseUnicodeSet(source, pp);
543-
if (set.hasStrings()) {
544-
throw new BackwardParseException(
545-
"Set should contain only single code points for property comparison",
546-
pp.getIndex());
547-
}
548557
if (set.size() != firstSet.size()) {
549558
throw new BackwardParseException(
550559
"Sets should have the same size for property correspondence (got "
@@ -554,18 +563,41 @@ private static void propertywiseCorrespondenceLine(
554563
+ ")",
555564
pp.getIndex());
556565
}
566+
if (set.hasStrings() && set.strings().size() != set.size()) {
567+
throw new BackwardParseException(
568+
"Sets should be all strings or all code points for property correspondence",
569+
pp.getIndex());
570+
}
571+
if (firstMultiCharacterIndex == -1) {
572+
if (set.hasStrings()) {
573+
firstMultiCharacterIndex = sets.size();
574+
}
575+
} else if (!set.hasStrings()) {
576+
throw new BackwardParseException(
577+
"Code points should come before strings in property correspondence",
578+
pp.getIndex());
579+
}
557580
sets.add(set);
558581
} while (Lookahead.oneToken(pp, source).accept(":"));
559-
final List<Integer> referenceCodePoints = new ArrayList<>();
582+
if (firstMultiCharacterIndex == -1) {
583+
firstMultiCharacterIndex = sets.size();
584+
}
585+
final List<String> referenceCodePoints = new ArrayList<>();
560586
expectToken("CorrespondTo", pp, source);
561587
do {
562588
final var referenceSet = parseUnicodeSet(source, pp);
563-
if (referenceSet.hasStrings() || referenceSet.size() != 1) {
589+
if (referenceSet.size() != 1) {
590+
throw new BackwardParseException(
591+
"reference should be a single code point or string for property correspondence",
592+
pp.getIndex());
593+
}
594+
if (referenceSet.hasStrings()
595+
!= (referenceCodePoints.size() >= firstMultiCharacterIndex)) {
564596
throw new BackwardParseException(
565-
"reference should be a single code point for property correspondence",
597+
"Strings should correspond to strings for property correspondence",
566598
pp.getIndex());
567599
}
568-
referenceCodePoints.add(referenceSet.charAt(0));
600+
referenceCodePoints.add(referenceSet.iterator().next());
569601
} while (Lookahead.oneToken(pp, source).accept(":"));
570602
if (referenceCodePoints.size() != sets.size()) {
571603
throw new BackwardParseException(
@@ -608,8 +640,8 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue
608640
expectedDifference = expectedPropertyDifferences.get(alias);
609641
}
610642
if (expectedDifference != null) {
611-
for (int k = 0; k < sets.size(); ++k) {
612-
final int rk = referenceCodePoints.get(k);
643+
for (int k = 0; k < firstMultiCharacterIndex; ++k) {
644+
final int rk = referenceCodePoints.get(k).codePointAt(0);
613645
final String pRk = property.getValue(rk);
614646
if (!Objects.equals(pRk, expectedDifference.referenceValueAlias)) {
615647
errorMessageLines.add(
@@ -638,9 +670,9 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue
638670
}
639671
}
640672
} else {
641-
for (int k = 0; k < sets.size(); ++k) {
673+
for (int k = 0; k < firstMultiCharacterIndex; ++k) {
642674
final UnicodeSet set = sets.get(k);
643-
final int rk = referenceCodePoints.get(k);
675+
final int rk = referenceCodePoints.get(k).codePointAt(0);
644676
final String pRk = property.getValue(rk);
645677
loop_over_set:
646678
for (int i = 0; i < set.size(); ++i) {
@@ -652,10 +684,9 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue
652684
Integer lMatchingForReference = null;
653685
for (int l = 0; l < sets.size(); ++l) {
654686
final boolean pCkEqualsCl =
655-
Objects.equals(pCk, Character.toString(sets.get(l).charAt(i)));
687+
Objects.equals(pCk, stringAt(sets.get(l), i));
656688
final boolean pRkEqualsRl =
657-
Objects.equals(
658-
pRk, Character.toString(referenceCodePoints.get(l)));
689+
Objects.equals(pRk, referenceCodePoints.get(l));
659690
if (pRkEqualsRl) {
660691
lMatchingForReference = l;
661692
if (pCkEqualsCl) {
@@ -685,8 +716,7 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue
685716
+ ")\t=\t"
686717
+ pCk
687718
+ "\t\t"
688-
+ Character.toString(
689-
sets.get(lMatchingForReference).charAt(i))
719+
+ stringAt(sets.get(lMatchingForReference), i)
690720
+ "\twhereas\t"
691721
+ property.getName()
692722
+ "("

Diff for: unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt

+19-9
Original file line numberDiff line numberDiff line change
@@ -147,16 +147,19 @@
147147
# CorrespondTo <R₁> : ... : <Rₙ>
148148
# [ UpTo: <Property> (<SValue> vs <RValue>) {, <Property> (<SValue> vs <RValue>) }]
149149
#
150-
# The Sₖ must be Unicode sets of equal size with no strings. They are considered in code
151-
# point order for the correspondence check (item 2 below).
152-
# The references Rₖ must be Unicode sets each containing a single code point; by a slight abuse of
153-
# notation we refer to the code point as Rₖ in the explanation below.
150+
# The Sₖ must be Unicode sets of equal size, either with no strings or only strings.
151+
# They are considered in code point order for the correspondence check (item 2 below).
152+
# The references Rₖ must be Unicode sets each containing a single code point or a single string;
153+
# by a slight abuse of notation we refer to the code point or string as Rₖ in the explanation below.
154+
# For some m in 1 .. n, the following must hold:
155+
# a. Rₖ is a code point and Sₖ must contain only code points for k ≤ m, and
156+
# b. Rₖ is a string and Sₖ must contain only strings for m < k ≤ n.
154157
# For every non-ignored property P that does not appear in the optional UpTo clause,
155-
# checks that for each k in 1 .. n, for the ith character C in Sₖ, either:
158+
# checks that for each k in 1 .. m, for the ith character C in Sₖ, either:
156159
# 1. P(C) = P(Rₖ), or
157160
# 2. for some l in 1 .. n, both:
158161
# — P(Rₖ) is equal to Rₗ, and
159-
# — P(C) is equal to the ith character in Sₗ.
162+
# — P(C) is equal to the ith character (or string, if l > m) in Sₗ.
160163
# For every non-ignored property P that appears in the UpTo clause, checks all characters in the
161164
# sets Sₖ have the SValue and all R characters have the RValue.
162165
#
@@ -174,9 +177,9 @@
174177
Propertywise [[α-ω] - [ς]] : [[Α-Ω] - \p{gc=Cn}]
175178
CorrespondTo [g] : [G]
176179
UpTo: Block (Greek_And_Coptic vs Basic_Latin),
177-
Script (Greek vs Latin),
178-
Script_Extensions (Greek vs Latin),
179-
East_Asian_Width (Ambiguous vs Narrow)
180+
Script (Greek vs Latin),
181+
Script_Extensions (Greek vs Latin),
182+
East_Asian_Width (Ambiguous vs Narrow)
180183
# The modifier letters ʳʷʸ are related to their non-superscripted counterparts in the same way
181184
# that ʰ is related to h. The capitals must be part of the correspondence because they are
182185
# property values of the lowercase letters.
@@ -1369,6 +1372,13 @@ Ignoring Unicode_1_Name Confusable_MA:
13691372
CorrespondTo [ⁱ] : [i] : [I]
13701373
end Ignoring;
13711374

1375+
Propertywise [ゟ] : [{より}]
1376+
CorrespondTo [ヿ] : [{コト}]
1377+
UpTo: Block (Hiragana vs Katakana),
1378+
Script (Hiragana vs Katakana),
1379+
Script_Extensions (Hiragana vs Katakana),
1380+
Word_Break (Other vs Katakana)
1381+
13721382
end Ignoring;
13731383

13741384
end Ignoring;

0 commit comments

Comments
 (0)