Skip to content

Commit 8ac84e0

Browse files
authored
Fix JSP failures with scx (#615)
* Fix JSP failures with scx
* Prevent deva,beng from working; run spotless
* Add exemplars as second example
* Spotless
* Fixes for Markus's review
* Fix Bangla comment also
1 parent 2c0cd74 commit 8ac84e0

File tree

4 files changed

+207
-29
lines changed

4 files changed

+207
-29
lines changed

UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java

+112-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
package org.unicode.jsp;
22

3+
import com.google.common.base.Joiner;
4+
import com.google.common.collect.Multimap;
5+
import com.google.common.collect.TreeMultimap;
36
import com.ibm.icu.dev.util.UnicodeMap;
47
import com.ibm.icu.lang.CharSequences;
58
import com.ibm.icu.lang.UCharacter;
@@ -12,13 +15,19 @@
1215
import com.ibm.icu.text.Transform;
1316
import com.ibm.icu.text.UTF16;
1417
import com.ibm.icu.text.UnicodeSet;
18+
import com.ibm.icu.text.UnicodeSetIterator;
19+
import com.ibm.icu.util.LocaleData;
1520
import com.ibm.icu.util.ULocale;
1621
import com.ibm.icu.util.VersionInfo;
1722
import java.nio.charset.Charset;
1823
import java.util.ArrayList;
1924
import java.util.Arrays;
25+
import java.util.Collection;
2026
import java.util.List;
2127
import java.util.Locale;
28+
import java.util.Map.Entry;
29+
import java.util.Set;
30+
import java.util.TreeSet;
2231
import org.unicode.idna.Idna.IdnaType;
2332
import org.unicode.idna.Idna2003;
2433
import org.unicode.idna.Idna2008;
@@ -28,9 +37,13 @@
2837
import org.unicode.props.UnicodeProperty.BaseProperty;
2938
import org.unicode.props.UnicodeProperty.Factory;
3039
import org.unicode.props.UnicodeProperty.SimpleProperty;
40+
import org.unicode.text.utility.Utility;
3141

3242
public class XPropertyFactory extends UnicodeProperty.Factory {
3343

44+
private static final Joiner JOIN_COMMAS = Joiner.on(",");
45+
private static final boolean DEBUG_MULTI = false;
46+
3447
static final UnicodeSet ALL =
3548
new UnicodeSet("[[:^C:][:Cc:][:Cf:][:noncharactercodepoint:]]").freeze();
3649

@@ -96,6 +109,7 @@ public final Factory add2(UnicodeProperty sp) {
96109
add(
97110
new CodepointTransformProperty(
98111
new Transform<Integer, String>() {
112+
@Override
99113
public String transform(Integer source) {
100114
return Normalizer.normalize(source, Normalizer.NFC);
101115
}
@@ -105,6 +119,7 @@ public String transform(Integer source) {
105119
add(
106120
new CodepointTransformProperty(
107121
new Transform<Integer, String>() {
122+
@Override
108123
public String transform(Integer source) {
109124
return Normalizer.normalize(source, Normalizer.NFD);
110125
}
@@ -114,6 +129,7 @@ public String transform(Integer source) {
114129
add(
115130
new CodepointTransformProperty(
116131
new Transform<Integer, String>() {
132+
@Override
117133
public String transform(Integer source) {
118134
return Normalizer.normalize(source, Normalizer.NFKC);
119135
}
@@ -123,6 +139,7 @@ public String transform(Integer source) {
123139
add(
124140
new CodepointTransformProperty(
125141
new Transform<Integer, String>() {
142+
@Override
126143
public String transform(Integer source) {
127144
return Normalizer.normalize(source, Normalizer.NFKD);
128145
}
@@ -133,6 +150,7 @@ public String transform(Integer source) {
133150
add(
134151
new StringTransformProperty(
135152
new StringTransform() {
153+
@Override
136154
public String transform(String source) {
137155
return UCharacter.foldCase(source, true);
138156
}
@@ -142,6 +160,7 @@ public String transform(String source) {
142160
add(
143161
new StringTransformProperty(
144162
new StringTransform() {
163+
@Override
145164
public String transform(String source) {
146165
return UCharacter.toLowerCase(ULocale.ROOT, source);
147166
}
@@ -151,6 +170,7 @@ public String transform(String source) {
151170
add(
152171
new StringTransformProperty(
153172
new StringTransform() {
173+
@Override
154174
public String transform(String source) {
155175
return UCharacter.toUpperCase(ULocale.ROOT, source);
156176
}
@@ -160,6 +180,7 @@ public String transform(String source) {
160180
add(
161181
new StringTransformProperty(
162182
new StringTransform() {
183+
@Override
163184
public String transform(String source) {
164185
return UCharacter.toTitleCase(ULocale.ROOT, source, null);
165186
}
@@ -170,6 +191,7 @@ public String transform(String source) {
170191
add(
171192
new StringTransformProperty(
172193
new StringTransform() {
194+
@Override
173195
public String transform(String source) {
174196
StringBuilder b = new StringBuilder();
175197
for (int cp : CharSequences.codePoints(source)) {
@@ -184,6 +206,7 @@ public String transform(String source) {
184206
add(
185207
new StringTransformProperty(
186208
new StringTransform() {
209+
@Override
187210
public String transform(String source) {
188211
String result = NFM.nfm.get(source);
189212
return result == null ? source : result;
@@ -201,6 +224,7 @@ public String transform(String source) {
201224
add(
202225
new CodepointTransformProperty(
203226
new Transform<Integer, String>() {
227+
@Override
204228
public String transform(Integer source) {
205229
return UnicodeUtilities.getSubheader().getSubheader(source);
206230
}
@@ -239,6 +263,9 @@ public String transform(Integer source) {
239263
.setMain("bmp", "bmp", UnicodeProperty.BINARY, "6.0"));
240264

241265
addCollationProperty();
266+
addExamplarProperty(LocaleData.ES_STANDARD, "exem", "exemplar");
267+
addExamplarProperty(LocaleData.ES_AUXILIARY, "exema", "exemplar_aux");
268+
addExamplarProperty(LocaleData.ES_PUNCTUATION, "exemp", "exemplar_punct");
242269

243270
// set up the special script property
244271
UnicodeProperty scriptProp = base.getProperty("sc");
@@ -251,7 +278,8 @@ public String transform(Integer source) {
251278
.setMain("Script_Extensions", "scx", UnicodeProperty.ENUMERATED, "1.1")
252279
.addValueAliases(
253280
ScriptTester.getScriptSpecialsAlternates(),
254-
AliasAddAction.IGNORE_IF_MISSING));
281+
AliasAddAction.IGNORE_IF_MISSING)
282+
.setMultivalued(true));
255283

256284
CachedProps cp = CachedProps.CACHED_PROPS;
257285
for (String prop : cp.getAvailable()) {
@@ -289,6 +317,81 @@ public String transform(Integer source) {
289317
.setMain("RGI_Emoji", "RGI_Emoji", UnicodeProperty.BINARY, "13.0"));
290318
}
291319

320+
/**
 * Builds and registers a multivalued property whose value for a code point is the comma-joined
 * list of language tags of the locales whose exemplar set (of the requested type) contains that
 * code point.
 *
 * <p>Locales that have a country or a variant are skipped. A locale that has a script is also
 * skipped when its exemplar set equals the exemplar set of its bare language. Strings in an
 * exemplar set are flattened: each code point of the string is recorded individually.
 *
 * <p>NOTE(review): "Examplar" in the method name is presumably a typo for "Exemplar"; the name
 * is left alone because the call sites use it.
 *
 * @param exemplarType exemplar-set type forwarded to {@code LocaleData.getExemplarSet} (the
 *     visible callers pass {@code LocaleData.ES_*} constants)
 * @param propertyAbbreviation passed as the second argument of {@code setMain} (short name)
 * @param propertyName passed as the first argument of {@code setMain}; also printed as the
 *     heading of the debug dump
 */
private void addExamplarProperty(
321+
int exemplarType, String propertyAbbreviation, String propertyName) {
322+
// code point -> language tags of every locale whose exemplar set contains it
Multimap<Integer, String> data = TreeMultimap.create();
323+
// every language tag that contributed data; becomes the list of property value names
Set<String> localeSet = new TreeSet<>();
324+
325+
for (ULocale ulocale : ULocale.getAvailableLocales()) {
326+
if (!ulocale.getCountry().isEmpty() || !ulocale.getVariant().isEmpty()) {
327+
continue;
328+
// NOTE(review): this note sits after the `continue` and was truncated in the original.
329+
// Presumably: we want to skip cases where the characters are already in the parent
// locale, but there is no call such as `ULocale parentLocale = ulocale.getParent();`.
330+
}
331+
UnicodeSet exemplarSet = LocaleData.getExemplarSet(ulocale, 0, exemplarType);
332+
if (!ulocale.getScript().isEmpty()) {
333+
// we can't find out the parent locale or defaultContent locale in ICU, so we hack
334+
// it: compare against the exemplars of the bare language and skip exact duplicates
335+
String langLocale = ulocale.getLanguage();
336+
UnicodeSet langExemplarSet =
337+
LocaleData.getExemplarSet(new ULocale(langLocale), 0, exemplarType);
338+
if (langExemplarSet.equals(exemplarSet)) {
339+
continue;
340+
}
341+
}
342+
String locale = ulocale.toLanguageTag();
343+
localeSet.add(locale);
344+
// walk the exemplar set range by range; string elements are reported via IS_STRING
for (UnicodeSetIterator it = new UnicodeSetIterator(exemplarSet); it.nextRange(); ) {
345+
if (it.codepoint == UnicodeSetIterator.IS_STRING) {
346+
// flatten: record each code point of the string separately
347+
int cp = 0;
348+
// advance by the char count of the code point just read, so that supplementary
// code points are not split
for (int i = 0; i < it.string.length(); i += Character.charCount(cp)) {
349+
cp = it.string.codePointAt(i);
350+
data.put(cp, locale);
351+
}
352+
} else {
353+
for (int cp = it.codepoint; cp <= it.codepointEnd; ++cp) {
354+
data.put(cp, locale);
355+
}
356+
}
357+
}
358+
}
359+
360+
// convert to UnicodeMap: one comma-joined string of language tags per code point
361+
UnicodeMap<String> unicodeMap = new UnicodeMap<>();
362+
for (Entry<Integer, Collection<String>> entry : data.asMap().entrySet()) {
363+
String value = JOIN_COMMAS.join(entry.getValue()).intern();
364+
unicodeMap.put(entry.getKey(), value);
365+
}
366+
// optional dump of the resulting ranges, controlled by the DEBUG_MULTI constant
if (DEBUG_MULTI) {
367+
System.out.println("\n" + propertyName);
368+
for (UnicodeMap.EntryRange<String> entry : unicodeMap.entryRanges()) {
369+
System.out.println(
370+
Utility.hex(entry.codepoint)
371+
+ (entry.codepoint == entry.codepointEnd
372+
? ""
373+
: "-" + Utility.hex(entry.codepointEnd))
374+
+ " ;\t"
375+
+ entry.value);
376+
}
377+
}
378+
379+
// put locales into right format: the same array is used for both rows of the alias table
380+
String[] localeList = localeSet.toArray(new String[localeSet.size()]);
381+
String[][] locales = new String[][] {localeList, localeList}; // abbreviations are the same
382+
383+
// register the property and mark it multivalued
add(
384+
new UnicodeProperty.UnicodeMapProperty()
385+
.set(unicodeMap)
386+
.setMain(
387+
propertyName,
388+
propertyAbbreviation,
389+
UnicodeProperty.ENUMERATED,
390+
"1.1")
391+
.addValueAliases(locales, AliasAddAction.ADD_MAIN_ALIAS)
392+
.setMultivalued(true));
393+
}
394+
292395
private void addCollationProperty() {
293396
RuleBasedCollator c = UnicodeSetUtilities.RAW_COLLATOR;
294397
// (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
@@ -652,6 +755,7 @@ public StringTransformProperty(
652755
setUniformUnassigned(hasUniformUnassigned);
653756
}
654757

758+
@Override
655759
protected String _getValue(int codepoint) {
656760
return transform.transform(UTF16.valueOf(codepoint));
657761
}
@@ -666,6 +770,7 @@ public CodepointTransformProperty(
666770
setUniformUnassigned(hasUniformUnassigned);
667771
}
668772

773+
@Override
669774
protected String _getValue(int codepoint) {
670775
return transform.transform(codepoint);
671776
}
@@ -682,6 +787,7 @@ public static class EncodingProperty extends SimpleProperty {
682787
encoder = new CharEncoder(charset, false, false);
683788
}
684789

790+
@Override
685791
protected String _getValue(int codepoint) {
686792
int len = encoder.getValue(codepoint, temp, 0);
687793
if (len < 0) {
@@ -697,6 +803,7 @@ protected String _getValue(int codepoint) {
697803
return result.toString();
698804
}
699805

806+
@Override
700807
public boolean isDefault(int codepoint) {
701808
int len = encoder.getValue(codepoint, temp, 0);
702809
return len < 0;
@@ -716,6 +823,7 @@ public static class EncodingPropertyBoolean extends SimpleProperty {
716823
encoder = new CharEncoder(charset, true, true);
717824
}
718825

826+
@Override
719827
protected String _getValue(int codepoint) {
720828
return (encoder.getValue(codepoint, null, 0) > 0) ? "Yes" : "No";
721829
}
@@ -731,6 +839,7 @@ public XPropertyFactory.UnicodeSetProperty set(UnicodeSet set) {
731839
return this;
732840
}
733841

842+
@Override
734843
protected UnicodeMap<String> _getUnicodeMap() {
735844
UnicodeMap<String> result = new UnicodeMap<String>();
736845
result.putAll(unicodeSet, "Yes");
@@ -743,10 +852,12 @@ public XPropertyFactory.UnicodeSetProperty set(String string) {
743852
return set(new UnicodeSet(string).freeze());
744853
}
745854

855+
@Override
746856
protected String _getValue(int codepoint) {
747857
return YESNO_ARRAY[unicodeSet.contains(codepoint) ? 0 : 1];
748858
}
749859

860+
@Override
750861
protected List _getAvailableValues(List result) {
751862
return YESNO;
752863
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
package org.unicode.jsptest;
2+
3+
import com.ibm.icu.text.UnicodeSet;
4+
import org.junit.jupiter.api.Test;
5+
import org.unicode.jsp.UnicodeSetUtilities;
6+
import org.unicode.unittest.TestFmwkMinusMinus;
7+
8+
/**
 * Tests for multivalued properties ({@code scx} and {@code exem}) as parsed by {@code
 * UnicodeSetUtilities.parseUnicodeSet}.
 */
public class TestMultivalued extends TestFmwkMinusMinus {
9+
/** Checks that {@code \p{scx=deva}} contains two expected characters and excludes a third. */
@Test
10+
public void TestScx1Script() {
11+
String unicodeSetString = "\\p{scx=deva}";
12+
UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString);
13+
14+
// NOTE(review): the letters in the trailing comment are presumably script initials
// (Bengali & Devanagari, ...) — confirm.
UnicodeSet mustContain = new UnicodeSet("[ᳵ।]"); // one character B&D, other B&D&D&G&...
15+
assertTrue(unicodeSetString + " contains " + mustContain, parsed.containsAll(mustContain));
16+
17+
UnicodeSet mustNotContain = new UnicodeSet("[ক]"); // one Bangla character
18+
// containsAll works as a negative check here only because the set has one element;
// with more elements it would only prove that not all of them are contained.
assertFalse(
19+
unicodeSetString + " !contains " + mustNotContain,
20+
parsed.containsAll(mustNotContain));
21+
}
22+
23+
/**
 * Checks that a comma-separated value for a multivalued property is rejected with the
 * expected exception message.
 */
@Test
24+
public void TestScxMulti() {
25+
String unicodeSetString = "\\p{scx=beng,deva}";
26+
// stays null when no exception is thrown, which makes the assertEquals below fail
String exceptionMessage = null;
27+
try {
28+
// NOTE(review): `parsed` is never read; the call is made only for its exception
UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString);
29+
} catch (Exception e) {
30+
// NOTE(review): catches any Exception; only the message is compared below
exceptionMessage = e.getMessage();
31+
}
32+
assertEquals(
33+
"Expected exception",
34+
"Multivalued property values can't contain commas.",
35+
exceptionMessage);
36+
}
37+
38+
/** Checks that {@code \p{exem=da}} contains one expected character and excludes another. */
@Test
39+
public void TestExemplars() {
40+
String unicodeSetString = "\\p{exem=da}";
41+
UnicodeSet parsed = UnicodeSetUtilities.parseUnicodeSet(unicodeSetString);
42+
43+
UnicodeSet mustContain = new UnicodeSet("[æ]");
44+
assertTrue(unicodeSetString + " contains " + mustContain, parsed.containsAll(mustContain));
45+
46+
UnicodeSet mustNotContain = new UnicodeSet("[ç]");
47+
// single-element set, so containsAll is equivalent to a contains check here
assertFalse(
48+
unicodeSetString + " !contains " + mustNotContain,
49+
parsed.containsAll(mustNotContain));
50+
}
51+
}

UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java

-8
Original file line numberDiff line numberDiff line change
@@ -413,14 +413,6 @@ public void TestPerMill(final String name, final Charset charset) {
413413
}
414414
}
415415

416-
@Test
417-
public void TestScriptSpecials() {
418-
// UnicodeSet set = UnicodeSetUtilities.parseUnicodeSet("[:scs=Hant:]");
419-
// assertNotEquals("Hant", 0, set.size());
420-
UnicodeSet set2 = UnicodeSetUtilities.parseUnicodeSet("[:scx=Arab,Syrc:]");
421-
assertNotEquals("Arab Syrc", 0, set2.size());
422-
}
423-
424416
@Test
425417
public void TestGC() {
426418
Map<String, R2<String, UnicodeSet>> SPECIAL_GC =

0 commit comments

Comments
 (0)