Skip to content

Commit acfcfde

Browse files
committed
[RFC] Add a locale for grapheme case-insensitive functions
1 parent 142e378 commit acfcfde

10 files changed

+107
-38
lines changed

UPGRADING

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,9 @@ PHP 8.5 UPGRADE NOTES
299299
TransLiterator::getErrorCode(), and TransLiterator::getErrorMessage()
300300
have dropped the false from the return type union. Returning false
301301
was actually never possible.
302-
302+
. grapheme_strpos(), grapheme_stripos(), grapheme_strrpos(),
303+
grapheme_strripos(), grapheme_strstr(), grapheme_stristr() and
304+
grapheme_levenshtein() now accept an optional $locale parameter.
303305
- LDAP:
304306
. ldap_get_option() now accepts a NULL connection, as ldap_set_option(),
305307
to allow retrieval of global options.

ext/intl/grapheme/grapheme_string.c

Lines changed: 34 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -81,19 +81,20 @@ PHP_FUNCTION(grapheme_strlen)
8181
/* {{{ Find position of first occurrence of a string within another */
8282
PHP_FUNCTION(grapheme_strpos)
8383
{
84-
char *haystack, *needle;
85-
size_t haystack_len, needle_len;
84+
char *haystack, *needle, *locale = "";
85+
size_t haystack_len, needle_len, locale_len;
8686
const char *found;
8787
zend_long loffset = 0;
8888
int32_t offset = 0;
8989
size_t noffset = 0;
9090
zend_long ret_pos;
9191

92-
ZEND_PARSE_PARAMETERS_START(2, 3)
92+
ZEND_PARSE_PARAMETERS_START(2, 4)
9393
Z_PARAM_STRING(haystack, haystack_len)
9494
Z_PARAM_STRING(needle, needle_len)
9595
Z_PARAM_OPTIONAL
9696
Z_PARAM_LONG(loffset)
97+
Z_PARAM_STRING(locale, locale_len)
9798
ZEND_PARSE_PARAMETERS_END();
9899

99100
if ( OUTSIDE_STRING(loffset, haystack_len) ) {
@@ -121,7 +122,7 @@ PHP_FUNCTION(grapheme_strpos)
121122
}
122123

123124
/* do utf16 part of the strpos */
124-
ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
125+
ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0, locale /* last */ );
125126

126127
if ( ret_pos >= 0 ) {
127128
RETURN_LONG(ret_pos);
@@ -134,19 +135,20 @@ PHP_FUNCTION(grapheme_strpos)
134135
/* {{{ Find position of first occurrence of a string within another, ignoring case differences */
135136
PHP_FUNCTION(grapheme_stripos)
136137
{
137-
char *haystack, *needle;
138-
size_t haystack_len, needle_len;
138+
char *haystack, *needle, *locale = "";
139+
size_t haystack_len, needle_len, locale_len = 0;
139140
const char *found;
140141
zend_long loffset = 0;
141142
int32_t offset = 0;
142143
zend_long ret_pos;
143144
int is_ascii;
144145

145-
ZEND_PARSE_PARAMETERS_START(2, 3)
146+
ZEND_PARSE_PARAMETERS_START(2, 4)
146147
Z_PARAM_STRING(haystack, haystack_len)
147148
Z_PARAM_STRING(needle, needle_len)
148149
Z_PARAM_OPTIONAL
149150
Z_PARAM_LONG(loffset)
151+
Z_PARAM_STRING(locale, locale_len)
150152
ZEND_PARSE_PARAMETERS_END();
151153

152154
if ( OUTSIDE_STRING(loffset, haystack_len) ) {
@@ -185,7 +187,7 @@ PHP_FUNCTION(grapheme_stripos)
185187
}
186188

187189
/* do utf16 part of the strpos */
188-
ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
190+
ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0, locale /*last */ );
189191

190192
if ( ret_pos >= 0 ) {
191193
RETURN_LONG(ret_pos);
@@ -200,17 +202,19 @@ PHP_FUNCTION(grapheme_stripos)
200202
PHP_FUNCTION(grapheme_strrpos)
201203
{
202204
char *haystack, *needle;
203-
size_t haystack_len, needle_len;
205+
char *locale = "";
206+
size_t haystack_len, needle_len, locale_len;
204207
zend_long loffset = 0;
205208
int32_t offset = 0;
206209
zend_long ret_pos;
207210
int is_ascii;
208211

209-
ZEND_PARSE_PARAMETERS_START(2, 3)
212+
ZEND_PARSE_PARAMETERS_START(2, 4)
210213
Z_PARAM_STRING(haystack, haystack_len)
211214
Z_PARAM_STRING(needle, needle_len)
212215
Z_PARAM_OPTIONAL
213216
Z_PARAM_LONG(loffset)
217+
Z_PARAM_STRING(locale, locale_len)
214218
ZEND_PARSE_PARAMETERS_END();
215219

216220
if ( OUTSIDE_STRING(loffset, haystack_len) ) {
@@ -242,7 +246,7 @@ PHP_FUNCTION(grapheme_strrpos)
242246
/* else we need to continue via utf16 */
243247
}
244248

245-
ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
249+
ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1, locale /* last */);
246250

247251
if ( ret_pos >= 0 ) {
248252
RETURN_LONG(ret_pos);
@@ -257,18 +261,19 @@ PHP_FUNCTION(grapheme_strrpos)
257261
/* {{{ Find position of last occurrence of a string within another, ignoring case */
258262
PHP_FUNCTION(grapheme_strripos)
259263
{
260-
char *haystack, *needle;
261-
size_t haystack_len, needle_len;
264+
char *haystack, *needle, *locale = "";
265+
size_t haystack_len, needle_len, locale_len = 0;
262266
zend_long loffset = 0;
263267
int32_t offset = 0;
264268
zend_long ret_pos;
265269
int is_ascii;
266270

267-
ZEND_PARSE_PARAMETERS_START(2, 3)
271+
ZEND_PARSE_PARAMETERS_START(2, 4)
268272
Z_PARAM_STRING(haystack, haystack_len)
269273
Z_PARAM_STRING(needle, needle_len)
270274
Z_PARAM_OPTIONAL
271275
Z_PARAM_LONG(loffset)
276+
Z_PARAM_STRING(locale, locale_len)
272277
ZEND_PARSE_PARAMETERS_END();
273278

274279
if ( OUTSIDE_STRING(loffset, haystack_len) ) {
@@ -309,7 +314,7 @@ PHP_FUNCTION(grapheme_strripos)
309314
/* else we need to continue via utf16 */
310315
}
311316

312-
ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* f_ignore_case */, 1 /*last */);
317+
ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* f_ignore_case */, 1, locale /*last */);
313318

314319
if ( ret_pos >= 0 ) {
315320
RETURN_LONG(ret_pos);
@@ -324,10 +329,10 @@ PHP_FUNCTION(grapheme_strripos)
324329
/* {{{ Returns part of a string */
325330
PHP_FUNCTION(grapheme_substr)
326331
{
327-
char *str;
332+
char *str, *locale = "";
328333
zend_string *u8_sub_str;
329334
UChar *ustr;
330-
size_t str_len;
335+
size_t str_len, locale_len;
331336
int32_t ustr_len;
332337
zend_long lstart = 0, length = 0;
333338
int32_t start = 0;
@@ -339,11 +344,12 @@ PHP_FUNCTION(grapheme_substr)
339344
int32_t (*iter_func)(UBreakIterator *);
340345
bool no_length = true;
341346

342-
ZEND_PARSE_PARAMETERS_START(2, 3)
347+
ZEND_PARSE_PARAMETERS_START(2, 4)
343348
Z_PARAM_STRING(str, str_len)
344349
Z_PARAM_LONG(lstart)
345350
Z_PARAM_OPTIONAL
346351
Z_PARAM_LONG_OR_NULL(length, no_length)
352+
Z_PARAM_STRING(locale, locale_len)
347353
ZEND_PARSE_PARAMETERS_END();
348354

349355
if (lstart < INT32_MIN || lstart > INT32_MAX) {
@@ -537,17 +543,18 @@ PHP_FUNCTION(grapheme_substr)
537543
/* {{{ strstr_common_handler */
538544
static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
539545
{
540-
char *haystack, *needle;
546+
char *haystack, *needle, *locale = "";
541547
const char *found;
542-
size_t haystack_len, needle_len;
548+
size_t haystack_len, needle_len, locale_len = 0;
543549
int32_t ret_pos, uchar_pos;
544550
bool part = false;
545551

546-
ZEND_PARSE_PARAMETERS_START(2, 3)
552+
ZEND_PARSE_PARAMETERS_START(2, 4)
547553
Z_PARAM_STRING(haystack, haystack_len)
548554
Z_PARAM_STRING(needle, needle_len)
549555
Z_PARAM_OPTIONAL
550556
Z_PARAM_BOOL(part)
557+
Z_PARAM_STRING(locale, locale_len)
551558
ZEND_PARSE_PARAMETERS_END();
552559

553560
if ( !f_ignore_case ) {
@@ -574,7 +581,7 @@ static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_cas
574581
}
575582

576583
/* need to work in utf16 */
577-
ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ );
584+
ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0, locale /*last */ );
578585

579586
if ( ret_pos < 0 ) {
580587
RETURN_FALSE;
@@ -919,14 +926,17 @@ PHP_FUNCTION(grapheme_levenshtein)
919926
zend_long cost_ins = 1;
920927
zend_long cost_rep = 1;
921928
zend_long cost_del = 1;
929+
char *locale = "";
930+
size_t locale_len = 0;
922931

923-
ZEND_PARSE_PARAMETERS_START(2, 5)
932+
ZEND_PARSE_PARAMETERS_START(2, 6)
924933
Z_PARAM_STR(string1)
925934
Z_PARAM_STR(string2)
926935
Z_PARAM_OPTIONAL
927936
Z_PARAM_LONG(cost_ins)
928937
Z_PARAM_LONG(cost_rep)
929938
Z_PARAM_LONG(cost_del)
939+
Z_PARAM_STRING(locale, locale_len)
930940
ZEND_PARSE_PARAMETERS_END();
931941

932942
if (cost_ins <= 0 || cost_ins > UINT_MAX / 4) {
@@ -1043,7 +1053,7 @@ PHP_FUNCTION(grapheme_levenshtein)
10431053
RETVAL_FALSE;
10441054
goto out_bi2;
10451055
}
1046-
UCollator *collator = ucol_open("", &ustatus);
1056+
UCollator *collator = ucol_open(locale, &ustatus);
10471057
if (U_FAILURE(ustatus)) {
10481058
intl_error_set_code(NULL, ustatus);
10491059

ext/intl/grapheme/grapheme_util.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ void grapheme_substr_ascii(char *str, size_t str_len, int32_t f, int32_t l, char
9494

9595

9696
/* {{{ grapheme_strpos_utf16 - strpos/strrpos using utf16 */
97-
int32_t grapheme_strpos_utf16(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case, int last)
97+
int32_t grapheme_strpos_utf16(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case, int last, char* locale)
9898
{
9999
UChar *uhaystack = NULL, *uneedle = NULL;
100100
int32_t uhaystack_len = 0, uneedle_len = 0, char_pos, ret_pos, offset_pos = 0;
@@ -136,7 +136,7 @@ int32_t grapheme_strpos_utf16(char *haystack, size_t haystack_len, char *needle,
136136
}
137137

138138
status = U_ZERO_ERROR;
139-
src = usearch_open(uneedle, uneedle_len, uhaystack, uhaystack_len, "", bi, &status);
139+
src = usearch_open(uneedle, uneedle_len, uhaystack, uhaystack_len, locale, bi, &status);
140140
STRPOS_CHECK_STATUS(status, "Error creating search object");
141141

142142
if(f_ignore_case) {

ext/intl/grapheme/grapheme_util.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ zend_long grapheme_ascii_check(const unsigned char *day, size_t len);
2525
void grapheme_substr_ascii(char *str, size_t str_len, int32_t f, int32_t l, char **sub_str, int32_t *sub_str_len);
2626
zend_long grapheme_strrpos_ascii(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset);
2727

28-
int32_t grapheme_strrpos_utf16(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset, int f_ignore_case);
29-
int32_t grapheme_strpos_utf16(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset, int *puchar_pos, int f_ignore_case, int last);
28+
int32_t grapheme_strrpos_utf16(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset, int f_ignore_case, char* locale);
29+
int32_t grapheme_strpos_utf16(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset, int *puchar_pos, int f_ignore_case, int last, char* locale);
3030

3131
int32_t grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len );
3232

ext/intl/php_intl.stub.php

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -431,23 +431,23 @@ function numfmt_get_error_message(NumberFormatter $formatter): string {}
431431

432432
function grapheme_strlen(string $string): int|false|null {}
433433

434-
function grapheme_strpos(string $haystack, string $needle, int $offset = 0): int|false {}
434+
function grapheme_strpos(string $haystack, string $needle, int $offset = 0, string $locale = ""): int|false {}
435435

436-
function grapheme_stripos(string $haystack, string $needle, int $offset = 0): int|false {}
436+
function grapheme_stripos(string $haystack, string $needle, int $offset = 0, string $locale = ""): int|false {}
437437

438-
function grapheme_strrpos(string $haystack, string $needle, int $offset = 0): int|false {}
438+
function grapheme_strrpos(string $haystack, string $needle, int $offset = 0, string $locale = ""): int|false {}
439439

440-
function grapheme_strripos(string $haystack, string $needle, int $offset = 0): int|false {}
440+
function grapheme_strripos(string $haystack, string $needle, int $offset = 0, string $locale = ""): int|false {}
441441

442-
function grapheme_substr(string $string, int $offset, ?int $length = null): string|false {}
442+
function grapheme_substr(string $string, int $offset, ?int $length = null, string $locale = ""): string|false {}
443443

444-
function grapheme_strstr(string $haystack, string $needle, bool $beforeNeedle = false): string|false {}
444+
function grapheme_strstr(string $haystack, string $needle, bool $beforeNeedle = false, string $locale = ""): string|false {}
445445

446-
function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle = false): string|false {}
446+
function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle = false, string $locale = ""): string|false {}
447447

448448
function grapheme_str_split(string $string, int $length = 1): array|false {}
449449

450-
function grapheme_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1): int|false {}
450+
function grapheme_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1, string $locale = ""): int|false {}
451451

452452
/** @param int $next */
453453
function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {}

ext/intl/php_intl_arginfo.h

Lines changed: 5 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ext/intl/tests/grapheme_levenshtein.phpt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,14 @@ $nabe = '邊';
5858
$nabe_E0100 = "邊󠄀";
5959
var_dump(grapheme_levenshtein($nabe, $nabe_E0100));
6060

61+
// The variables $nabe and $nabe_E0101 differ because $nabe_E0101 is an IVS (Ideographic Variation Sequence).
62+
// $nabe_E0101 is U+908A followed by the variation selector U+E0101.
63+
// grapheme_levenshtein() can detect the difference only when the locale sets the collation strength to identical (u-ks-identic).
64+
// So the result is expected to be 1.
65+
$nabe = '';
66+
$nabe_E0101 = "\u{908A}\u{E0101}";
67+
var_dump(grapheme_levenshtein($nabe, $nabe_E0101, locale: "ja_JP-u-ks-identic"));
68+
6169
// combining character
6270
var_dump(grapheme_levenshtein("\u{0065}\u{0301}", "\u{00e9}"));
6371

@@ -80,6 +88,7 @@ try {
8088
} catch (ValueError $e) {
8189
echo $e->getMessage() . PHP_EOL;
8290
}
91+
8392
?>
8493
--EXPECTF--
8594
--- Equal ---
@@ -121,6 +130,7 @@ int(2)
121130
--- Variable selector ---
122131
int(1)
123132
int(0)
133+
int(1)
124134
int(0)
125135
--- Corner case ---
126136
grapheme_levenshtein(): Argument #3 ($insertion_cost) must be greater than 0 and less than or equal to %d
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
--TEST--
2+
grapheme_stripos() function locale dependency test
3+
--EXTENSIONS--
4+
intl
5+
--FILE--
6+
<?php
7+
var_dump(grapheme_stripos("abc", "abc", 0));
8+
var_dump(grapheme_stripos("i", "\u{0130}", 0, "tr_TR"));
9+
var_dump(grapheme_stripos("i", "\u{0130}", 0, "en_US"));
10+
?>
11+
--EXPECT--
12+
int(0)
13+
int(0)
14+
bool(false)
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
--TEST--
2+
grapheme_stristr() function locale dependency test
3+
--EXTENSIONS--
4+
intl
5+
--FILE--
6+
<?php
7+
var_dump(grapheme_stristr("abc", "abc", 0));
8+
var_dump(grapheme_stristr("i", "\u{0130}", 0, "tr_TR"));
9+
var_dump(grapheme_stristr("i", "\u{0130}", 0, "en_US"));
10+
?>
11+
--EXPECT--
12+
string(3) "abc"
13+
string(1) "i"
14+
bool(false)
15+

0 commit comments

Comments
 (0)