Skip to content

Commit c4fc6a9

Browse files
committed
Fix GH-20262: array_unique() SORT_REGULAR fails to deduplicate with mixed strings
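array_unique() with SORT_REGULAR was failing to remove duplicate numeric strings when mixed with alphanumeric strings, because the loose comparison is not transitive and the sort-based algorithm only compares neighbouring elements after sorting. Implemented a hash-bucketing path for SORT_REGULAR (256 fixed buckets, with a linear scan of the values already kept in the matching bucket) that keeps using the SORT_REGULAR loose comparison to decide equality. The sort is replaced by a single pass that is close to O(n) while buckets stay small; because the bucket count is fixed, the worst case remains O(n²). Closes GH-20262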
1 parent 5887c76 commit c4fc6a9

File tree

3 files changed

+361
-0
lines changed

3 files changed

+361
-0
lines changed

ext/standard/array.c

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4964,6 +4964,9 @@ PHP_FUNCTION(array_unique)
49644964
bucket_compare_func_t cmp;
49654965
struct bucketindex *arTmp, *cmpdata, *lastkept;
49664966
uint32_t i, idx;
4967+
zend_long num_key;
4968+
zend_string *str_key;
4969+
zval *val;
49674970

49684971
ZEND_PARSE_PARAMETERS_START(1, 2)
49694972
Z_PARAM_ARRAY(array)
@@ -4976,6 +4979,131 @@ PHP_FUNCTION(array_unique)
49764979
return;
49774980
}
49784981

4982+
if (sort_type == PHP_SORT_REGULAR) {
	/*
	 * SORT_REGULAR fast path.
	 *
	 * Values are grouped into a fixed number of hash buckets; each incoming
	 * value is compared (with the SORT_REGULAR comparison callback) only
	 * against the values already kept in its own bucket. The first occurrence
	 * of a value is kept together with its original key.
	 *
	 * Correctness requirement: two values that the callback reports as equal
	 * MUST produce the same hash, otherwise they are never compared. The hash
	 * is therefore computed from a normalised form of the value, not from its
	 * representation.
	 *
	 * NOTE(review): booleans are only bucketed next to 0/1. Under loose
	 * comparison `true` also equals other truthy values (e.g. 2, "abc",
	 * non-empty arrays), which hash elsewhere and are not compared against
	 * it here - confirm whether that is acceptable.
	 */
	enum { UNIQUE_HASH_BUCKETS = 256 };

	typedef struct {
		zval **values;     /* values already kept that hashed to this bucket */
		uint32_t count;    /* number of used slots in `values` */
		uint32_t capacity; /* number of allocated slots in `values` */
	} value_bucket;

	value_bucket *buckets = ecalloc(UNIQUE_HASH_BUCKETS, sizeof(value_bucket));
	cmp = php_get_data_compare_func_unstable(sort_type, 0);
	array_init(return_value);

	ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(array), num_key, str_key, val) {
		/* Hash and compare the referenced value, not the reference wrapper. */
		zval *deref_val = val;
		ZVAL_DEREF(deref_val);

		zend_ulong hash = 0;
		/* Every numeric-looking value is funnelled through one double so that
		 * ints, floats and numeric strings share a single hashing rule. */
		double numeric = 0.0;
		bool has_numeric = false;

		switch (Z_TYPE_P(deref_val)) {
			case IS_LONG:
				numeric = (double) Z_LVAL_P(deref_val);
				has_numeric = true;
				break;

			case IS_DOUBLE:
				numeric = Z_DVAL_P(deref_val);
				has_numeric = true;
				break;

			case IS_TRUE:
				hash = 1; /* same bucket as integer 1 */
				break;

			case IS_FALSE:
			case IS_NULL:
				hash = 0; /* same bucket as integer 0 and the empty string */
				break;

			case IS_STRING: {
				zend_long lval;
				double dval;
				uint8_t type = is_numeric_string(Z_STRVAL_P(deref_val), Z_STRLEN_P(deref_val), &lval, &dval, 0);

				if (type == IS_LONG) {
					/* "5" and "05" parse to the same integer */
					numeric = (double) lval;
					has_numeric = true;
				} else if (type == IS_DOUBLE) {
					numeric = dval;
					has_numeric = true;
				} else if (Z_STRLEN_P(deref_val) != 0) {
					/* Non-numeric, non-empty string: use the string hash. */
					hash = zend_string_hash_val(Z_STR_P(deref_val));
				}
				/* The empty string keeps hash 0 so it meets false/null. */
				break;
			}

			case IS_OBJECT:
				/* Objects are grouped by class name.
				 * NOTE(review): objects of different classes with a custom
				 * comparison, or objects compared against scalars, would end
				 * up in different buckets - confirm this cannot matter. */
				hash = zend_string_hash_val(Z_OBJCE_P(deref_val)->name);
				break;

			case IS_ARRAY:
				/* Only the element count is used. Array equality (==) ignores
				 * element order and compares elements loosely, so neither the
				 * "first" element nor the internal array pointer may
				 * influence the bucket. */
				hash = zend_hash_num_elements(Z_ARRVAL_P(deref_val));
				break;

			default:
				hash = Z_TYPE_P(deref_val);
				break;
		}

		/*
		 * Truncate numerics to an integer hash. The cast is only performed
		 * when the value fits into zend_long; NaN, +/-INF and out-of-range
		 * magnitudes fail the range test and stay in bucket 0, which avoids
		 * the undefined float-to-integer conversion.
		 */
		if (has_numeric && numeric >= (double) ZEND_LONG_MIN && numeric < (double) ZEND_LONG_MAX) {
			hash = (zend_ulong) (zend_long) numeric;
		}

		value_bucket *bucket = &buckets[hash % UNIQUE_HASH_BUCKETS];

		/* Look for an already kept value that compares equal. */
		bool is_duplicate = false;
		Bucket candidate = {.val = *deref_val};
		for (uint32_t slot = 0; slot < bucket->count; slot++) {
			zval *existing_deref = bucket->values[slot];
			ZVAL_DEREF(existing_deref);
			Bucket existing = {.val = *existing_deref};
			if (cmp(&candidate, &existing) == 0) {
				is_duplicate = true;
				break;
			}
		}

		if (is_duplicate) {
			continue;
		}

		/* Remember the value in its bucket, growing the slot array on demand
		 * (4 slots initially, doubled afterwards). */
		if (bucket->count >= bucket->capacity) {
			bucket->capacity = bucket->capacity ? bucket->capacity * 2 : 4;
			bucket->values = erealloc(bucket->values, bucket->capacity * sizeof(zval*));
		}
		bucket->values[bucket->count++] = val;

		/* Copy into the result under the original key. A reference that has
		 * a refcount of 1 is unwrapped before it is stored. */
		if (UNEXPECTED(Z_ISREF_P(val) && Z_REFCOUNT_P(val) == 1)) {
			ZVAL_DEREF(val);
		}
		Z_TRY_ADDREF_P(val);

		if (str_key) {
			zend_hash_add_new(Z_ARRVAL_P(return_value), str_key, val);
		} else {
			zend_hash_index_add_new(Z_ARRVAL_P(return_value), num_key, val);
		}
	} ZEND_HASH_FOREACH_END();

	/* Release the per-bucket slot arrays, then the bucket table itself. */
	for (uint32_t b = 0; b < UNIQUE_HASH_BUCKETS; b++) {
		if (buckets[b].values) {
			efree(buckets[b].values);
		}
	}
	efree(buckets);

	return;
}
5106+
49795107
if (sort_type == PHP_SORT_STRING) {
49805108
HashTable seen;
49815109
zend_long num_key;
Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
--TEST--
2+
Test array_unique() function : SORT_REGULAR type coercion behavior
3+
--FILE--
4+
<?php
5+
echo "*** Testing array_unique() with SORT_REGULAR ***\n";
6+
7+
// Test 1: Integer and string representations (coerce)
8+
echo "\n-- Integer and string coercion --\n";
9+
var_dump(array_unique([1, "1", 2, "2"], SORT_REGULAR));
10+
11+
// Test 2: Boolean coercion
12+
echo "\n-- Boolean coercion --\n";
13+
var_dump(array_unique([true, 1, false, 0], SORT_REGULAR));
14+
15+
// Test 3: NULL coercion with empty string and "0"
16+
echo "\n-- NULL coercion --\n";
17+
var_dump(array_unique([null, "", false, 0, "0"], SORT_REGULAR));
18+
19+
// Test 4: Float coercion
20+
echo "\n-- Float coercion --\n";
21+
var_dump(array_unique([1, 1.0, "1", "1.0"], SORT_REGULAR));
22+
23+
// Test 5: Numeric strings coerce
24+
echo "\n-- Numeric strings --\n";
25+
var_dump(array_unique(["10", 10, "10.0", 10.0], SORT_REGULAR));
26+
27+
// Test 6: Leading zeros do not make numeric strings distinct ("05", "5" and 5 compare equal)
28+
echo "\n-- Leading zeros --\n";
29+
var_dump(array_unique(["05", "5", 5], SORT_REGULAR));
30+
31+
// Test 7: Partial numeric strings don't coerce
32+
echo "\n-- Partial numeric strings --\n";
33+
var_dump(array_unique(["5abc", "5", 5], SORT_REGULAR));
34+
35+
// Test 8: Whitespace in numeric strings
36+
echo "\n-- Whitespace in numeric strings --\n";
37+
var_dump(array_unique(["5", " 5", "5 ", 5], SORT_REGULAR));
38+
39+
// Test 9: Case sensitivity for non-numeric strings
40+
echo "\n-- Case sensitivity --\n";
41+
var_dump(array_unique(["abc", "ABC", "Abc"], SORT_REGULAR));
42+
43+
// Test 10: Exponential notation coerces
44+
echo "\n-- Exponential notation --\n";
45+
var_dump(array_unique([1000, "1e3", "1000", 1e3], SORT_REGULAR));
46+
47+
// Test 11: Negative numbers
48+
echo "\n-- Negative numbers --\n";
49+
var_dump(array_unique([-5, "-5", -5.0], SORT_REGULAR));
50+
51+
// Test 12: Arrays as values
52+
echo "\n-- Arrays --\n";
53+
var_dump(array_unique([[1, 2], [1, 2], [1, 3]], SORT_REGULAR));
54+
55+
// Test 13: NaN handling (NaN != NaN)
56+
echo "\n-- NaN handling --\n";
57+
var_dump(array_unique([NAN, NAN, 1], SORT_REGULAR));
58+
59+
// Test 14: INF handling
60+
echo "\n-- INF handling --\n";
61+
var_dump(array_unique([INF, INF, -INF, -INF], SORT_REGULAR));
62+
63+
// Test 15: Bug GH-20262 - mixed numeric and alphanumeric
64+
echo "\n-- Bug GH-20262 case --\n";
65+
var_dump(array_unique(['5', '10', '3A', '5'], SORT_REGULAR));
66+
67+
// Test 16: SORT_REGULAR vs SORT_STRING comparison
68+
echo "\n-- SORT_REGULAR vs SORT_STRING --\n";
69+
$input = [true, 1, "1"];
70+
echo "SORT_REGULAR: ";
71+
var_dump(array_unique($input, SORT_REGULAR));
72+
echo "SORT_STRING: ";
73+
var_dump(array_unique($input, SORT_STRING));
74+
75+
echo "\nDone\n";
76+
?>
77+
--EXPECT--
78+
*** Testing array_unique() with SORT_REGULAR ***
79+
80+
-- Integer and string coercion --
81+
array(2) {
82+
[0]=>
83+
int(1)
84+
[2]=>
85+
int(2)
86+
}
87+
88+
-- Boolean coercion --
89+
array(2) {
90+
[0]=>
91+
bool(true)
92+
[2]=>
93+
bool(false)
94+
}
95+
96+
-- NULL coercion --
97+
array(2) {
98+
[0]=>
99+
NULL
100+
[4]=>
101+
string(1) "0"
102+
}
103+
104+
-- Float coercion --
105+
array(1) {
106+
[0]=>
107+
int(1)
108+
}
109+
110+
-- Numeric strings --
111+
array(1) {
112+
[0]=>
113+
string(2) "10"
114+
}
115+
116+
-- Leading zeros --
117+
array(1) {
118+
[0]=>
119+
string(2) "05"
120+
}
121+
122+
-- Partial numeric strings --
123+
array(2) {
124+
[0]=>
125+
string(4) "5abc"
126+
[1]=>
127+
string(1) "5"
128+
}
129+
130+
-- Whitespace in numeric strings --
131+
array(1) {
132+
[0]=>
133+
string(1) "5"
134+
}
135+
136+
-- Case sensitivity --
137+
array(3) {
138+
[0]=>
139+
string(3) "abc"
140+
[1]=>
141+
string(3) "ABC"
142+
[2]=>
143+
string(3) "Abc"
144+
}
145+
146+
-- Exponential notation --
147+
array(1) {
148+
[0]=>
149+
int(1000)
150+
}
151+
152+
-- Negative numbers --
153+
array(1) {
154+
[0]=>
155+
int(-5)
156+
}
157+
158+
-- Arrays --
159+
array(2) {
160+
[0]=>
161+
array(2) {
162+
[0]=>
163+
int(1)
164+
[1]=>
165+
int(2)
166+
}
167+
[2]=>
168+
array(2) {
169+
[0]=>
170+
int(1)
171+
[1]=>
172+
int(3)
173+
}
174+
}
175+
176+
-- NaN handling --
177+
array(3) {
178+
[0]=>
179+
float(NAN)
180+
[1]=>
181+
float(NAN)
182+
[2]=>
183+
int(1)
184+
}
185+
186+
-- INF handling --
187+
array(2) {
188+
[0]=>
189+
float(INF)
190+
[2]=>
191+
float(-INF)
192+
}
193+
194+
-- Bug GH-20262 case --
195+
array(3) {
196+
[0]=>
197+
string(1) "5"
198+
[1]=>
199+
string(2) "10"
200+
[2]=>
201+
string(2) "3A"
202+
}
203+
204+
-- SORT_REGULAR vs SORT_STRING --
205+
SORT_REGULAR: array(1) {
206+
[0]=>
207+
bool(true)
208+
}
209+
SORT_STRING: array(1) {
210+
[0]=>
211+
bool(true)
212+
}
213+
214+
Done
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
--TEST--
2+
Bug GH-20262 (array_unique() with SORT_REGULAR fails to remove duplicates with mixed strings)
3+
--FILE--
4+
<?php
5+
6+
// Original bug report case
7+
$units = ['5', '10', '3A', '5'];
8+
var_dump(array_unique($units, SORT_REGULAR));
9+
10+
?>
11+
--EXPECT--
12+
array(3) {
13+
[0]=>
14+
string(1) "5"
15+
[1]=>
16+
string(2) "10"
17+
[2]=>
18+
string(2) "3A"
19+
}

0 commit comments

Comments
 (0)