Skip to content

Commit beaba10

Browse files
committed
MDEV-39995 JSON_CONTAINS and JSON_EQUALS do not compare strings based on semantic
JSON_CONTAINS, JSON_EQUALS, and JSON_OVERLAPS used raw byte-level comparison (memcmp) for JSON string values, which meant semantically equivalent strings like "A" and "\u0041" were incorrectly treated as different. Fix: add json_string_compare() that decodes Unicode escape sequences before comparing, and fix json_normalize to produce a canonical form for strings with escapes so JSON_EQUALS works correctly. All new code of the whole pull request, including one or several files that are either new files or modified ones, are contributed under the BSD-new license. I am contributing on behalf of my employer Amazon Web Services, Inc.
1 parent b88fe40 commit beaba10

6 files changed

Lines changed: 398 additions & 12 deletions

File tree

include/json_lib.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -476,6 +476,15 @@ int json_normalize(DYNAMIC_STRING *result,
476476

477477
int json_skip_array_and_count(json_engine_t *j, int* n_item);
478478

479+
/*
480+
Compare two JSON string values semantically, taking Unicode escape
481+
sequences into account. For example, "A" and "\u0041" are considered equal.
482+
Returns 0 if the strings are equal, non-zero otherwise.
483+
*/
484+
int json_string_compare(CHARSET_INFO *cs,
485+
const uchar *str1, int len1, int escaped1,
486+
const uchar *str2, int len2, int escaped2);
487+
479488
inline static int json_scan_ended(json_engine_t *j)
480489
{
481490
return (j->state == JST_ARRAY_END && j->stack_p == 0);
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
#
2+
# MDEV-39995: JSON_CONTAINS and JSON_EQUALS do not compare strings
3+
# based on semantic
4+
#
5+
#
6+
# JSON string values with Unicode escape sequences should be treated
7+
# as semantically equal to their literal equivalents.
8+
# \u0041 is the Unicode escape for 'A'.
9+
#
10+
# JSON_CONTAINS: should return 1 for semantically equal strings
11+
SELECT JSON_CONTAINS('"A"', '"\\u0041"');
12+
JSON_CONTAINS('"A"', '"\\u0041"')
13+
1
14+
SELECT JSON_CONTAINS('"\\u0041"', '"A"');
15+
JSON_CONTAINS('"\\u0041"', '"A"')
16+
1
17+
# JSON_OVERLAPS: should return 1 for semantically equal strings
18+
SELECT JSON_OVERLAPS('"A"', '"\\u0041"');
19+
JSON_OVERLAPS('"A"', '"\\u0041"')
20+
1
21+
# JSON_EQUALS: should return 1 for semantically equal strings
22+
SELECT JSON_EQUALS('"A"', '"\\u0041"');
23+
JSON_EQUALS('"A"', '"\\u0041"')
24+
1
25+
# JSON_UNQUOTE correctly resolves the escape (proving they are the same)
26+
SELECT JSON_UNQUOTE('"A"') = JSON_UNQUOTE('"\\u0041"');
27+
JSON_UNQUOTE('"A"') = JSON_UNQUOTE('"\\u0041"')
28+
1
29+
#
30+
# Additional test from MDEV-39995 comment:
31+
# Using hex literal that represents the bytes of '"\u0041"'
32+
#
33+
SELECT JSON_UNQUOTE('"A"');
34+
JSON_UNQUOTE('"A"')
35+
A
36+
SELECT JSON_UNQUOTE(CAST(0x225C753030343122 AS CHAR));
37+
JSON_UNQUOTE(CAST(0x225C753030343122 AS CHAR))
38+
A
39+
SELECT JSON_CONTAINS('"A"', CAST(0x225C753030343122 AS CHAR));
40+
JSON_CONTAINS('"A"', CAST(0x225C753030343122 AS CHAR))
41+
1
42+
SELECT JSON_CONTAINS(JSON_QUOTE(JSON_UNQUOTE('"A"')),
43+
JSON_QUOTE(JSON_UNQUOTE(CAST(0x225C753030343122 AS CHAR))));
44+
JSON_CONTAINS(JSON_QUOTE(JSON_UNQUOTE('"A"')),
45+
JSON_QUOTE(JSON_UNQUOTE(CAST(0x225C753030343122 AS CHAR))))
46+
1
47+
#
48+
# More Unicode escape equivalences
49+
#
50+
# \u0048\u0065\u006C\u006C\u006F = "Hello"
51+
SELECT JSON_CONTAINS('"Hello"', '"\\u0048\\u0065\\u006C\\u006C\\u006F"');
52+
JSON_CONTAINS('"Hello"', '"\\u0048\\u0065\\u006C\\u006C\\u006F"')
53+
1
54+
SELECT JSON_EQUALS('"Hello"', '"\\u0048\\u0065\\u006C\\u006C\\u006F"');
55+
JSON_EQUALS('"Hello"', '"\\u0048\\u0065\\u006C\\u006C\\u006F"')
56+
1
57+
SELECT JSON_OVERLAPS('"Hello"', '"\\u0048\\u0065\\u006C\\u006C\\u006F"');
58+
JSON_OVERLAPS('"Hello"', '"\\u0048\\u0065\\u006C\\u006C\\u006F"')
59+
1
60+
# Mixed literal and escape in the same string: "H\u0065llo" = "Hello"
61+
SELECT JSON_EQUALS('"Hello"', '"H\\u0065llo"');
62+
JSON_EQUALS('"Hello"', '"H\\u0065llo"')
63+
1
64+
#
65+
# Test within arrays and objects
66+
#
67+
SELECT JSON_CONTAINS('["A", "B"]', '["\\u0041"]');
68+
JSON_CONTAINS('["A", "B"]', '["\\u0041"]')
69+
1
70+
SELECT JSON_CONTAINS('{"key": "A"}', '{"key": "\\u0041"}');
71+
JSON_CONTAINS('{"key": "A"}', '{"key": "\\u0041"}')
72+
1
73+
SELECT JSON_EQUALS('["A", "B"]', '["\\u0041", "\\u0042"]');
74+
JSON_EQUALS('["A", "B"]', '["\\u0041", "\\u0042"]')
75+
1
76+
SELECT JSON_EQUALS('{"key": "A"}', '{"key": "\\u0041"}');
77+
JSON_EQUALS('{"key": "A"}', '{"key": "\\u0041"}')
78+
1
79+
#
80+
# Surrogate pairs: characters above U+FFFF encoded as two \uXXXX escapes.
81+
# U+1F600 (😀) = \uD83D\uDE00
82+
# U+1F60A (😊) = \uD83D\uDE0A
83+
#
84+
SET NAMES utf8mb4;
85+
SELECT JSON_EQUALS('"😀"', '"\\uD83D\\uDE00"');
86+
JSON_EQUALS('"?"', '"\\uD83D\\uDE00"')
87+
1
88+
SELECT JSON_CONTAINS('"😀"', '"\\uD83D\\uDE00"');
89+
JSON_CONTAINS('"?"', '"\\uD83D\\uDE00"')
90+
1
91+
SELECT JSON_OVERLAPS('"😀"', '"\\uD83D\\uDE00"');
92+
JSON_OVERLAPS('"?"', '"\\uD83D\\uDE00"')
93+
1
94+
SELECT JSON_EQUALS('"😊"', '"\\uD83D\\uDE0A"');
95+
JSON_EQUALS('"?"', '"\\uD83D\\uDE0A"')
96+
1
97+
SELECT JSON_CONTAINS('["😀", "hello"]', '["\\uD83D\\uDE00"]');
98+
JSON_CONTAINS('["?", "hello"]', '["\\uD83D\\uDE00"]')
99+
1
100+
SELECT JSON_EQUALS('{"emoji": "😀"}', '{"emoji": "\\uD83D\\uDE00"}');
101+
JSON_EQUALS('{"emoji": "?"}', '{"emoji": "\\uD83D\\uDE00"}')
102+
1
103+
#
104+
# Escaped object keys: \u006B\u0065\u0079 = "key"
105+
#
106+
SELECT JSON_EQUALS('{"key":"A"}', '{"\\u006B\\u0065\\u0079":"A"}');
107+
JSON_EQUALS('{"key":"A"}', '{"\\u006B\\u0065\\u0079":"A"}')
108+
1
109+
SELECT JSON_CONTAINS('{"key":"A"}', '{"\\u006B\\u0065\\u0079":"A"}');
110+
JSON_CONTAINS('{"key":"A"}', '{"\\u006B\\u0065\\u0079":"A"}')
111+
1
112+
#
113+
# BMP non-ASCII: é = U+00E9, literal UTF-8 vs escape
114+
#
115+
SELECT JSON_EQUALS('"é"', '"\\u00E9"');
116+
JSON_EQUALS('"é"', '"\\u00E9"')
117+
1
118+
SELECT JSON_CONTAINS('"é"', '"\\u00E9"');
119+
JSON_CONTAINS('"é"', '"\\u00E9"')
120+
1
121+
SELECT JSON_OVERLAPS('["é"]', '["\\u00E9"]');
122+
JSON_OVERLAPS('["é"]', '["\\u00E9"]')
123+
1
124+
#
125+
# CJK: 中 = U+4E2D
126+
#
127+
SELECT JSON_EQUALS('"中"', '"\\u4E2D"');
128+
JSON_EQUALS('"中"', '"\\u4E2D"')
129+
1
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
--echo #
2+
--echo # MDEV-39995: JSON_CONTAINS and JSON_EQUALS do not compare strings
3+
--echo # based on semantic
4+
--echo #
5+
6+
--echo #
7+
--echo # JSON string values with Unicode escape sequences should be treated
8+
--echo # as semantically equal to their literal equivalents.
9+
--echo # \u0041 is the Unicode escape for 'A'.
10+
--echo #
11+
12+
--echo # JSON_CONTAINS: should return 1 for semantically equal strings
13+
SELECT JSON_CONTAINS('"A"', '"\\u0041"');
14+
SELECT JSON_CONTAINS('"\\u0041"', '"A"');
15+
16+
--echo # JSON_OVERLAPS: should return 1 for semantically equal strings
17+
SELECT JSON_OVERLAPS('"A"', '"\\u0041"');
18+
19+
--echo # JSON_EQUALS: should return 1 for semantically equal strings
20+
SELECT JSON_EQUALS('"A"', '"\\u0041"');
21+
22+
--echo # JSON_UNQUOTE correctly resolves the escape (proving they are the same)
23+
SELECT JSON_UNQUOTE('"A"') = JSON_UNQUOTE('"\\u0041"');
24+
25+
--echo #
26+
--echo # Additional test from MDEV-39995 comment:
27+
--echo # Using hex literal that represents the bytes of '"\u0041"'
28+
--echo #
29+
SELECT JSON_UNQUOTE('"A"');
30+
SELECT JSON_UNQUOTE(CAST(0x225C753030343122 AS CHAR));
31+
SELECT JSON_CONTAINS('"A"', CAST(0x225C753030343122 AS CHAR));
32+
SELECT JSON_CONTAINS(JSON_QUOTE(JSON_UNQUOTE('"A"')),
33+
JSON_QUOTE(JSON_UNQUOTE(CAST(0x225C753030343122 AS CHAR))));
34+
35+
--echo #
36+
--echo # More Unicode escape equivalences
37+
--echo #
38+
--echo # \u0048\u0065\u006C\u006C\u006F = "Hello"
39+
SELECT JSON_CONTAINS('"Hello"', '"\\u0048\\u0065\\u006C\\u006C\\u006F"');
40+
SELECT JSON_EQUALS('"Hello"', '"\\u0048\\u0065\\u006C\\u006C\\u006F"');
41+
SELECT JSON_OVERLAPS('"Hello"', '"\\u0048\\u0065\\u006C\\u006C\\u006F"');
42+
43+
--echo # Mixed literal and escape in the same string: "H\u0065llo" = "Hello"
44+
SELECT JSON_EQUALS('"Hello"', '"H\\u0065llo"');
45+
46+
--echo #
47+
--echo # Test within arrays and objects
48+
--echo #
49+
SELECT JSON_CONTAINS('["A", "B"]', '["\\u0041"]');
50+
SELECT JSON_CONTAINS('{"key": "A"}', '{"key": "\\u0041"}');
51+
SELECT JSON_EQUALS('["A", "B"]', '["\\u0041", "\\u0042"]');
52+
SELECT JSON_EQUALS('{"key": "A"}', '{"key": "\\u0041"}');
53+
54+
--echo #
55+
--echo # Surrogate pairs: characters above U+FFFF encoded as two \uXXXX escapes.
56+
--echo # U+1F600 (😀) = \uD83D\uDE00
57+
--echo # U+1F60A (😊) = \uD83D\uDE0A
58+
--echo #
59+
SET NAMES utf8mb4;
60+
SELECT JSON_EQUALS('"😀"', '"\\uD83D\\uDE00"');
61+
SELECT JSON_CONTAINS('"😀"', '"\\uD83D\\uDE00"');
62+
SELECT JSON_OVERLAPS('"😀"', '"\\uD83D\\uDE00"');
63+
SELECT JSON_EQUALS('"😊"', '"\\uD83D\\uDE0A"');
64+
SELECT JSON_CONTAINS('["😀", "hello"]', '["\\uD83D\\uDE00"]');
65+
SELECT JSON_EQUALS('{"emoji": "😀"}', '{"emoji": "\\uD83D\\uDE00"}');
66+
67+
--echo #
68+
--echo # Escaped object keys: \u006B\u0065\u0079 = "key"
69+
--echo #
70+
SELECT JSON_EQUALS('{"key":"A"}', '{"\\u006B\\u0065\\u0079":"A"}');
71+
SELECT JSON_CONTAINS('{"key":"A"}', '{"\\u006B\\u0065\\u0079":"A"}');
72+
73+
--echo #
74+
--echo # BMP non-ASCII: é = U+00E9, literal UTF-8 vs escape
75+
--echo #
76+
SELECT JSON_EQUALS('"é"', '"\\u00E9"');
77+
SELECT JSON_CONTAINS('"é"', '"\\u00E9"');
78+
SELECT JSON_OVERLAPS('["é"]', '["\\u00E9"]');
79+
80+
--echo #
81+
--echo # CJK: 中 = U+4E2D
82+
--echo #
83+
SELECT JSON_EQUALS('"中"', '"\\u4E2D"');

sql/item_jsonfunc.cc

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1686,12 +1686,10 @@ int Item_func_json_contains::check_contains(json_engine_t *js,
16861686
{
16871687
return FALSE;
16881688
}
1689-
/*
1690-
TODO: make proper json-json comparison here that takes excipient
1691-
into account.
1692-
*/
1693-
return value->value_len == js->value_len &&
1694-
memcmp(value->value, js->value, value->value_len) == 0;
1689+
return json_string_compare(js->s.cs,
1690+
js->value, js->value_len, js->value_escaped,
1691+
value->value, value->value_len,
1692+
value->value_escaped) == 0;
16951693
case JSON_VALUE_NUMBER:
16961694
if (value->value_type == JSON_VALUE_NUMBER)
16971695
{
@@ -4990,8 +4988,10 @@ static bool json_find_overlap_with_scalar(json_engine_t *js, json_engine_t *valu
49904988
}
49914989
else if (js->value_type == JSON_VALUE_STRING)
49924990
{
4993-
return value->value_len == js->value_len &&
4994-
memcmp(value->value, js->value, value->value_len) == 0;
4991+
return json_string_compare(js->s.cs,
4992+
js->value, js->value_len, js->value_escaped,
4993+
value->value, value->value_len,
4994+
value->value_escaped) == 0;
49954995
}
49964996
}
49974997
return value->value_type == js->value_type;

strings/json_lib.c

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1813,6 +1813,43 @@ int json_unescape(CHARSET_INFO *json_cs,
18131813
}
18141814

18151815

1816+
/*
1817+
Compare two JSON string values semantically, resolving escape sequences.
1818+
If neither string has escapes, falls back to memcmp for speed.
1819+
Returns 0 if strings are equal, non-zero otherwise.
1820+
*/
1821+
int json_string_compare(CHARSET_INFO *cs,
1822+
const uchar *str1, int len1, int escaped1,
1823+
const uchar *str2, int len2, int escaped2)
1824+
{
1825+
json_string_t s1, s2;
1826+
int r1;
1827+
1828+
if (!escaped1 && !escaped2)
1829+
{
1830+
if (len1 != len2)
1831+
return 1;
1832+
return memcmp(str1, str2, len1);
1833+
}
1834+
1835+
json_string_setup(&s1, cs, str1, str1 + len1);
1836+
json_string_setup(&s2, cs, str2, str2 + len2);
1837+
1838+
for (r1= json_read_string_const_chr(&s1); r1 == 0;
1839+
r1= json_read_string_const_chr(&s1))
1840+
{
1841+
int r2= json_read_string_const_chr(&s2);
1842+
if (r2)
1843+
return 1;
1844+
if (s1.c_next != s2.c_next)
1845+
return 1;
1846+
}
1847+
1848+
return (json_read_string_const_chr(&s2) != 0 &&
1849+
s1.error == JE_EOS && s2.error == JE_EOS) ? 0 : 1;
1850+
}
1851+
1852+
18161853
/* When we need to replace a character with the escaping. */
18171854
enum json_esc_char_classes {
18181855
ESC_= 0, /* No need to escape. */

0 commit comments

Comments
 (0)