Skip to content

Commit f834337

Browse files
committed
Add UTF8/UTF16 codepoint count via new simdutf submodule
1 parent 12a2d4f commit f834337

File tree

8 files changed

+211
-49
lines changed

8 files changed

+211
-49
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,6 @@
2222
[submodule "external/oxen-logging"]
2323
path = external/oxen-logging
2424
url = https://github.com/oxen-io/oxen-logging.git
25+
[submodule "external/simdutf"]
26+
path = external/simdutf
27+
url = git@github.com:simdutf/simdutf.git

external/CMakeLists.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,3 +193,14 @@ libsession_static_bundle(libzstd_static)
193193
set(JSON_BuildTests OFF CACHE INTERNAL "")
194194
set(JSON_Install ON CACHE INTERNAL "") # Required to export targets that we use
195195
libsession_system_or_submodule(NLOHMANN nlohmann_json nlohmann_json>=3.7.0 nlohmann-json)
196+
197+
set(JSON_BuildTests OFF CACHE INTERNAL "")
198+
set(JSON_Install ON CACHE INTERNAL "") # Required to export targets that we use
199+
200+
function(simdutf_subdir)
201+
set(SIMDUTF_TESTS OFF CACHE BOOL "")
202+
set(SIMDUTF_TOOLS OFF CACHE BOOL "")
203+
set(BUILD_SHARED_LIBS OFF)
204+
add_subdirectory(simdutf)
205+
endfunction()
206+
simdutf_subdir()

external/simdutf

Submodule simdutf added at 7b3f5af

include/session/session_protocol.h

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,12 @@ extern "C" {
1414
#endif
1515

1616
enum {
17-
/// TODO: This comment needs to be updated to be _codepoints_ once libsession implements the
18-
/// character count for the platforms. Currently they use code units but it should be
19-
/// codepoints. This allows the platforms to use their native text representation up until the
20-
/// API boundary where they will convert to UTF8 to have it managed by libsession.
21-
22-
/// Maximum number of UTF16 code units that a standard message can use. If the message exceeds
17+
/// Maximum number of Unicode codepoints that a standard message can use. If the message exceeds
2318
/// this then the message must activate the higher character limit feature provided by Session
2419
/// Pro which allows messages up to 10k characters.
2520
PRO_STANDARD_CHARACTER_LIMIT = 2'000,
2621

27-
/// Maximum number of UTF16 code units that a Session Pro entitled user can send in a message.
22+
/// Maximum number of Unicode codepoints that a Session Pro entitled user can send in a message.
2823
/// This is not used in the codebase, but is provided for convenience to centralise protocol
2924
/// definitions for users of the library to consume.
3025
PRO_HIGHER_CHARACTER_LIMIT = 10'000,
@@ -118,21 +113,58 @@ typedef struct session_protocol_encrypted_for_destination {
118113
size_t error_len_incl_null_terminator;
119114
} session_protocol_encrypted_for_destination;
120115

121-
/// API: session_protocol/session_protocol_get_pro_features_for_msg
116+
typedef struct session_protocol_pro_features_for_msg {
117+
bool success;
118+
string8 error;
119+
PRO_FEATURES features;
120+
size_t codepoint_count;
121+
} session_protocol_pro_features_for_msg;
122+
123+
/// API: session_protocol/session_protocol_pro_features_for_utf8
124+
///
125+
/// Determine the Pro features that are used in a given UTF8 message.
126+
///
127+
/// Inputs:
128+
/// - `utf8` -- the utf8 string to count the number of codepoints in to determine if it needs the
129+
/// higher character limit available in Session Pro
130+
/// - `utf8_size` -- the number of code units (aka. bytes) the string has
131+
/// - `extra` -- extra pro features that are known by clients that they wish to be activated on
132+
/// this message
133+
///
134+
/// Outputs:
135+
/// - `success` -- True if the message was evaluated successfully for PRO features, false otherwise.
136+
/// When false, all fields except for `error` should be ignored from the result object.
137+
/// - `error` -- If `success` is false, this is populated with an error code describing the error,
138+
/// otherwise it's empty.
139+
/// - `features` -- Session Pro feature flags suitable for writing directly into the protobuf
140+
/// `ProMessage` in `Content`
141+
/// - `codepoint_count` -- Counts the number of unicode codepoints that were in the message.
142+
LIBSESSION_EXPORT
143+
session_protocol_pro_features_for_msg session_protocol_pro_features_for_utf8(
144+
char const* utf8, size_t utf8_size, PRO_EXTRA_FEATURES extra);
145+
146+
/// API: session_protocol/session_protocol_pro_features_for_utf16
122147
///
123-
/// Determine the Pro features that are used in a given conversation message.
148+
/// Determine the Pro features that are used in a given UTF16 message.
124149
///
125150
/// Inputs:
126-
/// - `msg_size` -- the size of the message in UTF16 code units to determine if the message requires
127-
/// access to the higher character limit available in Session Pro
151+
/// - `utf16` -- the utf16 string to count the number of codepoints in to determine if it needs the
152+
/// higher character limit available in Session Pro
153+
/// - `utf16_size` -- the number of 16-bit code units (not bytes) the string has
128154
/// - `extra` -- extra pro features that are known by clients that they wish to be activated on
129155
/// this message
130156
///
131157
/// Outputs:
132-
/// - Session Pro feature flags suitable for writing directly into the protobuf `ProMessage` in
133-
/// `Content`
158+
/// - `success` -- True if the message was evaluated successfully for PRO features, false otherwise.
159+
/// When false, all fields except for `error` should be ignored from the result object.
160+
/// - `error` -- If `success` is false, this is populated with an error code describing the error,
161+
/// otherwise it's empty.
162+
/// - `features` -- Session Pro feature flags suitable for writing directly into the protobuf
163+
/// `ProMessage` in `Content`
164+
/// - `codepoint_count` -- Counts the number of unicode codepoints that were in the message.
134165
LIBSESSION_EXPORT
135-
PRO_FEATURES session_protocol_get_pro_features_for_msg(size_t msg_size, PRO_EXTRA_FEATURES flags);
166+
session_protocol_pro_features_for_msg session_protocol_pro_features_for_utf16(
167+
uint16_t const* utf16, size_t utf16_size, PRO_EXTRA_FEATURES extra);
136168

137169
/// API: session_protocol_encrypt_for_1o1
138170
///

include/session/session_protocol.hpp

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,35 @@ struct DecryptEnvelopeKey {
163163
std::span<std::span<const uint8_t>> ed25519_privkeys;
164164
};
165165

166-
/// API: session_protocol/get_pro_features_for_msg
166+
struct ProFeaturesForMsg
167+
{
168+
bool success;
169+
std::string_view error;
170+
PRO_FEATURES features;
171+
size_t codepoint_count;
172+
};
173+
174+
/// API: session_protocol/get_pro_features_for_utf8
175+
///
176+
/// Determine the Pro features that are used in a given conversation message.
177+
///
178+
/// Inputs:
179+
/// - `utf8`, `utf8_size` -- the UTF8 string and its size in code units (bytes), whose codepoints are counted to determine if the message requires
180+
/// access to the higher character limit available in Session Pro
181+
/// - `flags` -- extra pro features that are known by clients that they wish to be activated on
182+
/// this message
183+
///
184+
/// Outputs:
185+
/// - `success` -- True if the message was evaluated successfully for PRO features, false otherwise.
186+
/// When false, all fields except for `error` should be ignored from the result object.
187+
/// - `error` -- If `success` is false, this is populated with an error code describing the error,
188+
/// otherwise it's empty.
189+
/// - `features` -- Session Pro feature flags suitable for writing directly into the protobuf
190+
/// `ProMessage` in `Content`
191+
/// - `codepoint_count` -- Counts the number of unicode codepoints that were in the message.
192+
ProFeaturesForMsg get_pro_features_for_utf8(char const *utf8, size_t utf8_size, PRO_EXTRA_FEATURES flags);
193+
194+
/// API: session_protocol/get_pro_features_for_utf16
167195
///
168196
/// Determine the Pro features that are used in a given conversation message.
169197
///
@@ -174,9 +202,15 @@ struct DecryptEnvelopeKey {
174202
/// this message
175203
///
176204
/// Outputs:
177-
/// - Session Pro feature flags suitable for writing directly into the protobuf `ProMessage` in
178-
/// `Content`
179-
PRO_FEATURES get_pro_features_for_msg(size_t msg_size, PRO_EXTRA_FEATURES flags);
205+
/// - `success` -- True if the message was evaluated successfully for PRO features, false otherwise.
206+
/// When false, all fields except for `error` should be ignored from the result object.
207+
/// - `error` -- If `success` is false, this is populated with an error code describing the error,
208+
/// otherwise it's empty.
209+
/// - `features` -- Session Pro feature flags suitable for writing directly into the protobuf
210+
/// `ProMessage` in `Content`
211+
/// - `codepoint_count` -- Counts the number of unicode codepoints that were in the message.
212+
ProFeaturesForMsg get_pro_features_for_utf16(
213+
char16_t const* utf16, size_t utf8_size, PRO_EXTRA_FEATURES flags);
180214

181215
/// API: session_protocol/encrypt_for_1o1
182216
///

src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ target_link_libraries(crypto
9494
libsodium::sodium-internal
9595
nlohmann_json::nlohmann_json
9696
libsession::protos
97+
simdutf
9798
)
9899

99100
target_link_libraries(config

src/session_protocol.cpp

Lines changed: 65 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include <fmt/core.h>
22
#include <oxenc/hex.h>
33
#include <session/config/groups/keys.h>
4+
#include <simdutf.h>
45
#include <sodium/crypto_sign_ed25519.h>
56
#include <sodium/randombytes.h>
67

@@ -16,21 +17,47 @@
1617
#include "WebSocketResources.pb.h"
1718
#include "session/export.h"
1819

19-
namespace session {
20-
21-
PRO_FEATURES get_pro_features_for_msg(size_t msg_size, PRO_EXTRA_FEATURES extra) {
22-
PRO_FEATURES result = PRO_FEATURES_NIL;
23-
24-
if (msg_size > PRO_STANDARD_CHARACTER_LIMIT)
25-
result |= PRO_FEATURES_10K_CHARACTER_LIMIT;
20+
namespace {
21+
session::ProFeaturesForMsg pro_features_for_utf8_or_16(
22+
const void* utf, size_t utf_size, PRO_EXTRA_FEATURES extra, bool is_utf8) {
23+
session::ProFeaturesForMsg result = {};
24+
simdutf::result validate = is_utf8 ? simdutf::validate_utf8_with_errors(
25+
reinterpret_cast<const char*>(utf), utf_size)
26+
: simdutf::validate_utf16_with_errors(
27+
reinterpret_cast<const char16_t*>(utf), utf_size);
28+
if (validate.is_ok()) {
29+
result.success = true;
30+
result.codepoint_count =
31+
is_utf8 ? simdutf::count_utf8(reinterpret_cast<const char*>(utf), utf_size)
32+
: simdutf::count_utf16(reinterpret_cast<const char16_t*>(utf), utf_size);
33+
if (result.codepoint_count > PRO_STANDARD_CHARACTER_LIMIT)
34+
result.features |= PRO_FEATURES_10K_CHARACTER_LIMIT;
35+
36+
if (extra & PRO_EXTRA_FEATURES_ANIMATED_AVATAR)
37+
result.features |= PRO_FEATURES_ANIMATED_AVATAR;
38+
39+
if (extra & PRO_EXTRA_FEATURES_PRO_BADGE)
40+
result.features |= PRO_FEATURES_PRO_BADGE;
41+
42+
assert((result.features & ~PRO_FEATURES_ALL) == 0);
43+
} else {
44+
result.error = simdutf::error_to_string(validate.error);
45+
}
46+
return result;
47+
}
48+
}; // namespace
2649

27-
if (extra & PRO_EXTRA_FEATURES_ANIMATED_AVATAR)
28-
result |= PRO_FEATURES_ANIMATED_AVATAR;
50+
namespace session {
2951

30-
if (extra & PRO_EXTRA_FEATURES_PRO_BADGE)
31-
result |= PRO_FEATURES_PRO_BADGE;
52+
ProFeaturesForMsg pro_features_for_utf8(
53+
const char* utf, size_t utf_size, PRO_EXTRA_FEATURES extra) {
54+
ProFeaturesForMsg result = pro_features_for_utf8_or_16(utf, utf_size, extra, /*is_utf8*/ true);
55+
return result;
56+
}
3257

33-
assert((result & ~PRO_FEATURES_ALL) == 0);
58+
ProFeaturesForMsg pro_features_for_utf16(
59+
const uint16_t* utf, size_t utf_size, PRO_EXTRA_FEATURES extra) {
60+
ProFeaturesForMsg result = pro_features_for_utf8_or_16(utf, utf_size, extra, /*is_utf8*/ false);
3461
return result;
3562
}
3663

@@ -390,8 +417,10 @@ DecryptedEnvelope decrypt_envelope(
390417
// the source is a Session public key (see: encrypt_for_destination)
391418
const std::string& source = envelope.source();
392419
if (source.size() != result.envelope.source.max_size())
393-
throw std::runtime_error(fmt::format(
394-
"Parse envelope failed, source had unexpected size ({} bytes)", source.size()));
420+
throw std::runtime_error(
421+
fmt::format(
422+
"Parse envelope failed, source had unexpected size ({} bytes)",
423+
source.size()));
395424
std::memcpy(result.envelope.source.data(), source.data(), source.size());
396425
result.envelope.flags |= ENVELOPE_FLAGS_SOURCE;
397426
}
@@ -546,8 +575,28 @@ DecryptedEnvelope decrypt_envelope(
546575
using namespace session;
547576

548577
LIBSESSION_C_API
549-
PRO_FEATURES session_protocol_get_pro_features_for_msg(size_t msg_size, PRO_EXTRA_FEATURES flags) {
550-
PRO_FEATURES result = get_pro_features_for_msg(msg_size, flags);
578+
session_protocol_pro_features_for_msg session_protocol_pro_features_for_utf8(
579+
const char* utf, size_t utf_size, PRO_EXTRA_FEATURES extra) {
580+
ProFeaturesForMsg result_cpp = pro_features_for_utf8_or_16(utf, utf_size, extra, /*is_utf8*/ true);
581+
session_protocol_pro_features_for_msg result = {
582+
.success = result_cpp.success,
583+
.error = {const_cast<char*>(result_cpp.error.data()), result_cpp.error.size()},
584+
.features = result_cpp.features,
585+
.codepoint_count = result_cpp.codepoint_count,
586+
};
587+
return result;
588+
}
589+
590+
LIBSESSION_C_API
591+
session_protocol_pro_features_for_msg session_protocol_pro_features_for_utf16(
592+
const uint16_t* utf, size_t utf_size, PRO_EXTRA_FEATURES extra) {
593+
ProFeaturesForMsg result_cpp = pro_features_for_utf8_or_16(utf, utf_size, extra, /*is_utf8*/ false);
594+
session_protocol_pro_features_for_msg result = {
595+
.success = result_cpp.success,
596+
.error = {const_cast<char*>(result_cpp.error.data()), result_cpp.error.size()},
597+
.features = result_cpp.features,
598+
.codepoint_count = result_cpp.codepoint_count,
599+
};
551600
return result;
552601
}
553602

tests/test_session_protocol.cpp

Lines changed: 46 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -95,21 +95,52 @@ TEST_CASE("Session protocol helpers C API", "[session-protocol][helpers]") {
9595
// Do tests that require no setup
9696
SECTION("Ensure get pro fetaures detects large message") {
9797
// Try a message below the size threshold
98-
PRO_FEATURES features = session_protocol_get_pro_features_for_msg(
99-
PRO_STANDARD_CHARACTER_LIMIT,
100-
PRO_EXTRA_FEATURES_PRO_BADGE | PRO_EXTRA_FEATURES_ANIMATED_AVATAR);
101-
REQUIRE(features == (PRO_FEATURES_PRO_BADGE | PRO_FEATURES_ANIMATED_AVATAR));
98+
{
99+
auto msg = std::string(PRO_STANDARD_CHARACTER_LIMIT, 'a');
100+
session_protocol_pro_features_for_msg pro_msg =
101+
session_protocol_pro_features_for_utf8(
102+
msg.data(),
103+
msg.size(),
104+
PRO_EXTRA_FEATURES_PRO_BADGE | PRO_EXTRA_FEATURES_ANIMATED_AVATAR);
105+
REQUIRE(pro_msg.success);
106+
REQUIRE(pro_msg.features == (PRO_FEATURES_PRO_BADGE | PRO_FEATURES_ANIMATED_AVATAR));
107+
REQUIRE(pro_msg.codepoint_count == msg.size());
108+
}
109+
110+
// Try an invalid message
111+
{
112+
std::string_view msg = "\xFF";
113+
session_protocol_pro_features_for_msg pro_msg = session_protocol_pro_features_for_utf8(
114+
msg.data(), msg.size(), PRO_FEATURES_NIL);
115+
REQUIRE(!pro_msg.success);
116+
REQUIRE(pro_msg.error.size);
117+
}
102118

103119
// Try a message exceeding the size threshold
104-
features = session_protocol_get_pro_features_for_msg(
105-
PRO_STANDARD_CHARACTER_LIMIT + 1,
106-
PRO_EXTRA_FEATURES_PRO_BADGE | PRO_EXTRA_FEATURES_ANIMATED_AVATAR);
107-
REQUIRE(features == (PRO_FEATURES_10K_CHARACTER_LIMIT | PRO_FEATURES_PRO_BADGE |
108-
PRO_FEATURES_ANIMATED_AVATAR));
120+
{
121+
auto msg = std::string(PRO_STANDARD_CHARACTER_LIMIT + 1, 'a');
122+
session_protocol_pro_features_for_msg pro_msg = session_protocol_pro_features_for_utf8(
123+
msg.data(),
124+
msg.size(),
125+
PRO_EXTRA_FEATURES_PRO_BADGE | PRO_EXTRA_FEATURES_ANIMATED_AVATAR);
126+
REQUIRE(pro_msg.success);
127+
REQUIRE(pro_msg.features == (PRO_FEATURES_10K_CHARACTER_LIMIT | PRO_FEATURES_PRO_BADGE |
128+
PRO_FEATURES_ANIMATED_AVATAR));
129+
REQUIRE(pro_msg.codepoint_count == msg.size());
130+
}
109131

110132
// Try asking for just one extra feature
111-
features = session_protocol_get_pro_features_for_msg(100, PRO_EXTRA_FEATURES_PRO_BADGE);
112-
REQUIRE(features == PRO_FEATURES_PRO_BADGE);
133+
{
134+
auto msg = std::string(PRO_STANDARD_CHARACTER_LIMIT, 'a');
135+
session_protocol_pro_features_for_msg pro_msg =
136+
session_protocol_pro_features_for_utf8(
137+
msg.data(),
138+
msg.size(),
139+
PRO_EXTRA_FEATURES_PRO_BADGE);
140+
REQUIRE(pro_msg.success);
141+
REQUIRE(pro_msg.features == PRO_FEATURES_PRO_BADGE);
142+
REQUIRE(pro_msg.codepoint_count == msg.size());
143+
}
113144
}
114145

115146
// Tests that require some setup code
@@ -367,17 +398,17 @@ TEST_CASE("Session protocol helpers C API", "[session-protocol][helpers]") {
367398
std::string large_message;
368399
large_message.resize(PRO_STANDARD_CHARACTER_LIMIT + 1);
369400

370-
PRO_FEATURES features =
371-
get_pro_features_for_msg(large_message.size(), PRO_EXTRA_FEATURES_PRO_BADGE);
372-
REQUIRE(features == (PRO_FEATURES_10K_CHARACTER_LIMIT | PRO_FEATURES_PRO_BADGE));
401+
session_protocol_pro_features_for_msg pro_msg = session_protocol_pro_features_for_utf8(
402+
large_message.data(), large_message.size(), PRO_EXTRA_FEATURES_PRO_BADGE);
403+
REQUIRE(pro_msg.features == (PRO_FEATURES_10K_CHARACTER_LIMIT | PRO_FEATURES_PRO_BADGE));
373404

374405
SerialisedProtobufContentWithProForTesting protobuf_content_with_pro_and_features =
375406
build_protobuf_content_with_session_pro(
376407
/*data_body*/ large_message,
377408
/*user_rotating_privkey*/ user_pro_ed_sk,
378409
/*pro_backend_privkey*/ pro_backend_ed_sk,
379410
/*pro_expiry_unix_ts*/ timestamp_s,
380-
features);
411+
pro_msg.features);
381412

382413
// Encrypt content
383414
session_protocol_encrypted_for_destination encrypt_result =

0 commit comments

Comments
 (0)