Skip to content

Commit 90bfa41

Browse files
committed
fix: update space handling in token_split and improve ltrim/rtrim functions
1 parent 6b675a5 commit 90bfa41

3 files changed

Lines changed: 7 additions & 7 deletions

File tree

ggml

Submodule ggml updated from 404fcb9 to ce1c721

src/tokenize_util.cpp

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -877,12 +877,12 @@ std::vector<std::string> token_split(const std::string& text) {
877877
// ` ?[^\s\p{L}\p{N}]+[\r\n]*`
878878
{
879879
// ` [^\s\p{L}\p{N}]+[\r\n]*`
880-
if (cp == U' ' && i + 1 < cps.size() && !isspace(cps[i + 1]) && !is_letter(cps[i + 1]) && !is_number(cps[i + 1])) {
880+
if (cp == U' ' && i + 1 < cps.size() && !is_space(cps[i + 1]) && !is_letter(cps[i + 1]) && !is_number(cps[i + 1])) {
881881
std::string token = codepoint_to_utf8(cp);
882882
token += codepoint_to_utf8(cps[i + 1]);
883883
i += 2;
884884

885-
while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) {
885+
while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !is_space(cps[i])) {
886886
token += codepoint_to_utf8(cps[i]);
887887
++i;
888888
}
@@ -898,11 +898,11 @@ std::vector<std::string> token_split(const std::string& text) {
898898

899899
// `[^\s\p{L}\p{N}]+[\r\n]*`
900900
std::string token;
901-
if (!is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) {
901+
if (!is_letter(cps[i]) && !is_number(cps[i]) && !is_space(cps[i])) {
902902
std::string token = codepoint_to_utf8(cp);
903903
++i;
904904

905-
while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) {
905+
while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !is_space(cps[i])) {
906906
token += codepoint_to_utf8(cps[i]);
907907
++i;
908908
}

src/util.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -409,14 +409,14 @@ void pretty_bytes_progress(int step, int steps, uint64_t bytes_processed, float
409409

410410
std::string ltrim(const std::string& s) {
411411
auto it = std::find_if(s.begin(), s.end(), [](int ch) {
412-
return !std::isspace(ch);
412+
return !std::isspace(static_cast<unsigned char>(ch));
413413
});
414414
return std::string(it, s.end());
415415
}
416416

417417
std::string rtrim(const std::string& s) {
418418
auto it = std::find_if(s.rbegin(), s.rend(), [](int ch) {
419-
return !std::isspace(ch);
419+
return !std::isspace(static_cast<unsigned char>(ch));
420420
});
421421
return std::string(s.begin(), it.base());
422422
}

0 commit comments

Comments (0)