60 lines
1.7 KiB
Plaintext
60 lines
1.7 KiB
Plaintext
// Reports whether `b` has the bit pattern 10xxxxxx, i.e. the shape of a
// UTF-8 continuation (non-leading) byte.
is_utf_cont :: inline (b: u8) -> bool {
    // Only the two most significant bits matter; they must be exactly "10".
    top_bits := b >> 6;
    return top_bits == 0x2;
}
|
|
|
|
// Parses `str` as a base-16 integer and returns it as a 16-bit code unit.
//
// Returns (0, false) when:
//   - string_to_int reports failure,
//   - any characters are left over after the parsed number,
//   - the parsed value does not fit in the range 0..0xFFFF.
// Otherwise returns (value, true).
parse_unicode :: (str: string) -> result: u16, success: bool {
    val, success, remainder := string_to_int(str, base = 16);
    if !success || remainder.count  return 0, false;

    // string_to_int produces a signed integer, so both bounds must be checked
    // before narrowing; a negative value must never reach the cast to u16.
    if val < 0 || val > 0xFFFF  return 0, false;

    return xx val, true;
}
|
|
|
|
// Encodes the 16-bit value `val` as UTF-8 into the buffer at `result` and
// returns how many bytes were written (1, 2 or 3).
//
// `result` must point to at least 3 writable bytes. No validation is done on
// `val`: every u16 value is encoded as-is.
encode_utf8 :: (val: u16, result: *u8) -> len: u8 {
    // Below 0x80: a single byte holding the value itself.
    if val < 0x80 {
        result[0] = xx (val & 0x7F);
        return 1;
    }

    low_six := val & 0x003F;

    // 0x80..0x7FF: 110xxxxx 10xxxxxx
    if val < 0x800 {
        result[0] = xx (0xC0 | (val >> 6));
        result[1] = xx (0x80 | low_six);
        return 2;
    }

    // 0x800..0xFFFF: 1110xxxx 10xxxxxx 10xxxxxx
    mid_six := (val >> 6) & 0x003F;
    result[0] = xx (0xE0 | (val >> 12));
    result[1] = xx (0x80 | mid_six);
    result[2] = xx (0x80 | low_six);
    return 3;
}
|
|
|
|
// Returns true when every byte of `str` belongs to a well-formed UTF-8
// sequence, false at the first violation. An empty string is valid.
//
// Rejected inputs:
//   - lead bytes outside 0xc2..0xf4 (stray continuation bytes, the overlong
//     2-byte leads 0xc0/0xc1, and leads above the Unicode range),
//   - sequences truncated by the end of the string,
//   - missing or malformed continuation bytes,
//   - overlong 3-byte and 4-byte encodings,
//   - UTF-16 surrogates (0xD800..0xDFFF),
//   - code points above 0x10FFFF.
is_valid_utf8 :: (str: string) -> valid:bool {
    // An explicit index is used so that a whole multi-byte sequence can be
    // consumed in one step.
    i := 0;
    while i < str.count {
        cur := str[i];

        if cur < 0x80 {
            // Single-byte (ASCII) character.
            i += 1;
        } else if cur < 0xc2 || cur > 0xf4 {
            // Lead byte must be between 0xc2 and 0xf4 inclusive to be valid.
            return false;
        } else if cur < 0xe0 { // 2-byte sequence
            if i + 1 >= str.count || !is_utf_cont(str[i+1]) return false;
            i += 2;
        } else if cur < 0xf0 { // 3-byte sequence
            if i + 2 >= str.count || !is_utf_cont(str[i+1]) || !is_utf_cont(str[i+2]) return false;

            // Reject overlong encodings: with lead 0xe0 the second byte must be
            // at least 0xa0, otherwise the encoded value is below 0x800.
            if cur == 0xe0 && str[i+1] < 0xa0 return false;

            // Reject surrogates: lead 0xed with a second byte above 0x9f
            // encodes 0xD800..0xDFFF.
            if cur == 0xed && str[i+1] > 0x9f return false;

            i += 3;
        } else { // 4-byte sequence
            if i + 3 >= str.count || !is_utf_cont(str[i+1]) || !is_utf_cont(str[i+2]) || !is_utf_cont(str[i+3]) return false;

            // Make sure it's in the valid range (0x10000 - 0x10ffff).
            if cur == 0xf0 && str[i+1] < 0x90 return false;
            if cur == 0xf4 && str[i+1] > 0x8f return false;

            i += 4;
        }
    }

    return true;
}
|
|
|
|
#scope_file
|
|
|
|
#import "Basic";
|