-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathUnicode.hpp
More file actions
160 lines (151 loc) · 5.1 KB
/
Unicode.hpp
File metadata and controls
160 lines (151 loc) · 5.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#pragma once
#define PCL_UNICODE
/// @brief Unicode utility functions and constants for PClib.
///
/// Everything here is defined in the header itself, so every function is
/// declared `inline`: without it, including this header from more than one
/// translation unit produces duplicate-symbol link errors.
namespace pcuni{

/// @brief Tell whether a UTF-16 code unit is a high (leading) surrogate.
/// @param ch UTF-16 code unit to classify.
/// @return true when ch lies in [0xD800, 0xDBFF].
inline bool isHighSurrogate(char16_t ch)
{
    return ch>=0xD800&&ch<=0xDBFF;
}

/// @brief Tell whether a UTF-16 code unit is a low (trailing) surrogate.
/// @param ch UTF-16 code unit to classify.
/// @return true when ch lies in [0xDC00, 0xDFFF].
inline bool isLowSurrogate(char16_t ch)
{
    return ch>=0xDC00&&ch<=0xDFFF;
}

/// @brief Combine a surrogate pair into the code point it encodes.
/// @param high Leading surrogate, expected in [0xD800, 0xDBFF].
/// @param low  Trailing surrogate, expected in [0xDC00, 0xDFFF].
/// @return The decoded code point (0x10000..0x10FFFF), or
///         static_cast<unsigned int>(-1) when the pair is not valid.
inline unsigned int surrogateToCode(char16_t high,char16_t low)
{
    if(!isHighSurrogate(high)||!isLowSurrogate(low)) // invalid
        return static_cast<unsigned int>(-1);
    // Each surrogate carries 10 payload bits; the pair encodes (code - 0x10000).
    const unsigned int u10=static_cast<unsigned int>(high)-0xD800u;
    const unsigned int l10=static_cast<unsigned int>(low)-0xDC00u;
    return 0x10000u+(u10<<10)+l10;
}

/// @brief Get the Unicode plane for a single UTF-16 code unit.
/// @return Always 0: this overload reports plane 0 for every code unit.
inline int getUnicodePlane(char16_t /*ch*/)
{
    return 0;
}

/// @brief Get the Unicode plane for surrogates.
/// @param high Leading surrogate.
/// @param low  Trailing surrogate.
/// @return The plane number (code point >> 16, i.e. 1..16) of the decoded
///         pair, or -1 when the two code units do not form a valid pair.
inline int getUnicodePlane(char16_t high,char16_t low)
{
    if(!isHighSurrogate(high)||!isLowSurrogate(low)) // invalid
        return -1;
    return static_cast<int>(surrogateToCode(high,low)>>16);
}

/// @brief Inclusive range [first, last] of 16-bit code unit values.
struct range{unsigned short first,last;};

/// @brief Binary-search a table of ranges for a value.
/// @param u     Value to look up.
/// @param table Ranges sorted in ascending order and not overlapping.
/// @param max   Number of entries in table.
/// @return true when some entry satisfies first <= u <= last; false otherwise,
///         including when table is null or max is not positive.
inline bool biSearch(unsigned short u,const range *table,int max)
{
    if(!table) return false;
    int lo=0,hi=max; // half-open search window [lo, hi)
    while(lo<hi)
    {
        const int mid=lo+(hi-lo)/2; // cannot overflow, unlike (lo+hi)/2
        if(u<table[mid].first) hi=mid;
        else if(u>table[mid].last) lo=mid+1;
        else return true;
    }
    return false;
}

/* Combining characters (width 0) - from Markus Kuhn's wcwidth */
/* Searched with biSearch(): keep the entries sorted and non-overlapping. */
const range combining[]=
{
    {0x0300,0x036F},{0x0483,0x0489},{0x0591,0x05BD},{0x05BF,0x05BF},
    {0x05C1,0x05C2},{0x05C4,0x05C5},{0x05C7,0x05C7},{0x0610,0x061A},
    {0x064B,0x065F},{0x0670,0x0670},{0x06D6,0x06DC},{0x06DF,0x06E4},
    {0x06E7,0x06E8},{0x06EA,0x06ED},{0x0711,0x0711},{0x0730,0x074A},
    {0x07A6,0x07B0},{0x07EB,0x07F3},{0x0816,0x0819},{0x081B,0x0823},
    {0x0825,0x0827},{0x0829,0x082D},{0x0859,0x085B},{0x08D3,0x08E1},
    {0x08E3,0x0903},{0x093A,0x093C},{0x093E,0x094F},{0x0951,0x0957},
    {0x0962,0x0963},{0x0981,0x0983},{0x09BC,0x09BC},{0x09BE,0x09C4},
    {0x09C7,0x09C8},{0x09CB,0x09CD},{0x09D7,0x09D7},{0x09E2,0x09E3},
    {0x0A01,0x0A03},{0x0A3C,0x0A3C},{0x0A3E,0x0A42},{0x0A47,0x0A48},
    {0x0A4B,0x0A4D},{0x0A51,0x0A51},{0x0A70,0x0A71},{0x0A75,0x0A75},
    {0x0A81,0x0A83},{0x0ABC,0x0ABC},{0x0ABE,0x0AC5},{0x0AC7,0x0AC9},
    {0x0ACB,0x0ACD},{0x0AE2,0x0AE3},{0x0B01,0x0B03},{0x0B3C,0x0B3C},
    {0x0B3E,0x0B44},{0x0B47,0x0B48},{0x0B4B,0x0B4D},{0x0B56,0x0B57},
    {0x0B62,0x0B63},{0x0B82,0x0B82},{0x0BBE,0x0BC2},{0x0BC6,0x0BC8},
    {0x0BCA,0x0BCD},{0x0BD7,0x0BD7},{0x0C00,0x0C04},{0x0C3E,0x0C44},
    {0x0C46,0x0C48},{0x0C4A,0x0C4D},{0x0C55,0x0C56},{0x0C62,0x0C63},
    {0x0C81,0x0C83},{0x0CBC,0x0CBC},{0x0CBE,0x0CC4},{0x0CC6,0x0CC8},
    {0x0CCA,0x0CCD},{0x0CD5,0x0CD6},{0x0CE2,0x0CE3},{0x0D01,0x0D03},
    {0x0D3B,0x0D3C},{0x0D3E,0x0D44},{0x0D46,0x0D48},{0x0D4A,0x0D4D},
    {0x0D57,0x0D57},{0x0D62,0x0D63},{0x0D81,0x0D83},{0x0DCA,0x0DCA},
    {0x0DCF,0x0DD4},{0x0DD6,0x0DD6},{0x0DD8,0x0DDF},{0x0DF2,0x0DF3},
    {0x0E31,0x0E31},{0x0E34,0x0E3A},{0x0E47,0x0E4E},{0x0EB1,0x0EB1},
    {0x0EB4,0x0EBC},{0x0EC8,0x0ECD},{0x0F18,0x0F19},{0x0F35,0x0F35},
    {0x0F37,0x0F37},{0x0F39,0x0F39},{0x0F3E,0x0F3F},{0x0F71,0x0F84},
    {0x0F86,0x0F8B},{0x0F90,0x0F95},{0x0F97,0x0F97},{0x0F99,0x0FAD},
    {0x0FB1,0x0FB7},{0x0FB9,0x0FB9},{0x20D0,0x20DC},{0x20E1,0x20E1},
    {0x302A,0x302F},{0x3099,0x309A},{0xFE20,0xFE2F}
};

/* East Asian wide/fullwidth ranges (width 2) */
/* Searched with biSearch(): keep the entries sorted and non-overlapping. */
const range wide[]=
{
    {0x1100,0x115F},{0x2329,0x232A},{0x2E80,0xA4CF},
    {0xAC00,0xD7A3},{0xF900,0xFAFF},{0xFE10,0xFE19},
    {0xFE30,0xFE6F},{0xFF00,0xFF60},{0xFFE0,0xFFE6}
};

/* Width charWidthInConsole() reports for any surrogate code unit. */
#define pcUNI_SURROGATE_WIDTH 2

/**
 * @brief Get display width for a single UTF-16 code unit in a monospace console:
 *   - 0 for NUL, control codes and combining marks
 *   - 2 for East Asian wide/fullwidth characters, surrogates
 *   - 1 otherwise
 * @param ch UTF-16 code unit to measure.
 * @return 0, 1 or 2 as listed above (pcUNI_SURROGATE_WIDTH for surrogates).
 */
inline int charWidthInConsole(char16_t ch)
{
    const unsigned short u=static_cast<unsigned short>(ch);
    // NUL and C0 controls, then DEL and C1 controls
    if(u<0x20) return 0;
    if(u>=0x7f&&u<0xa0) return 0;
    // Surrogates
    if(u>=0xD800&&u<=0xDFFF)
        return pcUNI_SURROGATE_WIDTH;
    // Combining marks are tested before the wide table on purpose: a few
    // (e.g. 0x302A, 0x3099) also fall inside a wide range and must stay 0.
    if(biSearch(u,combining,static_cast<int>(sizeof(combining)/sizeof(combining[0]))))
        return 0;
    // East Asian Wide/Fullwidth -> width 2
    if(biSearch(u,wide,static_cast<int>(sizeof(wide)/sizeof(wide[0]))))
        return 2;
    return 1; // default
}

/**
 * @brief Convert a NUL-terminated UTF-16 string to NUL-terminated UTF-8.
 *
 * A valid surrogate pair is decoded to one code point; an unpaired high or low
 * surrogate is replaced by U+FFFD (REPLACEMENT CHARACTER).
 *
 * @param u16str Source string, NUL-terminated. If null, nothing is written.
 * @param u8str  Destination buffer. If null, nothing is written. Its size is
 *               NOT checked: the caller must provide room for the result plus
 *               the terminator (at most 3 bytes per input code unit, plus 1).
 */
inline void uft16_to_uft8(const char16_t* u16str,char* u8str)
{
    if(!u16str||!u8str) return;
    const char16_t* in=u16str;
    char* out=u8str;
    while(*in!=0)
    {
        const char16_t w1=*in++;
        unsigned int code;
        if(isHighSurrogate(w1))
        {
            // *in may be the terminator here; it is then simply "not a low
            // surrogate" and is left for the loop condition to see.
            const char16_t w2=*in;
            if(isLowSurrogate(w2)) // valid low surrogate
            {
                code=surrogateToCode(w1,w2);
                ++in;
            }
            else // invalid sequence -> replacement char
            {
                code=0xFFFDu;
            }
        }
        else if(isLowSurrogate(w1)) // unexpected low surrogate
        {
            code=0xFFFDu;
        }
        else
        {
            code=w1;
        }

        if(code<=0x7Fu) // 1 byte: 0xxxxxxx
        {
            *out++=static_cast<char>(code);
        }
        else if(code<=0x7FFu) // 2 bytes: 110xxxxx 10xxxxxx
        {
            *out++=static_cast<char>(0xC0u|(code>>6));
            *out++=static_cast<char>(0x80u|(code&0x3Fu));
        }
        else if(code<=0xFFFFu) // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        {
            *out++=static_cast<char>(0xE0u|(code>>12));
            *out++=static_cast<char>(0x80u|((code>>6)&0x3Fu));
            *out++=static_cast<char>(0x80u|(code&0x3Fu));
        }
        else // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        {
            *out++=static_cast<char>(0xF0u|(code>>18));
            *out++=static_cast<char>(0x80u|((code>>12)&0x3Fu));
            *out++=static_cast<char>(0x80u|((code>>6)&0x3Fu));
            *out++=static_cast<char>(0x80u|(code&0x3Fu));
        }
    }
    *out='\0';
}

/**
 * @brief Convert a single UTF-16 code unit to a NUL-terminated UTF-8 string.
 *
 * A surrogate code unit cannot be decoded on its own and yields U+FFFD; a NUL
 * code unit yields an empty string.
 *
 * @param u16ch Code unit to convert.
 * @param u8str Destination buffer of at least 4 bytes (up to 3 encoded bytes
 *              plus the terminator). If null, nothing is written.
 */
inline void uft16_to_uft8(const char16_t u16ch,char* u8str)
{
    const char16_t s[2]={u16ch,u'\0'};
    uft16_to_uft8(s,u8str);
}

} // namespace pcuni