// Unicode constants.
// Leading (high) surrogates occupy 0xd800 - 0xdbff,
// trailing (low) surrogates occupy 0xdc00 - 0xdfff.
const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;

// LEAD_SURROGATE_MIN - 0x40 == 0xd7c0. The cast makes the int -> uint16_t
// narrowing explicit; the value always fits in 16 bits.
const uint16_t LEAD_OFFSET = static_cast<uint16_t>(LEAD_SURROGATE_MIN - (0x10000 >> 10));

// 0x10000 - (0xd800 << 10) - 0xdc00, evaluated entirely in unsigned 32-bit
// arithmetic. The subtraction intentionally wraps around (result 0xfca02400),
// so that (lead << 10) + trail + SURROGATE_OFFSET yields the code point.
const uint32_t SURROGATE_OFFSET =
    0x10000u - (static_cast<uint32_t>(LEAD_SURROGATE_MIN) << 10) - TRAIL_SURROGATE_MIN;

// Maximum valid value for a Unicode code point.
const uint32_t CODE_POINT_MAX = 0x0010ffffu;
// Reduces an integral value to its low eight bits.
// Needed because `char` may be signed: a byte such as 0xef would otherwise
// be sign-extended when promoted to int before comparisons and shifts.
template <typename octet_type>
inline uint8_t mask8(octet_type oc)
{
    return static_cast<uint8_t>(0xff & oc);
}
// Reduces an integral value to its low sixteen bits, so that a signed or
// wider code unit can be compared safely against the 16-bit constants.
template <typename u16_type>
inline uint16_t mask16(u16_type oc)
{
    return static_cast<uint16_t>(0xffff & oc);
}
// Returns true when the low eight bits of `oc` have the form 10xxxxxx,
// i.e. the octet is a UTF-8 continuation ("trail") byte.
template <typename octet_type>
inline bool is_trail(octet_type oc)
{
    return ((mask8(oc) >> 6) == 0x2);
}
68 template <
typename u16>
69 inline bool is_lead_surrogate(u16 cp)
71 return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
74 template <
typename u16>
75 inline bool is_trail_surrogate(u16 cp)
77 return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
80 template <
typename u16>
81 inline bool is_surrogate(u16 cp)
83 return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
86 template <
typename u32>
87 inline bool is_code_point_valid(u32 cp)
89 return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
// Classifies the octet at `lead_it` by its high-order bits, after reducing it
// to eight bits with mask8. The result type is the iterator's difference_type.
// NOTE(review): the first branch of the if/else chain and every return
// statement are missing from this listing; presumably each branch returns the
// byte length of the sequence introduced by that lead octet -- confirm
// against the complete file.
92 template <
typename octet_iterator>
93 inline typename std::iterator_traits<octet_iterator>::difference_type
94 sequence_length(octet_iterator lead_it)
96 uint8_t lead = mask8(*lead_it);
// Top three bits are 110.
99 else if ((lead >> 5) == 0x6)
// Top four bits are 1110.
101 else if ((lead >> 4) == 0xe)
// Top five bits are 11110.
103 else if ((lead >> 3) == 0x1e)
// Takes a decoded code point `cp` and the `length` of the octet sequence it
// was decoded from. The visible branches bucket `cp` by the thresholds 0x800
// and 0x10000.
// NOTE(review): the first branch and all branch bodies are missing from this
// listing; presumably each bucket compares `length` with the shortest
// encoding for that range and reports true when `length` is larger -- confirm.
109 template <
typename octet_difference_type>
110 inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
116 else if (cp < 0x800) {
120 else if (cp < 0x10000) {
// Status codes produced by the decoding/validation helpers in this file.
// Enumerator order (and therefore the numeric values 0..5) is unchanged.
enum utf_error {
    UTF8_OK,              // no error
    NOT_ENOUGH_ROOM,      // initial status of the get_sequence_* helpers
    INVALID_LEAD,         // not assigned anywhere in the visible code
    INCOMPLETE_SEQUENCE,  // assigned by the multi-octet get_sequence_* helpers
    OVERLONG_SEQUENCE,    // assigned by validate_next (overlong check)
    INVALID_CODE_POINT    // assigned by validate_next (code point check)
};
// Handles a one-octet sequence: stores the octet at `it` (reduced to eight
// bits with mask8) through `code_point`. `it` is taken by reference.
// NOTE(review): the conditions guarding the store and the NOT_ENOUGH_ROOM
// return are missing from this listing, as is any success return -- confirm
// against the complete file.
132 template <
typename octet_iterator>
133 utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point)
137 *code_point = mask8(*it);
140 return NOT_ENOUGH_ROOM;
// Decodes a two-octet sequence starting at `it` (taken by reference).
// NOTE(review): the iterator advances, range checks, the store through
// `code_point` and the return are missing from this listing; only the
// arithmetic and the status assignments below are visible.
143 template <
typename octet_iterator>
144 utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point)
// Status starts out as NOT_ENOUGH_ROOM.
146 utf_error ret_code = NOT_ENOUGH_ROOM;
// Start from the octet at `it`, reduced to eight bits.
149 uint32_t cp = mask8(*it);
// Low five bits of the first octet become bits 6..10; the low six bits of
// the octet now at `it` become bits 0..5.
152 cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
159 ret_code = INCOMPLETE_SEQUENCE;
162 ret_code = NOT_ENOUGH_ROOM;
// Decodes a three-octet sequence starting at `it` (taken by reference).
// NOTE(review): the iterator advances, range checks, the contribution of the
// final octet, the store through `code_point` and the return are missing
// from this listing; only the lines below are visible.
168 template <
typename octet_iterator>
169 utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point)
// Status starts out as NOT_ENOUGH_ROOM.
171 utf_error ret_code = NOT_ENOUGH_ROOM;
// Start from the octet at `it`, reduced to eight bits.
174 uint32_t cp = mask8(*it);
// Low four bits of the first octet become bits 12..15; the low six bits of
// the octet now at `it` become bits 6..11.
177 cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
// Two INCOMPLETE_SEQUENCE / NOT_ENOUGH_ROOM pairs follow; the conditions
// selecting between them are not visible here.
187 ret_code = INCOMPLETE_SEQUENCE;
190 ret_code = NOT_ENOUGH_ROOM;
193 ret_code = INCOMPLETE_SEQUENCE;
196 ret_code = NOT_ENOUGH_ROOM;
// Decodes a four-octet sequence starting at `it` (taken by reference).
// NOTE(review): the iterator advances, range checks, the contribution of the
// final octet, the store through `code_point` and the return are missing
// from this listing; only the lines below are visible.
202 template <
typename octet_iterator>
203 utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point)
// Status starts out as NOT_ENOUGH_ROOM.
205 utf_error ret_code = NOT_ENOUGH_ROOM;
// Start from the octet at `it`, reduced to eight bits.
208 uint32_t cp = mask8(*it);
// Low three bits of the first octet become bits 18..20; the low six bits of
// the octet now at `it` become bits 12..17.
211 cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
// Low six bits of the octet now at `it` are added as bits 6..11.
214 cp += (mask8(*it) << 6) & 0xfff;
// Three INCOMPLETE_SEQUENCE / NOT_ENOUGH_ROOM pairs follow; the conditions
// selecting between them are not visible here.
224 ret_code = INCOMPLETE_SEQUENCE;
227 ret_code = NOT_ENOUGH_ROOM;
230 ret_code = INCOMPLETE_SEQUENCE;
233 ret_code = NOT_ENOUGH_ROOM;
236 ret_code = INCOMPLETE_SEQUENCE;
239 ret_code = NOT_ENOUGH_ROOM;
// Validates the sequence starting at `it` (taken by reference) and reports a
// utf_error status. Visible steps: remember the starting position, obtain the
// expected length from sequence_length, decode with the matching
// get_sequence_N helper, then check the decoded value with
// is_code_point_valid and is_overlong_sequence.
// NOTE(review): the declaration of `cp`, the dispatch on `length`, the use of
// `original_it`, the store through `code_point` and the return are missing
// from this listing. The two-argument overload passes 0 for `code_point`, so
// the elided code presumably tolerates a null pointer -- confirm.
245 template <
typename octet_iterator>
246 utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
// Copy of the starting position, taken before any decoding.
250 octet_iterator original_it = it;
254 typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
255 octet_difference_type length = sequence_length(it);
260 utf_error err = UTF8_OK;
// One decoder per sequence length; each receives `it` by reference and
// writes the decoded value into the local `cp`.
263 err = get_sequence_1(it, end, &cp);
266 err = get_sequence_2(it, end, &cp);
269 err = get_sequence_3(it, end, &cp);
272 err = get_sequence_4(it, end, &cp);
// Post-decoding checks run only when decoding itself succeeded.
276 if (err == UTF8_OK) {
278 if (is_code_point_valid(cp)) {
279 if (!is_overlong_sequence(cp, length)){
287 err = OVERLONG_SEQUENCE;
290 err = INVALID_CODE_POINT;
298 template <
typename octet_iterator>
299 inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
300 return validate_next(it, end, 0);
// The three octets of the UTF-8 encoded byte order mark (U+FEFF).
const uint8_t bom[] = {0xef, 0xbb, 0xbf};
// Scans [start, end) by calling internal::validate_next repeatedly.
// validate_next takes its iterator by reference, so `result` is the cursor
// that the calls operate on.
// NOTE(review): the action taken on a non-OK status and the final return are
// missing from this listing; presumably the function returns an iterator to
// the first invalid octet, or `end` when none is found -- confirm.
310 template <
typename octet_iterator>
311 octet_iterator find_invalid(octet_iterator start, octet_iterator end)
313 octet_iterator result = start;
314 while (result != end) {
315 internal::utf_error err_code = internal::validate_next(result, end);
316 if (err_code != internal::UTF8_OK)
// Returns true when find_invalid reports `end` for [start, end), i.e. no
// invalid sequence was found in the range.
template <typename octet_iterator>
inline bool is_valid(octet_iterator start, octet_iterator end)
{
    return (find_invalid(start, end) == end);
}
// Compares up to three octets starting at `it` with bom[0..2]. Each octet is
// reduced to eight bits with internal::mask8 before comparison, and every
// dereference is preceded by an `it != end` check, so ranges shorter than
// three octets are not read past `end`.
// NOTE(review): the enclosing `return (...)` is missing from this listing.
328 template <
typename octet_iterator>
329 inline bool starts_with_bom (octet_iterator it, octet_iterator end)
332 ((it != end) && (internal::mask8(*it++)) == bom[0]) &&
333 ((it != end) && (internal::mask8(*it++)) == bom[1]) &&
334 ((it != end) && (internal::mask8(*it)) == bom[2])
// Compares the octets at `it` with bom[0..2], reducing each to eight bits
// with internal::mask8. Because of && short-circuiting, the second and third
// octets are read only if the preceding ones matched.
// NOTE(review): there is no end iterator, so no bounds check is possible
// here; the caller must guarantee that the octets read are dereferenceable.
// starts_with_bom performs the same comparison with end checks.
// NOTE(review): the enclosing `return (...)` is missing from this listing.
339 template <
typename octet_iterator>
340 inline bool is_bom (octet_iterator it)
343 (internal::mask8(*it++)) == bom[0] &&
344 (internal::mask8(*it++)) == bom[1] &&
345 (internal::mask8(*it)) == bom[2]