123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293 |
- // UTF-8 code points reader
- //
- // Platform: ISO C++ 98/11
- // $Id$
- //
- // (c) __vic 2013
- #ifndef __VIC_UTF8_READER_H
- #define __VIC_UTF8_READER_H
- #include<__vic/defs.h>
- #include<__vic/unicode.h>
- #include<__vic/utf8/defs.h>
- #include<__vic/utf8/status.h>
- #include<__vic/bits.h>
- namespace __vic { namespace utf8 {
- //////////////////////////////////////////////////////////////////////////////
- template<class ByteReader>
- class reader
- {
- ByteReader r;
- bool read_byte(unsigned char &b) { return r.read(b); }
- public:
- typedef ByteReader byte_reader_type;
- ByteReader &get_byte_reader() { return r; }
- const ByteReader &get_byte_reader() const { return r; }
- #if __cpp_variadic_templates && __cpp_rvalue_references
- template<class... Args>
- explicit reader(Args&&... args) : r(std::forward<Args>(args)...) {}
- #else
- reader() {}
- explicit reader(ByteReader r) : r(r) {}
- #endif
- status_t parse(unicode_t & );
- bool read(unicode_t &cp) { return throw_if_error(parse(cp)); }
- };
- //////////////////////////////////////////////////////////////////////////////
- //----------------------------------------------------------------------------
- template<class ByteReader>
- status_t reader<ByteReader>::parse(unicode_t &cp)
- {
- unsigned char b;
- if(!read_byte(b)) return status::eof;
- // Two short paths for the most frequent cases and generic case
- if((b & 0x80) == 0) cp = b; // 0xxxxxxx - 1 byte
- else if((b & 0xE0) == 0xC0) // 110xxxxx - 2 bytes
- {
- unicode_t ch = (b & 0x1F) << 6;
- if(!read_byte(b) || !is_continuation_byte(b))
- return status::truncated_code_point;
- ch |= b & 0x3F;
- if(ch < 0x80) return status::overlong_encoding;
- cp = ch;
- }
- else if((b & 0xE0) == 0xE0 // 111zzzzx - 3 or more bytes
- && (b & 0x1E) != 0x1E) // at least one z is 0
- {
- unsigned seqlen = 3;
- for(unsigned char mask = 0x10; b & mask; mask >>= 1)
- seqlen++; // count leading 1-bits
- if(seqlen > 4) return status::code_point_too_big;
- // bits from the starting byte
- unicode_t ch = __vic::get_lsbs(b, 7 - seqlen);
- for(int i = seqlen; --i;) // continuation bytes
- {
- if(!read_byte(b) || !is_continuation_byte(b))
- return status::truncated_code_point;
- ch <<= 6;
- ch |= b & 0x3F;
- }
- if(ch < length_thresholds[seqlen-2])
- return status::overlong_encoding;
- cp = ch;
- }
- else return status::no_leading_byte; // not a start byte
- return status::ok;
- }
- //----------------------------------------------------------------------------
- template<class ByteReader>
- inline reader<ByteReader> make_reader(ByteReader r)
- {
- return reader<ByteReader>(r);
- }
- //----------------------------------------------------------------------------
- }} // namespace
- #endif // header guard
|