reader.h 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
  1. // UTF-8 code points reader
  2. //
  3. // Platform: ISO C++ 98/11
  4. // $Id$
  5. //
  6. // (c) __vic 2013
  7. #ifndef __VIC_UTF8_READER_H
  8. #define __VIC_UTF8_READER_H
  9. #include<__vic/defs.h>
  10. #include<__vic/unicode.h>
  11. #include<__vic/utf8/defs.h>
  12. #include<__vic/utf8/status.h>
  13. #include<__vic/bits.h>
  14. namespace __vic { namespace utf8 {
  15. //////////////////////////////////////////////////////////////////////////////
  16. template<class ByteReader>
  17. class reader
  18. {
  19. ByteReader r;
  20. bool read_byte(unsigned char &b) { return r.read(b); }
  21. public:
  22. typedef ByteReader byte_reader_type;
  23. ByteReader &get_byte_reader() { return r; }
  24. const ByteReader &get_byte_reader() const { return r; }
  25. #if __cpp_variadic_templates && __cpp_rvalue_references
  26. template<class... Args>
  27. explicit reader(Args&&... args) : r(std::forward<Args>(args)...) {}
  28. #else
  29. reader() {}
  30. explicit reader(ByteReader r) : r(r) {}
  31. #endif
  32. status_t parse(unicode_t & );
  33. bool read(unicode_t &cp) { return throw_if_error(parse(cp)); }
  34. };
  35. //////////////////////////////////////////////////////////////////////////////
  36. //----------------------------------------------------------------------------
  37. template<class ByteReader>
  38. status_t reader<ByteReader>::parse(unicode_t &cp)
  39. {
  40. unsigned char b;
  41. if(!read_byte(b)) return status::eof;
  42. // Two short paths for the most frequent cases and generic case
  43. if((b & 0x80) == 0) cp = b; // 0xxxxxxx - 1 byte
  44. else if((b & 0xE0) == 0xC0) // 110xxxxx - 2 bytes
  45. {
  46. unicode_t ch = (b & 0x1F) << 6;
  47. if(!read_byte(b) || !is_continuation_byte(b))
  48. return status::truncated_code_point;
  49. ch |= b & 0x3F;
  50. if(ch < 0x80) return status::overlong_encoding;
  51. cp = ch;
  52. }
  53. else if((b & 0xE0) == 0xE0 // 111zzzzx - 3 or more bytes
  54. && (b & 0x1E) != 0x1E) // at least one z is 0
  55. {
  56. unsigned seqlen = 3;
  57. for(unsigned char mask = 0x10; b & mask; mask >>= 1)
  58. seqlen++; // count leading 1-bits
  59. if(seqlen > 4) return status::code_point_too_big;
  60. // bits from the starting byte
  61. unicode_t ch = __vic::get_lsbs(b, 7 - seqlen);
  62. for(int i = seqlen; --i;) // continuation bytes
  63. {
  64. if(!read_byte(b) || !is_continuation_byte(b))
  65. return status::truncated_code_point;
  66. ch <<= 6;
  67. ch |= b & 0x3F;
  68. }
  69. if(ch < length_thresholds[seqlen-2])
  70. return status::overlong_encoding;
  71. cp = ch;
  72. }
  73. else return status::no_leading_byte; // not a start byte
  74. return status::ok;
  75. }
  76. //----------------------------------------------------------------------------
  77. template<class ByteReader>
  78. inline reader<ByteReader> make_reader(ByteReader r)
  79. {
  80. return reader<ByteReader>(r);
  81. }
  82. //----------------------------------------------------------------------------
  83. }} // namespace
  84. #endif // header guard