utf8.cpp 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
  1. #include<__vic/utf8/reader.h>
  2. #include<__vic/utf8/writer.h>
  3. #include<__vic/utf8/exceptions.h>
  4. #include<__vic/readers/string.h>
  5. #include<__vic/readers/cstring.h>
  6. #include<__vic/writers/string.h>
  7. #include<__vic/iterator.h>
  8. #include<string>
  9. #include<iostream>
  10. #include<exception>
  11. #include<cassert>
  12. namespace tests {
  13. using __vic::unicode_t;
  14. #if __cpp_variadic_templates && __cpp_rvalue_references
  15. typedef __vic::utf8::reader<__vic::string_reader> utf8_string_reader;
  16. typedef __vic::utf8::writer<__vic::string_writer> utf8_string_writer;
  17. typedef __vic::utf8::reader<__vic::cstring_reader> utf8_cstring_reader;
  18. #else
  19. struct utf8_string_reader : __vic::utf8::reader<__vic::string_reader>
  20. {
  21. explicit utf8_string_reader(const std::string &s)
  22. : __vic::utf8::reader<__vic::string_reader>(__vic::string_reader(s)) {}
  23. };
  24. struct utf8_string_writer : __vic::utf8::writer<__vic::string_writer>
  25. {
  26. explicit utf8_string_writer(std::string &s)
  27. : __vic::utf8::writer<__vic::string_writer>(__vic::string_writer(s)) {}
  28. };
  29. struct utf8_cstring_reader : __vic::utf8::reader<__vic::cstring_reader>
  30. {
  31. explicit utf8_cstring_reader(const char *s)
  32. : __vic::utf8::reader<__vic::cstring_reader>(__vic::cstring_reader(s)) {}
  33. };
  34. #endif
  35. //----------------------------------------------------------------------------
  36. void read_write()
  37. {
  38. // Unicode code points for u8"Я люблю UTF-8"
  39. const unicode_t str[] = { 0x042F, 0x20,
  40. 0x043B, 0x044E, 0x0431, 0x043B, 0x044E, 0x20,
  41. 0x55, 0x54, 0x46, 0x2D, 0x38
  42. };
  43. // The same string in UTF-8
  44. const char check[] = "\xD0\xAF" "\x20"
  45. "\xD0\xBB\xD1\x8E\xD0\xB1\xD0\xBB\xD1\x8E" "\x20"
  46. "\x55\x54\x46\x2D\x38";
  47. std::string s;
  48. utf8_string_writer w(s);
  49. for(const unicode_t *p = str; p != __vic::end(str); p++)
  50. w.write(*p);
  51. assert(s == check);
  52. utf8_string_reader r(s);
  53. const unicode_t *p = str;
  54. size_t n = __vic::array_size(str);
  55. for(unicode_t ch; r.read(ch); p++, n--)
  56. {
  57. assert(n != 0);
  58. assert(ch == *p);
  59. }
  60. assert(n == 0); // all str elements are read
  61. }
  62. //----------------------------------------------------------------------------
  63. void long_code_point()
  64. {
  65. const unicode_t euro = 0x20AC; // euro sign
  66. const char euro_utf8[] = "\xE2\x82\xAC";
  67. utf8_cstring_reader r(euro_utf8);
  68. unicode_t ch;
  69. r.read(ch);
  70. assert(ch == euro);
  71. }
  72. //----------------------------------------------------------------------------
  73. template<class UTF8Reader>
  74. bool is_valid(UTF8Reader r)
  75. {
  76. unicode_t cp;
  77. for(;;)
  78. switch(r.parse(cp))
  79. {
  80. case __vic::utf8::status::ok: break;
  81. case __vic::utf8::status::eof: return true;
  82. default: return false;
  83. }
  84. }
  85. //----------------------------------------------------------------------------
  86. void valid_encoding_test()
  87. {
  88. const char str[] = "\xD0\xB4""\xD0\xB0""\x20""\x6F""\x6B"; // u8"да ok"
  89. assert(is_valid(utf8_cstring_reader(str)));
  90. }
  91. //----------------------------------------------------------------------------
  92. template<class UTF8Reader>
  93. __vic::utf8::status_t parse_utf8(UTF8Reader &r)
  94. {
  95. unicode_t cp;
  96. for(;;)
  97. switch(__vic::utf8::status_t st = r.parse(cp))
  98. {
  99. case __vic::utf8::status::ok: break;
  100. case __vic::utf8::status::eof: return __vic::utf8::status::ok;
  101. default: return st;
  102. }
  103. }
  104. //----------------------------------------------------------------------------
  105. size_t offset(const utf8_cstring_reader &r, const char *begin)
  106. {
  107. return r.get_byte_reader().position() - begin;
  108. }
  109. //----------------------------------------------------------------------------
  110. void code_point_too_big_test()
  111. {
  112. const char str[] = "\xD0\xB4""\xD0\xB0""\x20""\x6F""\x6B"
  113. "\xFC\x83\xBF\xBF\xBF\xBF"; // <- 6 bytes length code point
  114. utf8_cstring_reader r(str);
  115. assert(parse_utf8(r) == __vic::utf8::status::code_point_too_big);
  116. assert(offset(r, str) >= 8);
  117. }
  118. //----------------------------------------------------------------------------
  119. void truncated_code_point_test()
  120. {
  121. const char str[] = "\xD0" // <- truncated code point
  122. "\xD0\xB0""\x20""\x6F""\x6B";
  123. utf8_cstring_reader r(str);
  124. assert(parse_utf8(r) == __vic::utf8::status::truncated_code_point);
  125. assert(offset(r, str) == 2);
  126. }
  127. //----------------------------------------------------------------------------
  128. void no_leading_byte_test()
  129. {
  130. const char str[] = "\xD0\xB4""\xB0" // <- continuation byte first
  131. "\x20""\x6F""\x6B";
  132. utf8_cstring_reader r(str);
  133. assert(parse_utf8(r) == __vic::utf8::status::no_leading_byte);
  134. assert(offset(r, str) == 3);
  135. }
  136. //----------------------------------------------------------------------------
  137. void overlong_encoding_test()
  138. {
  139. const char str[] = "\xD0\xB4""\xD0\xB0""\x20""\x6F""\x6B"
  140. "\xF0\x80\x80\xAF"; // <- overlong encoding of U+002F
  141. utf8_cstring_reader r(str);
  142. assert(parse_utf8(r) == __vic::utf8::status::overlong_encoding);
  143. assert(offset(r, str) >= 8);
  144. }
  145. //----------------------------------------------------------------------------
  146. template<class UTF8Reader>
  147. size_t code_point_count(UTF8Reader r)
  148. {
  149. size_t len = 0;
  150. unicode_t cp;
  151. while(r.read(cp)) len++;
  152. return len;
  153. }
  154. //----------------------------------------------------------------------------
  155. void code_point_count_test()
  156. {
  157. const char str[] = "\xD0\xB4""\xD0\xB0""\x20""\x6F""\x6B"; // u8"да ok"
  158. assert(code_point_count(utf8_cstring_reader(str)) == 5);
  159. const char str_with_nul[] = { 'A', '\0', 'B' };
  160. assert(code_point_count(utf8_string_reader(
  161. std::string(str_with_nul, sizeof str_with_nul))) == 3);
  162. }
  163. //----------------------------------------------------------------------------
  164. size_t code_point_length(const char *s)
  165. {
  166. utf8_cstring_reader r(s);
  167. unicode_t cp;
  168. if(r.read(cp)) return offset(r, s);
  169. return 0;
  170. }
  171. //----------------------------------------------------------------------------
  172. void code_point_length_test()
  173. {
  174. const char ch1[] = ".",
  175. ch2[] = "\xD0\x90", // UTF-8: А
  176. ch3[] = "\xE2\x82\xAC", // euro sign
  177. ch4[] = "\xF0"; // First byte of the 4-byte-length char
  178. assert(code_point_length(ch1) == 1);
  179. assert(code_point_length(ch2) == 2);
  180. assert(code_point_length(ch3) == 3);
  181. try
  182. {
  183. code_point_length(ch4);
  184. assert(false);
  185. }
  186. catch(const __vic::utf8::truncated_code_point & ) {} // OK
  187. catch(...) { assert(false); }
  188. }
  189. //----------------------------------------------------------------------------
  190. std::string replace_invalid(const char *str, unicode_t ch)
  191. {
  192. utf8_cstring_reader r(str);
  193. std::string res;
  194. bool skip_continuation_bytes = false;
  195. for(;;)
  196. {
  197. const char *begin = r.get_byte_reader().position();
  198. unicode_t cp;
  199. switch(r.parse(cp))
  200. {
  201. case __vic::utf8::status::ok:
  202. res.append(begin, r.get_byte_reader().position() - begin);
  203. skip_continuation_bytes = false;
  204. break;
  205. case __vic::utf8::status::eof:
  206. return res;
  207. case __vic::utf8::status::no_leading_byte:
  208. if(skip_continuation_bytes) break;
  209. // [[fallthrough]];
  210. default:
  211. utf8_string_writer(res).write(ch);
  212. skip_continuation_bytes = true;
  213. }
  214. }
  215. }
  216. //----------------------------------------------------------------------------
  217. void replace_invalid_test()
  218. {
  219. // The first is a Euro sign w/o leading byte
  220. const char str[] = /*"\xE2"*/"\x82\xAC""xz";
  221. assert(!is_valid(utf8_cstring_reader(str)));
  222. std::string fixed = replace_invalid(str, '?');
  223. std::cout << fixed << '\n';
  224. assert(fixed == "?xz");
  225. assert(is_valid(utf8_string_reader(fixed)));
  226. }
  227. //----------------------------------------------------------------------------
  228. void run()
  229. {
  230. read_write();
  231. long_code_point();
  232. valid_encoding_test();
  233. code_point_too_big_test();
  234. truncated_code_point_test();
  235. no_leading_byte_test();
  236. overlong_encoding_test();
  237. code_point_count_test();
  238. code_point_length_test();
  239. replace_invalid_test();
  240. }
  241. //----------------------------------------------------------------------------
  242. } // namespace
  243. int main()
  244. {
  245. try
  246. {
  247. tests::run();
  248. return 0;
  249. }
  250. catch(const std::exception &ex)
  251. {
  252. std::cerr << ex.what() << '\n';
  253. }
  254. return 1;
  255. }