/*win32utf8.c: best-effort UTF-8 command-line support for Windows.*/

  1. #if defined(_WIN32)
  2. # include <stdio.h>
  3. # include <stdlib.h>
  4. # include <wchar.h>
  5. /*We need the following two to set stdin/stdout to binary.*/
  6. # include <io.h>
  7. # include <fcntl.h>
  8. # define WIN32_LEAN_AND_MEAN
  9. # define WIN32_EXTRA_LEAN
  10. # include <windows.h>
  11. # include "win32utf8.h"
  12. static char *utf16_to_utf8(const wchar_t *_src){
  13. char *dst;
  14. size_t len;
  15. size_t si;
  16. size_t di;
  17. len=wcslen(_src);
  18. dst=(char *)malloc(sizeof(*dst)*(3*len+1));
  19. if(dst==NULL)return dst;
  20. for(di=si=0;si<len;si++){
  21. unsigned c0;
  22. c0=_src[si];
  23. if(c0<0x80){
  24. /*Can be represented by a 1-byte sequence.*/
  25. dst[di++]=(char)c0;
  26. continue;
  27. }
  28. else if(c0<0x800){
  29. /*Can be represented by a 2-byte sequence.*/
  30. dst[di++]=(char)(0xC0|c0>>6);
  31. dst[di++]=(char)(0x80|c0&0x3F);
  32. continue;
  33. }
  34. else if(c0>=0xD800&&c0<0xDC00){
  35. unsigned c1;
  36. /*This is safe, because c0 was not 0 and _src is NUL-terminated.*/
  37. c1=_src[si+1];
  38. if(c1>=0xDC00&&c1<0xE000){
  39. unsigned w;
  40. /*Surrogate pair.*/
  41. w=((c0&0x3FF)<<10|c1&0x3FF)+0x10000;
  42. /*Can be represented by a 4-byte sequence.*/
  43. dst[di++]=(char)(0xF0|w>>18);
  44. dst[di++]=(char)(0x80|w>>12&0x3F);
  45. dst[di++]=(char)(0x80|w>>6&0x3F);
  46. dst[di++]=(char)(0x80|w&0x3F);
  47. si++;
  48. continue;
  49. }
  50. }
  51. /*Anything else is either a valid 3-byte sequence, an invalid surrogate
  52. pair, or 'not a character'.
  53. In the latter two cases, we just encode the value as a 3-byte
  54. sequence anyway (producing technically invalid UTF-8).
  55. Later error handling will detect the problem, with a better
  56. chance of giving a useful error message.*/
  57. dst[di++]=(char)(0xE0|c0>>12);
  58. dst[di++]=(char)(0x80|c0>>6&0x3F);
  59. dst[di++]=(char)(0x80|c0&0x3F);
  60. }
  61. dst[di++]='\0';
  62. return dst;
  63. }
  64. typedef LPWSTR *(APIENTRY *command_line_to_argv_w_func)(LPCWSTR cmd_line,
  65. int *num_args);
  66. /*Make a best-effort attempt to support UTF-8 on Windows.*/
  67. void win32_utf8_setup(int *_argc,const char ***_argv){
  68. HMODULE hlib;
  69. /*We need to set stdin/stdout to binary mode.
  70. This is unrelated to UTF-8 support, but it's platform specific and we need
  71. to do it in the same places.*/
  72. _setmode(_fileno(stdin),_O_BINARY);
  73. _setmode(_fileno(stdout),_O_BINARY);
  74. hlib=LoadLibraryA("shell32.dll");
  75. if(hlib!=NULL){
  76. command_line_to_argv_w_func command_line_to_argv_w;
  77. /*This function is only available on Windows 2000 or later.*/
  78. command_line_to_argv_w=(command_line_to_argv_w_func)GetProcAddress(hlib,
  79. "CommandLineToArgvW");
  80. if(command_line_to_argv_w!=NULL){
  81. wchar_t **argvw;
  82. int argc;
  83. argvw=(*command_line_to_argv_w)(GetCommandLineW(),&argc);
  84. if(argvw!=NULL){
  85. int ai;
  86. /*Really, I don't see why argc would ever differ from *_argc, but let's
  87. be paranoid.*/
  88. if(argc>*_argc)argc=*_argc;
  89. for(ai=0;ai<argc;ai++){
  90. char *argv;
  91. argv=utf16_to_utf8(argvw[ai]);
  92. if(argv!=NULL)(*_argv)[ai]=argv;
  93. }
  94. *_argc=argc;
  95. LocalFree(argvw);
  96. }
  97. }
  98. FreeLibrary(hlib);
  99. }
  100. # if defined(CP_UTF8)
  101. /*This does not work correctly in all environments (it breaks output in
  102. mingw32 for me), and requires a Unicode font (e.g., when using the default
  103. Raster font, even characters that are available in the font's codepage
  104. won't display properly).*/
  105. /*SetConsoleOutputCP(CP_UTF8);*/
  106. # endif
  107. }
  108. #endif