convert.cpp 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263
  1. /*
  2. * Copyright 2005 - 2016 Zarafa and its licensors
  3. *
  4. * This program is free software: you can redistribute it and/or modify
  5. * it under the terms of the GNU Affero General Public License, version 3,
  6. * as published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU Affero General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Affero General Public License
  14. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. *
  16. */
  17. #include <kopano/platform.h>
  18. #include <kopano/charset/convert.h>
  19. #include <mapicode.h>
  20. #include <numeric>
  21. #include <vector>
  22. #include <stdexcept>
  23. #include <string>
  24. #include <kopano/stringutil.h>
  25. #include <cerrno>
  26. #define BUFSIZE 4096
  27. namespace KC {
  28. convert_exception::convert_exception(enum exception_type type, const std::string &message)
  29. : std::runtime_error(message)
  30. , m_type(type)
  31. {}
  32. unknown_charset_exception::unknown_charset_exception(const std::string &message)
  33. : convert_exception(eUnknownCharset, message)
  34. {}
  35. illegal_sequence_exception::illegal_sequence_exception(const std::string &message)
  36. : convert_exception(eIllegalSequence, message)
  37. {}
  38. namespace details {
  39. HRESULT HrFromException(const convert_exception &ce)
  40. {
  41. switch (ce.type()) {
  42. case convert_exception::eUnknownCharset: return MAPI_E_NOT_FOUND;
  43. case convert_exception::eIllegalSequence: return MAPI_E_INVALID_PARAMETER;
  44. default: return MAPI_E_CALL_FAILED;
  45. }
  46. }
  // HACK: the prototype of iconv() may differ depending on the compiler and/or
  // system: its second parameter may or may not be 'const'. Wrapping the input
  // pointer in this class gives every call site one common spelling, a kind of
  // "iconv_cast", that converts to whichever pointer type is required.
  class ICONV_HACK {
  private:
      const char **m_src;

  public:
      ICONV_HACK(const char **ptr) :
          m_src(ptr)
      {
      }

      // The compiler picks whichever conversion matches the parameter type.
      operator const char **() const
      {
          return m_src;
      }

      // Same pointer with the inner const qualifier cast away.
      operator char **()
      {
          return const_cast<char **>(m_src);
      }
  };
  59. /**
  60. * Constructor for iconv_context_base
  61. *
  62. * The conversion context for iconv charset conversions takes a fromcode and a tocode,
  63. * which are the source and destination charsets, respectively. The 'tocode' may take
  64. * some extra options, separated with '//' from the charset, and then separated by commas
  65. *
  66. * This function accepts values accepted by GNU iconv:
  67. *
  68. * iso-8859-1//TRANSLIT,IGNORE
  69. * windows-1252//TRANSLIT
  70. *
  71. * The 'fromcode' can also take modifiers but they are ignored by iconv.
  72. *
  73. * Also, instead of IGNORE, the HTMLENTITY modifier can be used, eg:
  74. *
  75. * iso-8859-1//HTMLENTITY
  76. *
  77. * This works much like TRANSLIT, except that characters that cannot be represented in the
  78. * output character set are not represented by '?' but by the HTML entity '&#xxxx;'. This is useful
  79. * for generating HTML in which as many characters as possible are directly represented, but
  80. * other characters are represented by an HTML entity. Note: the HTMLENTITY modifier may only
  81. * be applied when the fromcode is CHARSET_WCHAR (this is purely an implementation limitation)
  82. *
  83. * Release builds default to //IGNORE (due to -DFORCE_CHARSET_CONVERSION
  84. * added by ./configure --enable-release), while debug builds default
  85. * to //NOIGNORE.
  86. *
  87. * @param tocode Destination charset
  88. * @param fromcode Source charset
  89. */
  90. iconv_context_base::iconv_context_base(const char* tocode, const char* fromcode)
  91. {
  92. /* Ignore illegal sequences by default. */
  93. m_bForce = true;
  94. m_bHTML = false;
  95. std::string strto = tocode;
  96. size_t pos = strto.find("//");
  97. if(pos != std::string::npos) {
  98. std::string options = strto.substr(pos+2);
  99. strto = strto.substr(0,pos);
  100. std::vector<std::string> vOptions = tokenize(options, ",");
  101. std::vector<std::string> vOptionsFiltered;
  102. std::vector<std::string>::const_iterator i;
  103. i = vOptions.begin();
  104. while(i != vOptions.end()) {
  105. if (*i == "IGNORE" || *i == "FORCE")
  106. m_bForce = true;
  107. else if (*i == "NOIGNORE" || *i == "NOFORCE")
  108. m_bForce = false;
  109. else if (*i == "HTMLENTITIES" && strcasecmp(fromcode, CHARSET_WCHAR) == 0)
  110. m_bHTML = true;
  111. else
  112. vOptionsFiltered.push_back(*i);
  113. ++i;
  114. }
  115. if(!vOptionsFiltered.empty()) {
  116. strto += "//";
  117. strto += join(vOptionsFiltered.begin(), vOptionsFiltered.end(), std::string(","));
  118. }
  119. }
  120. m_cd = iconv_open(strto.c_str(), fromcode);
  121. if (m_cd == (iconv_t)(-1))
  122. throw unknown_charset_exception(strerror(errno));
  123. }
  124. iconv_context_base::~iconv_context_base()
  125. {
  126. if (m_cd != (iconv_t)(-1))
  127. iconv_close(m_cd);
  128. }
  129. void iconv_context_base::doconvert(const char *lpFrom, size_t cbFrom)
  130. {
  131. char buf[BUFSIZE];
  132. const char *lpSrc = NULL;
  133. char *lpDst = NULL;
  134. size_t cbSrc = 0;
  135. size_t cbDst = 0;
  136. size_t err;
  137. lpSrc = lpFrom;
  138. cbSrc = cbFrom;
  139. while(cbSrc) {
  140. lpDst = buf;
  141. cbDst = sizeof(buf);
  142. err = iconv(m_cd, ICONV_HACK(&lpSrc), &cbSrc, &lpDst, &cbDst);
  143. if (err != static_cast<size_t>(-1) || cbDst != sizeof(buf)) {
  144. // buf now contains converted chars, append them to output
  145. append(buf, sizeof(buf) - cbDst);
  146. continue;
  147. }
  148. if (m_bHTML) {
  149. if(cbSrc < sizeof(wchar_t)) {
  150. // Do what //IGNORE would have done
  151. ++lpSrc;
  152. --cbSrc;
  153. continue;
  154. }
  155. // Convert the codepoint to '&#12345;'
  156. std::wstring wstrEntity = L"&#";
  157. size_t cbEntity;
  158. wchar_t code;
  159. const char *lpEntity;
  160. memcpy(&code, lpSrc, sizeof(code));
  161. wstrEntity += std::to_wstring(code);
  162. wstrEntity += L";";
  163. cbEntity = wstrEntity.size() * sizeof(wchar_t);
  164. lpEntity = (const char *)wstrEntity.c_str();
  165. // Since we don't know in what charset we are outputting, we have to send
  166. // the entity through iconv so that it can convert it to the target charset.
  167. err = iconv(m_cd, ICONV_HACK(&lpEntity), &cbEntity, &lpDst, &cbDst);
  168. if (err == static_cast<size_t>(-1))
  169. assert(false); // This will should never fail
  170. lpSrc += sizeof(wchar_t);
  171. cbSrc -= sizeof(wchar_t);
  172. } else if (m_bForce) {
  173. // Force conversion by skipping this character
  174. if (cbSrc) {
  175. ++lpSrc;
  176. --cbSrc;
  177. }
  178. } else {
  179. throw illegal_sequence_exception(strerror(errno));
  180. }
  181. // buf now contains converted chars, append them to output
  182. append(buf, sizeof(buf) - cbDst);
  183. }
  184. // Finalize (needed for stateful conversion)
  185. lpDst = buf;
  186. cbDst = sizeof(buf);
  187. err = iconv(m_cd, NULL, NULL, &lpDst, &cbDst);
  188. append(buf, sizeof(buf) - cbDst);
  189. }
  190. } // namespace details
  191. convert_context::~convert_context()
  192. {
  193. for (auto &ictx : m_contexts)
  194. delete ictx.second;
  195. for (auto &icode : m_codes)
  196. delete[] icode;
  197. }
  198. void convert_context::persist_code(context_key &key, unsigned flags)
  199. {
  200. if (flags & pfToCode) {
  201. code_set::const_iterator iCode = m_codes.find(key.tocode);
  202. if (iCode == m_codes.cend()) {
  203. auto tocode = new char[strlen(key.tocode)+1];
  204. memcpy(tocode, key.tocode, strlen(key.tocode) + 1);
  205. iCode = m_codes.insert(tocode).first;
  206. }
  207. key.tocode = *iCode;
  208. }
  209. if (flags & pfFromCode) {
  210. code_set::const_iterator iCode = m_codes.find(key.fromcode);
  211. if (iCode == m_codes.cend()) {
  212. auto fromcode = new char[strlen(key.fromcode)+1];
  213. memcpy(fromcode, key.fromcode, strlen(key.fromcode) + 1);
  214. iCode = m_codes.insert(fromcode).first;
  215. }
  216. key.fromcode = *iCode;
  217. }
  218. }
  219. char* convert_context::persist_string(const std::string &strValue)
  220. {
  221. m_lstStrings.push_back(strValue);
  222. return const_cast<char*>(m_lstStrings.back().c_str());
  223. }
  224. wchar_t* convert_context::persist_string(const std::wstring &wstrValue)
  225. {
  226. m_lstWstrings.push_back(wstrValue);
  227. return const_cast<wchar_t*>(m_lstWstrings.back().c_str());
  228. }
  229. } /* namespace */