rtfutil.cpp 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298
  1. /*
  2. * Copyright 2005 - 2016 Zarafa and its licensors
  3. *
  4. * This program is free software: you can redistribute it and/or modify
  5. * it under the terms of the GNU Affero General Public License, version 3,
  6. * as published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU Affero General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Affero General Public License
  14. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. *
  16. */
  17. // From http://www.wischik.com/lu/programmer/mapi_utils.html
  18. // Parts rewritten by Zarafa
#include <kopano/platform.h>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
#include <kopano/codepage.h>
#include <kopano/CommonUtil.h>
#include <kopano/Util.h>
#include <kopano/charset/convert.h>
#include <kopano/stringutil.h>
#include "HtmlEntity.h"
#include "rtfutil.h"
  30. using namespace std;
  31. namespace KC {
  32. static const char szHex[] = "0123456789ABCDEF";
  33. // Charsets used in \fcharsetXXX (from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/dnrtfspec/html/rtfspec_6.asp )
  34. // charset "" is the ANSI codepage specified in \ansicpg
  35. // charset NULL means 'no conversion', ie direct 1-to-1 translation to UNICODE
  36. static const struct _rtfcharset {
  37. int id;
  38. const char *charset;
  39. } RTFCHARSET[] = {
  40. {0, ""}, // This is actually the codepage specified in \ansicpg
  41. {1, ""}, // default charset, probably also the codepage in \ansicpg
  42. {2, ""}, // This is SYMBOL, but in practice, we can just send the data
  43. // one-on-one down the line, as the actual character codes
  44. // don't change when converted to unicode (because the other
  45. // side will also be using the MS Symbol font to display)
  46. {3, NULL}, // 'Invalid'
  47. {77, "MAC"}, // Unsure if this is the correct charset
  48. {128,"SJIS"}, // OR cp 932 ?
  49. {129,"euc-kr"}, // 'Hangul' korean
  50. {130,"JOHAB"},
  51. {134,"GB2312"},
  52. {136,"BIG5"},
  53. {161,"windows-1253"},
  54. {162,"windows-1254"}, // 'Turkish'
  55. {163,"windows-1258"}, // Vietnamese
  56. {177,"windows-1255"}, // Hebrew
  57. {178,"windows-1256"}, // Arabic
  58. {179,"windows-1256"}, // Arabic traditional
  59. {180,"windows-1256"}, // Arabic user
  60. {181,"windows-1255"}, // Hebrew user
  61. {186,"windows-1257"},
  62. {204,"windows-1251"}, // Cyrillic for russian
  63. {222,NULL}, // Thai ?
  64. {238,"windows-1250"}, // Eastern European
  65. {254,"IBM437"},
  66. {255,NULL} // OEM
  67. };
  68. struct RTFSTATE {
  69. int ulFont;
  70. const char *szCharset;
  71. bool bInFontTbl;
  72. bool bInColorTbl;
  73. bool bInSkipTbl;
  74. std::string output; // text in current szCharset
  75. bool bRTFOnly;
  76. int ulUnicodeSkip; // number of characters to skip after a unicode character
  77. int ulSkipChars;
  78. };
  79. #define RTF_MAXSTATE 256
  80. #define RTF_MAXCMD 64
  81. typedef map<int,int> fontmap_t;
  82. /**
  83. * Converts RTF \ansicpgN <N> number to normal charset string.
  84. *
  85. * @param[in] id RTF codepage number
  86. * @param[out] lpszCharset static charset string
  87. * @retval MAPI_E_NOT_FOUND if id was unknown
  88. */
  89. static HRESULT HrGetCharsetByRTFID(int id, const char **lpszCharset)
  90. {
  91. for (size_t i = 0; i < ARRAY_SIZE(RTFCHARSET); ++i) {
  92. if(RTFCHARSET[i].id == id) {
  93. *lpszCharset = RTFCHARSET[i].charset;
  94. return hrSuccess;
  95. }
  96. }
  97. return MAPI_E_NOT_FOUND;
  98. }
  99. /** RTF ignore Commando's
  100. *
  101. * @param[in] lpCommand RTF command string, without leading \
  102. * @return bool
  103. */
  104. static bool isRTFIgnoreCommand(const char *lpCommand)
  105. {
  106. if(lpCommand == NULL)
  107. return false;
  108. if (strcmp(lpCommand,"stylesheet") == 0 ||
  109. strcmp(lpCommand,"revtbl") == 0 ||
  110. strcmp(lpCommand,"xmlnstbl") == 0 ||
  111. strcmp(lpCommand,"rsidtbl") == 0 ||
  112. strcmp(lpCommand,"fldinst") == 0 ||
  113. strcmp(lpCommand,"shpinst") == 0 ||
  114. strcmp(lpCommand,"wgrffmtfilter") == 0 ||
  115. strcmp(lpCommand,"pnseclvl") == 0 ||
  116. strcmp(lpCommand,"atrfstart") == 0 ||
  117. strcmp(lpCommand,"atrfend") == 0 ||
  118. strcmp(lpCommand,"atnauthor") == 0 ||
  119. strcmp(lpCommand,"annotation") == 0 ||
  120. strcmp(lpCommand,"sp") == 0 ||
  121. strcmp(lpCommand,"atnid") == 0 ||
  122. strcmp(lpCommand,"xmlopen") == 0
  123. //strcmp(lpCommand,"fldrslt") == 0
  124. )
  125. return true;
  126. return false;
  127. }
  128. /**
  129. * Initializes an RTFState struct to default values.
  130. *
  131. * @param[in/out] sState pointer to RTFState struct to init
  132. */
  133. static void InitRTFState(RTFSTATE *sState)
  134. {
  135. sState->bInSkipTbl = false;
  136. sState->bInFontTbl = false;
  137. sState->bInColorTbl = false;
  138. sState->szCharset = "us-ascii";
  139. sState->bRTFOnly = false;
  140. sState->ulFont = 0;
  141. sState->ulUnicodeSkip = 1;
  142. sState->ulSkipChars = 0;
  143. }
  144. static std::wstring RTFFlushStateOutput(convert_context &convertContext,
  145. RTFSTATE *sState, ULONG ulState)
  146. {
  147. std::wstring wstrUnicode;
  148. if (!sState[ulState].output.empty()) {
  149. TryConvert(convertContext, sState[ulState].output, rawsize(sState[ulState].output), sState[ulState].szCharset, wstrUnicode);
  150. sState[ulState].output.clear();
  151. }
  152. return wstrUnicode;
  153. }
  154. /**
  155. * Converts RTF text into HTML text. It will return an HTML string in
  156. * the given codepage.
  157. *
  158. * To convert between the RTF text and HTML codepage text, we use a
  159. * WCHAR string as intermediate.
  160. *
  161. * @param[in] lpStrRTFIn RTF input string that contains \fromtext
  162. * @param[out] lpStrHTMLOut HTML output in requested ulCodepage
  163. * @param[out] ulCodepage codepage for HTML output
  164. */
  165. HRESULT HrExtractHTMLFromRTF(const std::string &rtf_unfilt,
  166. std::string &lpStrHTMLOut, ULONG ulCodepage)
  167. {
  168. HRESULT hr;
  169. auto lpStrRTFIn = string_strip_nuls(rtf_unfilt);
  170. const char *szInput = lpStrRTFIn.c_str();
  171. const char *szANSICharset = "us-ascii";
  172. const char *szHTMLCharset;
  173. std::string strConvertCharset;
  174. std::wstring strOutput;
  175. int ulState = 0;
  176. RTFSTATE sState[RTF_MAXSTATE];
  177. fontmap_t mapFontToCharset;
  178. convert_context convertContext;
  179. // Find \\htmltag, if there is none we can't extract HTML
  180. if (strstr(szInput, "{\\*\\htmltag") == NULL)
  181. return MAPI_E_NOT_FOUND;
  182. // select output charset
  183. hr = HrGetCharsetByCP(ulCodepage, &szHTMLCharset);
  184. if (hr != hrSuccess) {
  185. szHTMLCharset = "us-ascii";
  186. hr = hrSuccess;
  187. }
  188. strConvertCharset = szHTMLCharset + string("//HTMLENTITIES");
  189. InitRTFState(&sState[0]);
  190. while(*szInput) {
  191. if(strncmp(szInput,"\\*",2) == 0) {
  192. szInput+=2;
  193. } else if(*szInput == '\\') {
  194. // Command
  195. char szCommand[RTF_MAXCMD];
  196. char *szCmdOutput;
  197. int lArg = -1;
  198. bool bNeg = false;
  199. ++szInput;
  200. if(isalpha(*szInput)) {
  201. szCmdOutput = szCommand;
  202. while (isalpha(*szInput) && szCmdOutput < szCommand + RTF_MAXCMD - 1)
  203. *szCmdOutput++ = *szInput++;
  204. *szCmdOutput = 0;
  205. if(*szInput == '-') {
  206. bNeg = true;
  207. ++szInput;
  208. }
  209. if(isdigit(*szInput)) {
  210. lArg = 0;
  211. while (isdigit(*szInput)) {
  212. lArg = lArg * 10 + *szInput - '0';
  213. ++szInput;
  214. }
  215. if(bNeg) lArg = -lArg;
  216. }
  217. if(*szInput == ' ')
  218. ++szInput;
  219. // szCommand is the command, lArg is the argument.
  220. if(strcmp(szCommand,"fonttbl") == 0) {
  221. sState[ulState].bInFontTbl = true;
  222. } else if(strcmp(szCommand,"colortbl") == 0) {
  223. sState[ulState].bInColorTbl = true;
  224. } else if(strcmp(szCommand,"pntext") == 0) { // pntext is the plaintext alternative, ignore it.
  225. sState[ulState].bRTFOnly = true;
  226. } else if(strcmp(szCommand,"ansicpg") == 0) {
  227. if(HrGetCharsetByCP(lArg, &szANSICharset) != hrSuccess)
  228. szANSICharset = "us-ascii";
  229. sState[ulState].szCharset = szANSICharset;
  230. } else if(strcmp(szCommand,"fcharset") == 0) {
  231. if(sState[ulState].bInFontTbl) {
  232. mapFontToCharset.insert(pair<int, int>(sState[ulState].ulFont, lArg));
  233. }
  234. } else if(strcmp(szCommand,"htmltag") == 0) {
  235. } else if(strcmp(szCommand,"mhtmltag") == 0) {
  236. } else if (strcmp(szCommand,"pard") == 0) {
  237. } else if (strcmp(szCommand,"par") == 0) {
  238. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl) {
  239. sState[ulState].output.append(1,'\r');
  240. sState[ulState].output.append(1,'\n');
  241. }
  242. } else if(strcmp(szCommand,"tab") == 0) {
  243. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl) {
  244. sState[ulState].output.append(1,' ');
  245. sState[ulState].output.append(1,' ');
  246. sState[ulState].output.append(1,' ');
  247. }
  248. } else if (strcmp(szCommand,"uc") == 0) {
  249. sState[ulState].ulUnicodeSkip = lArg;
  250. } else if(strcmp(szCommand,"f") == 0) {
  251. sState[ulState].ulFont = lArg;
  252. if(!sState[ulState].bInFontTbl) {
  253. fontmap_t::const_iterator i = mapFontToCharset.find(lArg);
  254. if (i == mapFontToCharset.cend())
  255. continue;
  256. // Output any data before this point
  257. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  258. // Set new charset
  259. HrGetCharsetByRTFID(i->second, &sState[ulState].szCharset);
  260. if(sState[ulState].szCharset == NULL) {
  261. sState[ulState].szCharset = "us-ascii";
  262. } else if(sState[ulState].szCharset[0] == 0) {
  263. sState[ulState].szCharset = szANSICharset;
  264. }
  265. }
  266. // ignore error
  267. }
  268. else if (strcmp(szCommand,"u") == 0) {
  269. // unicode character, in signed short WCHAR
  270. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  271. if (!sState[ulState].bRTFOnly)
  272. strOutput.append(1, (unsigned short)lArg); // add as literal character
  273. sState[ulState].ulSkipChars += sState[ulState].ulUnicodeSkip;
  274. }
  275. else if(strcmp(szCommand,"htmlrtf") == 0) {
  276. sState[ulState].bRTFOnly = lArg != 0;
  277. }else if(isRTFIgnoreCommand(szCommand)) {
  278. sState[ulState].bInSkipTbl = true;
  279. }
  280. }
  281. // Non-alnum after '\'
  282. else if(*szInput == '\\') {
  283. ++szInput;
  284. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl)
  285. sState[ulState].output.append(1,'\\');
  286. }
  287. else if(*szInput == '{') {
  288. ++szInput;
  289. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl)
  290. sState[ulState].output.append(1,'{');
  291. }
  292. else if(*szInput == '}') {
  293. ++szInput;
  294. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl)
  295. sState[ulState].output.append(1,'}');
  296. }
  297. else if(*szInput == '\'') {
  298. unsigned int ulChar;
  299. while(*szInput == '\'')
  300. {
  301. ulChar = 0;
  302. ++szInput;
  303. if(*szInput) {
  304. ulChar = (unsigned int) (strchr(szHex, toupper(*szInput)) == NULL ? 0 : (strchr(szHex, toupper(*szInput)) - szHex));
  305. ulChar = ulChar << 4;
  306. ++szInput;
  307. }
  308. if(*szInput) {
  309. ulChar += (unsigned int) (strchr(szHex, toupper(*szInput)) == NULL ? 0 : (strchr(szHex, toupper(*szInput)) - szHex));
  310. ++szInput;
  311. }
  312. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl && !sState[ulState].ulSkipChars) {
  313. sState[ulState].output.append(1,ulChar);
  314. } else if (sState[ulState].ulSkipChars)
  315. --sState[ulState].ulSkipChars;
  316. if(*szInput == '\\' && *(szInput+1) == '\'')
  317. ++szInput;
  318. else
  319. break;
  320. }
  321. } else {
  322. ++szInput; // skip single character after '\'
  323. }
  324. } // Non-command
  325. else if(*szInput == '{') {
  326. // Dump output data
  327. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  328. ++ulState;
  329. if (ulState >= RTF_MAXSTATE)
  330. return MAPI_E_NOT_ENOUGH_MEMORY;
  331. sState[ulState] = sState[ulState-1];
  332. sState[ulState].output.clear();
  333. ++szInput;
  334. } else if(*szInput == '}') {
  335. // Dump output data
  336. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  337. if(ulState > 0)
  338. --ulState;
  339. ++szInput;
  340. } else if(*szInput == '\r' || *szInput == '\n') {
  341. ++szInput;
  342. } else {
  343. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl && !sState[ulState].ulSkipChars) {
  344. sState[ulState].output.append(1,*szInput);
  345. } else if (sState[ulState].ulSkipChars)
  346. --sState[ulState].ulSkipChars;
  347. ++szInput;
  348. }
  349. }
  350. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  351. try {
  352. lpStrHTMLOut = convertContext.convert_to<string>(strConvertCharset.c_str(), strOutput, rawsize(strOutput), CHARSET_WCHAR);
  353. } catch (const convert_exception &ce) {
  354. hr = details::HrFromException(ce);
  355. }
  356. return hr;
  357. }
  358. /**
  359. * Extracts the Plain text that was encapsulated in an RTF text, and
  360. * writes out HTML. It will return an HTML string in the given
  361. * codepage.
  362. *
  363. * To convert between the RTF text and HTML codepage text, we use a
  364. * WCHAR string as intermediate.
  365. *
  366. * @param[in] lpStrRTFIn RTF input string that contains \fromtext
  367. * @param[out] lpStrHTMLOut HTML output in requested ulCodepage
  368. * @param[out] ulCodepage codepage for HTML output
  369. */
  370. HRESULT HrExtractHTMLFromTextRTF(const std::string &rtf_unfilt,
  371. std::string &lpStrHTMLOut, ULONG ulCodepage)
  372. {
  373. HRESULT hr;
  374. auto lpStrRTFIn = string_strip_nuls(rtf_unfilt);
  375. std::wstring wstrUnicodeTmp;
  376. const char *szInput = lpStrRTFIn.c_str();
  377. const char *szANSICharset = "us-ascii";
  378. const char *szHTMLCharset;
  379. std::string strConvertCharset;
  380. std::wstring strOutput;
  381. int ulState = 0;
  382. bool bPar = false;
  383. int nLineChar=0;
  384. RTFSTATE sState[RTF_MAXSTATE];
  385. fontmap_t mapFontToCharset;
  386. convert_context convertContext;
  387. string tmp;
  388. // select output charset
  389. hr = HrGetCharsetByCP(ulCodepage, &szHTMLCharset);
  390. if (hr != hrSuccess) {
  391. szHTMLCharset = "us-ascii";
  392. hr = hrSuccess;
  393. }
  394. strConvertCharset = szHTMLCharset + string("//HTMLENTITIES");
  395. tmp = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2//EN\">\r\n" \
  396. "<HTML>\r\n" \
  397. "<HEAD>\r\n" \
  398. "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=";
  399. tmp += szHTMLCharset;
  400. tmp += "\">\r\n" \
  401. "<META NAME=\"Generator\" CONTENT=\"Kopano HrExtractHTMLFromTextRTF\">\r\n" \
  402. "<TITLE></TITLE>\r\n" \
  403. "</HEAD>\r\n" \
  404. "<BODY>\r\n" \
  405. "<!-- Converted from text/plain format -->\r\n" \
  406. "\r\n"; //FIXME create title on the fly ?
  407. wstrUnicodeTmp.resize(0,0);
  408. TryConvert(convertContext, tmp, rawsize(tmp), "us-ascii", wstrUnicodeTmp);
  409. strOutput.append(wstrUnicodeTmp);
  410. InitRTFState(&sState[0]);
  411. while(*szInput) {
  412. if(strncmp(szInput,"\\*",2) == 0) {
  413. szInput+=2;
  414. } else if(*szInput == '\\') {
  415. // Command
  416. char szCommand[RTF_MAXCMD];
  417. char *szCmdOutput;
  418. int lArg = -1;
  419. bool bNeg = false;
  420. ++szInput;
  421. if(isalpha(*szInput)) {
  422. szCmdOutput = szCommand;
  423. while (isalpha(*szInput) && szCmdOutput < szCommand + RTF_MAXCMD - 1)
  424. *szCmdOutput++ = *szInput++;
  425. *szCmdOutput = 0;
  426. if(*szInput == '-') {
  427. bNeg = true;
  428. ++szInput;
  429. }
  430. if(isdigit(*szInput)) {
  431. lArg = 0;
  432. while (isdigit(*szInput)) {
  433. lArg = lArg * 10 + *szInput - '0';
  434. ++szInput;
  435. }
  436. if(bNeg) lArg = -lArg;
  437. }
  438. if(*szInput == ' ')
  439. ++szInput;
  440. // szCommand is the command, lArg is the argument.
  441. if(strcmp(szCommand,"fonttbl") == 0) {
  442. sState[ulState].bInFontTbl = true;
  443. } else if(strcmp(szCommand,"colortbl") == 0) {
  444. sState[ulState].bInColorTbl = true;
  445. } else if(strcmp(szCommand,"pntext") == 0) { // pntext is the plaintext alternative, ignore it.
  446. sState[ulState].bRTFOnly = true;
  447. } else if(strcmp(szCommand,"ansicpg") == 0) {
  448. if(HrGetCharsetByCP(lArg, &szANSICharset) != hrSuccess)
  449. szANSICharset = "us-ascii";
  450. sState[ulState].szCharset = szANSICharset;
  451. } else if(strcmp(szCommand,"fcharset") == 0) {
  452. if(sState[ulState].bInFontTbl) {
  453. mapFontToCharset.insert(pair<int, int>(sState[ulState].ulFont, lArg));
  454. }
  455. } else if(strcmp(szCommand,"htmltag") == 0) {
  456. } else if(strcmp(szCommand,"mhtmltag") == 0) {
  457. } else if (strcmp(szCommand, "line") == 0) {
  458. sState[ulState].output.append("<br>\r\n");
  459. } else if (strcmp(szCommand, "par") == 0 &&
  460. !sState[ulState].bInFontTbl &&
  461. !sState[ulState].bRTFOnly &&
  462. !sState[ulState].bInColorTbl &&
  463. !sState[ulState].bInSkipTbl &&
  464. bPar) {
  465. sState[ulState].output.append("</P>\r\n\r\n");
  466. bPar = false;
  467. nLineChar = 0;
  468. } else if(strcmp(szCommand,"tab") == 0) {
  469. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl) {
  470. sState[ulState].output.append(1,' ');
  471. sState[ulState].output.append(1,' ');
  472. sState[ulState].output.append(1,' ');
  473. }
  474. } else if (strcmp(szCommand,"uc") == 0) {
  475. sState[ulState].ulUnicodeSkip = lArg;
  476. } else if(strcmp(szCommand,"f") == 0) {
  477. sState[ulState].ulFont = lArg;
  478. if(!sState[ulState].bInFontTbl) {
  479. fontmap_t::const_iterator i = mapFontToCharset.find(lArg);
  480. if (i == mapFontToCharset.cend())
  481. continue;
  482. // Output any data before this point
  483. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  484. // Set new charset
  485. HrGetCharsetByRTFID(i->second, &sState[ulState].szCharset);
  486. if(sState[ulState].szCharset == NULL) {
  487. sState[ulState].szCharset = "us-ascii";
  488. } else if(sState[ulState].szCharset[0] == 0) {
  489. sState[ulState].szCharset = szANSICharset;
  490. }
  491. }
  492. // ignore error
  493. }
  494. else if (strcmp(szCommand,"u") == 0) {
  495. if (!bPar) {
  496. sState[ulState].output.append("<p>");
  497. bPar = true;
  498. }
  499. // unicode character, in signed short WCHAR
  500. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  501. if (!sState[ulState].bRTFOnly)
  502. strOutput.append(1, (unsigned short)lArg); // add as literal character
  503. sState[ulState].ulSkipChars += sState[ulState].ulUnicodeSkip;
  504. }
  505. else if(strcmp(szCommand,"htmlrtf") == 0) {
  506. sState[ulState].bRTFOnly = lArg != 0;
  507. } else if(isRTFIgnoreCommand(szCommand)) {
  508. sState[ulState].bInSkipTbl = true;
  509. }
  510. }
  511. // Non-alnum after '\'
  512. else if(*szInput == '\\') {
  513. ++szInput;
  514. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl)
  515. sState[ulState].output.append(1,'\\');
  516. }
  517. else if(*szInput == '{') {
  518. ++szInput;
  519. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl)
  520. sState[ulState].output.append(1,'{');
  521. }
  522. else if(*szInput == '}') {
  523. ++szInput;
  524. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl)
  525. sState[ulState].output.append(1,'}');
  526. }
  527. else if(*szInput == '\'') {
  528. unsigned int ulChar;
  529. if (!bPar) {
  530. sState[ulState].output.append("<p>");
  531. bPar = true;
  532. }
  533. // Dump output data until now, if we're switching charsets
  534. if(szANSICharset == NULL || strcmp(sState[ulState].szCharset, szANSICharset) != 0) {
  535. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  536. }
  537. while(*szInput == '\'')
  538. {
  539. ulChar = 0;
  540. ++szInput;
  541. if(*szInput) {
  542. ulChar = (unsigned int) (strchr(szHex, toupper(*szInput)) == NULL ? 0 : (strchr(szHex, toupper(*szInput)) - szHex));
  543. ulChar = ulChar << 4;
  544. ++szInput;
  545. }
  546. if(*szInput) {
  547. ulChar += (unsigned int) (strchr(szHex, toupper(*szInput)) == NULL ? 0 : (strchr(szHex, toupper(*szInput)) - szHex));
  548. ++szInput;
  549. }
  550. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].ulSkipChars) {
  551. sState[ulState].output.append(1,ulChar);
  552. } else if (sState[ulState].ulSkipChars)
  553. --sState[ulState].ulSkipChars;
  554. if(*szInput == '\\' && *(szInput+1) == '\'')
  555. ++szInput;
  556. else
  557. break;
  558. }
  559. // Dump escaped data in charset 0 (ansicpg), if we had to switch charsets
  560. if(szANSICharset == NULL || strcmp(sState[ulState].szCharset, szANSICharset) != 0) {
  561. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  562. }
  563. } else {
  564. ++szInput; // skip single character after '\'
  565. }
  566. } // Non-command
  567. else if(*szInput == '{') {
  568. // Dump output data
  569. if (!sState[ulState].output.empty()) {
  570. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  571. }
  572. ++ulState;
  573. if (ulState >= RTF_MAXSTATE)
  574. return MAPI_E_NOT_ENOUGH_MEMORY;
  575. sState[ulState] = sState[ulState-1];
  576. ++szInput;
  577. } else if(*szInput == '}') {
  578. // Dump output data
  579. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  580. if(ulState > 0)
  581. --ulState;
  582. ++szInput;
  583. } else if (*szInput == '\r' || *szInput == '\n') {
  584. ++szInput;
  585. } else if (!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl && !sState[ulState].ulSkipChars) {
  586. if (bPar == false) {
  587. sState[ulState].output.append("<P>");
  588. bPar = true;
  589. }
  590. // Change space to &nbsp; . The last space is a real space like "&nbsp;&nbsp; " or " "
  591. if (*szInput == ' ') {
  592. ++szInput;
  593. while (*szInput == ' ') {
  594. sState[ulState].output.append("&nbsp;");
  595. ++szInput;
  596. }
  597. sState[ulState].output.append(1, ' ');
  598. } else {
  599. std::wstring entity;
  600. if (!CHtmlEntity::CharToHtmlEntity((WCHAR)*szInput, entity))
  601. sState[ulState].output.append(1, *szInput);
  602. else
  603. sState[ulState].output.append(entity.begin(), entity.end());
  604. ++szInput;
  605. }
  606. ++nLineChar;
  607. } else {
  608. if (sState[ulState].ulSkipChars)
  609. --sState[ulState].ulSkipChars;
  610. ++szInput;
  611. }
  612. }
  613. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  614. if (bPar)
  615. strOutput += L"</p>\r\n";
  616. strOutput += L"\r\n" \
  617. L"</BODY>\r\n" \
  618. L"</HTML>\r\n";
  619. try {
  620. lpStrHTMLOut = convertContext.convert_to<string>(strConvertCharset.c_str(), strOutput, rawsize(strOutput), CHARSET_WCHAR);
  621. } catch (const convert_exception &ce) {
  622. hr = details::HrFromException(ce);
  623. }
  624. return hr;
  625. }
  626. /**
  627. * Extracts the HTML text that was encapsulated in an RTF text. It
  628. * will return an HTML string in the given codepage.
  629. *
  630. * To convert between the RTF text and HTML codepage text, we use a
  631. * WCHAR string as intermediate.
  632. *
  633. * @param[in] lpStrRTFIn RTF input string that contains \fromhtml
  634. * @param[out] lpStrHTMLOut HTML output in requested ulCodepage
  635. * @param[out] ulCodepage codepage for HTML output
  636. *
  637. * @todo Export the right HTML tags, now only plain stuff
  638. */
  639. HRESULT HrExtractHTMLFromRealRTF(const std::string &rtf_unfilt,
  640. std::string &lpStrHTMLOut, ULONG ulCodepage)
  641. {
  642. HRESULT hr;
  643. auto lpStrRTFIn = string_strip_nuls(rtf_unfilt);
  644. std::wstring wstrUnicodeTmp;
  645. const char *szInput = lpStrRTFIn.c_str();
  646. const char *szANSICharset = "us-ascii";
  647. const char *szHTMLCharset;
  648. std::string strConvertCharset;
  649. std::wstring strOutput;
  650. int ulState = 0;
  651. RTFSTATE sState[RTF_MAXSTATE];
  652. convert_context convertContext;
  653. string tmp;
  654. fontmap_t mapFontToCharset;
  655. bool bPar = false;
  656. // select output charset
  657. hr = HrGetCharsetByCP(ulCodepage, &szHTMLCharset);
  658. if (hr != hrSuccess) {
  659. szHTMLCharset = "us-ascii";
  660. hr = hrSuccess;
  661. }
  662. strConvertCharset = szHTMLCharset + string("//HTMLENTITIES");
  663. tmp = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2//EN\">\r\n" \
  664. "<HTML>\r\n" \
  665. "<HEAD>\r\n" \
  666. "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=";
  667. tmp += szHTMLCharset;
  668. tmp += "\">\r\n" \
  669. "<META NAME=\"Generator\" CONTENT=\"Kopano HrExtractHTMLFromRealRTF\">\r\n" \
  670. "<TITLE></TITLE>\r\n" \
  671. "</HEAD>\r\n" \
  672. "<BODY>\r\n" \
  673. "<!-- Converted from text/rtf format -->\r\n" \
  674. "\r\n"; //FIXME create title on the fly ?
  675. TryConvert(convertContext, tmp, rawsize(tmp), "us-ascii", wstrUnicodeTmp);
  676. strOutput.append(wstrUnicodeTmp);
  677. InitRTFState(&sState[0]);
  678. while(*szInput) {
  679. if(strncmp(szInput,"\\*",2) == 0) {
  680. szInput+=2;
  681. } else if(*szInput == '\\') {
  682. // Command
  683. char szCommand[RTF_MAXCMD];
  684. char *szCmdOutput;
  685. int lArg = -1;
  686. bool bNeg = false;
  687. ++szInput;
  688. if(isalpha(*szInput)) {
  689. szCmdOutput = szCommand;
  690. while (isalpha(*szInput) && szCmdOutput < szCommand + RTF_MAXCMD - 1)
  691. *szCmdOutput++ = *szInput++;
  692. *szCmdOutput = 0;
  693. if(*szInput == '-') {
  694. bNeg = true;
  695. ++szInput;
  696. }
  697. if(isdigit(*szInput)) {
  698. lArg = 0;
  699. while (isdigit(*szInput)) {
  700. lArg = lArg * 10 + *szInput - '0';
  701. ++szInput;
  702. }
  703. if(bNeg) lArg = -lArg;
  704. }
  705. if(*szInput == ' ')
  706. ++szInput;
  707. // szCommand is the command, lArg is the argument.
  708. if(strcmp(szCommand,"fonttbl") == 0) {
  709. sState[ulState].bInFontTbl = true;
  710. } else if(strcmp(szCommand,"colortbl") == 0) {
  711. sState[ulState].bInColorTbl = true;
  712. } else if(strcmp(szCommand,"listtable") == 0) {
  713. sState[ulState].bInSkipTbl = true;
  714. } else if(strcmp(szCommand,"pntext") == 0) { // pntext is the plaintext alternative, ignore it.
  715. sState[ulState].bRTFOnly = true;
  716. } else if(strcmp(szCommand,"ansicpg") == 0) {
  717. if(HrGetCharsetByCP(lArg, &szANSICharset) != hrSuccess)
  718. szANSICharset = "us-ascii";
  719. sState[ulState].szCharset = szANSICharset;
  720. } else if(strcmp(szCommand,"fcharset") == 0) {
  721. if(sState[ulState].bInFontTbl) {
  722. mapFontToCharset.insert(pair<int, int>(sState[ulState].ulFont, lArg));
  723. }
  724. } else if(strcmp(szCommand,"htmltag") == 0) {
  725. } else if(strcmp(szCommand,"latentstyles") == 0) {
  726. sState[ulState].bRTFOnly = true;
  727. } else if(strcmp(szCommand,"datastore") == 0) {
  728. sState[ulState].bRTFOnly = true;
  729. } else if(strcmp(szCommand,"mhtmltag") == 0) {
  730. } else if (strcmp(szCommand, "line") == 0) {
  731. sState[ulState].output.append("<br>\r\n");
  732. } else if (strcmp(szCommand,"par") == 0) {
  733. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl) {
  734. if (bPar)
  735. sState[ulState].output.append("</p>\r\n\r\n");
  736. sState[ulState].output.append("<p>");
  737. bPar = true;
  738. }
  739. } else if(strcmp(szCommand,"tab") == 0) {
  740. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl) {
  741. sState[ulState].output.append(1,' ');
  742. sState[ulState].output.append(1,' ');
  743. sState[ulState].output.append(1,' ');
  744. }
  745. } else if (strcmp(szCommand,"bin") == 0) {
  746. if (lArg > 0)
  747. szInput += lArg; // skip all binary bytes here.
  748. } else if (strcmp(szCommand,"uc") == 0) {
  749. sState[ulState].ulUnicodeSkip = lArg;
  750. } else if(strcmp(szCommand,"f") == 0) {
  751. sState[ulState].ulFont = lArg;
  752. if(!sState[ulState].bInFontTbl) {
  753. fontmap_t::const_iterator i = mapFontToCharset.find(lArg);
  754. if (i == mapFontToCharset.cend())
  755. continue;
  756. // Output any data before this point
  757. if (!sState[ulState].output.empty()) {
  758. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  759. }
  760. // Set new charset
  761. HrGetCharsetByRTFID(i->second, &sState[ulState].szCharset);
  762. if(sState[ulState].szCharset == NULL) {
  763. sState[ulState].szCharset = "us-ascii";
  764. } else if(sState[ulState].szCharset[0] == 0) {
  765. sState[ulState].szCharset = szANSICharset;
  766. }
  767. }
  768. // ignore error
  769. }
  770. else if (strcmp(szCommand,"u") == 0) {
  771. if (!bPar) {
  772. sState[ulState].output.append("<p>");
  773. bPar = true;
  774. }
  775. // unicode character, in signed short WCHAR
  776. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  777. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl) {
  778. std::wstring entity;
  779. if (! CHtmlEntity::CharToHtmlEntity((WCHAR)*szInput, entity))
  780. strOutput.append(1, (unsigned short)lArg); // add as literal character
  781. else
  782. strOutput.append(entity.begin(), entity.end());
  783. }
  784. sState[ulState].ulSkipChars += sState[ulState].ulUnicodeSkip;
  785. }
  786. else if(strcmp(szCommand,"htmlrtf") == 0) {
  787. sState[ulState].bRTFOnly = lArg != 0;
  788. }
  789. /*else if(strcmp(szCommand,"b") == 0) {
  790. if( lArg == -1)
  791. sState[ulState].output.append("<b>", 3);
  792. else
  793. sState[ulState].output.append("</b>", 4);
  794. }else if(strcmp(szCommand,"i") == 0) {
  795. if( lArg == -1)
  796. sState[ulState].output.append("<i>", 3);
  797. else
  798. sState[ulState].output.append("</i>", 4);
  799. }else if(strcmp(szCommand,"ul") == 0) {
  800. if( lArg == -1)
  801. sState[ulState].output.append("<u>", 3);
  802. else
  803. sState[ulState].output.append("</u>", 4);
  804. }else if(strcmp(szCommand,"ulnone") == 0) {
  805. sState[ulState].output.append("</u>", 4);
  806. }*/
  807. else if(strcmp(szCommand,"generator") == 0){
  808. while (*szInput != ';' && *szInput != '}' && *szInput)
  809. ++szInput;
  810. if(*szInput == ';')
  811. ++szInput;
  812. }
  813. else if(strcmp(szCommand,"bkmkstart") == 0 || strcmp(szCommand,"bkmkend") == 0){
  814. // skip bookmark name
  815. while (*szInput && isalnum(*szInput))
  816. ++szInput;
  817. sState[ulState].bInSkipTbl = true;
  818. } else if (strcmp(szCommand, "endash") == 0) {
  819. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  820. // windows-1252: 0x96, unicode 0x2013
  821. strOutput += 0x2013;
  822. } else if (strcmp(szCommand, "emdash") == 0) {
  823. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  824. // windows-1252: 0x97, unicode 0x2014
  825. strOutput += 0x2014;
  826. } else if (strcmp(szCommand, "lquote") == 0) {
  827. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  828. // windows-1252: 0x91, unicode 0x2018
  829. strOutput += 0x2018;
  830. } else if (strcmp(szCommand, "rquote") == 0) {
  831. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  832. // windows-1252: 0x92, unicode 0x2019
  833. strOutput += 0x2019;
  834. } else if (strcmp(szCommand, "ldblquote") == 0) {
  835. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  836. // windows-1252: 0x93, unicode 0x201C
  837. strOutput += 0x201C;
  838. } else if (strcmp(szCommand, "rdblquote") == 0) {
  839. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  840. // windows-1252: 0x94, unicode 0x201D
  841. strOutput += 0x201D;
  842. } else if (strcmp(szCommand, "bullet") == 0) {
  843. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  844. // windows-1252: 0x95, unicode 0x2022
  845. strOutput += 0x2022;
  846. } else if(isRTFIgnoreCommand(szCommand)) {
  847. sState[ulState].bInSkipTbl = true;
  848. }
  849. }
  850. // Non-alnum after '\'
  851. else if(*szInput == '\\') {
  852. ++szInput;
  853. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl)
  854. sState[ulState].output.append(1,'\\');
  855. }
  856. else if(*szInput == '{') {
  857. ++szInput;
  858. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl)
  859. sState[ulState].output.append(1,'{');
  860. }
  861. else if(*szInput == '}') {
  862. ++szInput;
  863. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl)
  864. sState[ulState].output.append(1,'}');
  865. }
  866. else if(*szInput == '\'') {
  867. unsigned int ulChar;
  868. std::wstring wstrUnicode;
  869. if (!bPar) {
  870. sState[ulState].output.append("<p>");
  871. bPar = true;
  872. }
  873. while(*szInput == '\'')
  874. {
  875. ulChar = 0;
  876. ++szInput;
  877. if(*szInput) {
  878. ulChar = (unsigned int) (strchr(szHex, toupper(*szInput)) == NULL ? 0 : (strchr(szHex, toupper(*szInput)) - szHex));
  879. ulChar = ulChar << 4;
  880. ++szInput;
  881. }
  882. if(*szInput) {
  883. ulChar += (unsigned int) (strchr(szHex, toupper(*szInput)) == NULL ? 0 : (strchr(szHex, toupper(*szInput)) - szHex));
  884. ++szInput;
  885. }
  886. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl && !sState[ulState].ulSkipChars) {
  887. sState[ulState].output.append(1,ulChar);
  888. } else if (sState[ulState].ulSkipChars)
  889. --sState[ulState].ulSkipChars;
  890. if(*szInput == '\\' && *(szInput+1) == '\'')
  891. ++szInput;
  892. else
  893. break;
  894. }
  895. } else {
  896. ++szInput; // skip single character after '\'
  897. }
  898. } // Non-command
  899. else if(*szInput == '{') {
  900. // Dump output data
  901. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  902. ++ulState;
  903. if (ulState >= RTF_MAXSTATE)
  904. return MAPI_E_NOT_ENOUGH_MEMORY;
  905. sState[ulState] = sState[ulState-1];
  906. ++szInput;
  907. } else if(*szInput == '}') {
  908. // Dump output data
  909. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  910. if(ulState > 0)
  911. --ulState;
  912. ++szInput;
  913. } else if(*szInput == '\r' || *szInput == '\n') {
  914. ++szInput;
  915. } else {
  916. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl && !sState[ulState].ulSkipChars) {
  917. if (!bPar) {
  918. sState[ulState].output.append("<p>");
  919. bPar = true;
  920. }
  921. // basic html escaping only
  922. if (*szInput == '&')
  923. sState[ulState].output.append("&amp;");
  924. else if (*szInput == '<')
  925. sState[ulState].output.append("&lt;");
  926. else if (*szInput == '>')
  927. sState[ulState].output.append("&gt;");
  928. else
  929. sState[ulState].output.append(1,*szInput);
  930. } else if (sState[ulState].ulSkipChars)
  931. --sState[ulState].ulSkipChars;
  932. ++szInput;
  933. }
  934. }
  935. strOutput += RTFFlushStateOutput(convertContext, sState, ulState);
  936. if (bPar)
  937. strOutput += L"</p>\r\n";
  938. strOutput += L"\r\n" \
  939. L"</BODY>\r\n" \
  940. L"</HTML>\r\n";
  941. try {
  942. lpStrHTMLOut = convertContext.convert_to<string>(strConvertCharset.c_str(), strOutput, rawsize(strOutput), CHARSET_WCHAR);
  943. } catch (const convert_exception &ce) {
  944. hr = details::HrFromException(ce);
  945. }
  946. return hr;
  947. }
  /**
   * Checks if input is HTML "wrapped" in RTF.
   *
   * The buffer is scanned for the first "\from" marker; the result says
   * whether that marker is "\fromhtml". RTF that encodes plain text
   * carries "\fromtext" instead, in which case false is returned.
   * Only positions with at least 9 bytes remaining are examined, so the
   * scan never reads past buf + len.
   *
   * @param[in]	buf	character buffer containing RTF text (may be NULL if len is 0)
   * @param[in]	len	length of input buffer in bytes
   * @return true if buf is html wrapped in rtf
   */
  bool isrtfhtml(const char *buf, unsigned int len)
  {
      static const char marker[] = "\\fromhtml";
      const unsigned int marker_len = sizeof(marker) - 1;

      // Guard first: "len - marker_len" is unsigned and would wrap around
      // for short buffers, turning the scan into an out-of-bounds read.
      if (buf == nullptr || len < marker_len)
          return false;

      // Inclusive upper bound, so a marker ending exactly at the end of the
      // buffer is still examined.
      const unsigned int last_start = len - marker_len;
      for (unsigned int i = 0; i <= last_start; ++i)
          if (strncmp(buf + i, "\\from", 5) == 0)
              return strncmp(buf + i, marker, marker_len) == 0;
      return false;
  }
  /**
   * Checks if input is Text "wrapped" in RTF.
   *
   * The buffer is scanned for the first "\from" marker; the result says
   * whether that marker is "\fromtext". RTF that encodes html rather
   * than text carries "\fromhtml" instead, in which case false is returned.
   * Only positions with at least 9 bytes remaining are examined, so the
   * scan never reads past buf + len.
   *
   * @param[in]	buf	character buffer containing RTF text (may be NULL if len is 0)
   * @param[in]	len	length of input buffer in bytes
   * @return true if buf is text wrapped in rtf
   */
  bool isrtftext(const char *buf, unsigned int len)
  {
      static const char marker[] = "\\fromtext";
      const unsigned int marker_len = sizeof(marker) - 1;

      // Guard first: "len - marker_len" is unsigned and would wrap around
      // for short buffers, turning the scan into an out-of-bounds read.
      if (buf == nullptr || len < marker_len)
          return false;

      // Inclusive upper bound, so a marker ending exactly at the end of the
      // buffer is still examined.
      const unsigned int last_start = len - marker_len;
      for (unsigned int i = 0; i <= last_start; ++i)
          if (strncmp(buf + i, "\\from", 5) == 0)
              return strncmp(buf + i, marker, marker_len) == 0;
      return false;
  }
  984. /**
  985. * Convert RTF, which should have \fromtext marker, to plain text format in WCHAR.
  986. *
  987. * @param[in] lpStrRTFIn string containing RTF with \fromtext marker
  988. * @param[out] strBodyOut the converted body
  989. * @return mapi error code
  990. * @retval MAPI_E_NOT_ENOUGH_MEMORY too many states in rtf, > 256
  991. */
  992. HRESULT HrExtractBODYFromTextRTF(const std::string &rtf_unfilt,
  993. std::wstring &strBodyOut)
  994. {
  995. auto lpStrRTFIn = string_strip_nuls(rtf_unfilt);
  996. const char *szInput = lpStrRTFIn.c_str();
  997. const char *szANSICharset = "us-ascii";
  998. int ulState = 0;
  999. RTFSTATE sState[RTF_MAXSTATE];
  1000. fontmap_t mapFontToCharset;
  1001. convert_context convertContext;
  1002. std::wstring strwAppend;
  1003. strBodyOut.resize(0,0);
  1004. InitRTFState(&sState[0]);
  1005. while(*szInput) {
  1006. if(*szInput == '\\') {
  1007. // Command
  1008. char szCommand[RTF_MAXCMD];
  1009. char *szCmdOutput;
  1010. int lArg = -1;
  1011. bool bNeg = false;
  1012. ++szInput;
  1013. if(isalpha(*szInput)) {
  1014. szCmdOutput = szCommand;
  1015. while (isalpha(*szInput) && szCmdOutput < szCommand + RTF_MAXCMD - 1)
  1016. *szCmdOutput++ = *szInput++;
  1017. *szCmdOutput = 0;
  1018. if(*szInput == '-') {
  1019. bNeg = true;
  1020. ++szInput;
  1021. }
  1022. if(isdigit(*szInput)) {
  1023. lArg = 0;
  1024. while (isdigit(*szInput)) {
  1025. lArg = lArg * 10 + *szInput - '0';
  1026. ++szInput;
  1027. }
  1028. if(bNeg) lArg = -lArg;
  1029. }
  1030. if(*szInput == ' ')
  1031. ++szInput;
  1032. // szCommand is the command, lArg is the argument.
  1033. if(strcmp(szCommand,"fonttbl") == 0) {
  1034. sState[ulState].bInFontTbl = true;
  1035. } else if(strcmp(szCommand,"colortbl") == 0) {
  1036. sState[ulState].bInColorTbl = true;
  1037. } else if(strcmp(szCommand,"pntext") == 0) { // pntext is the plaintext alternative, ignore it.
  1038. sState[ulState].bRTFOnly = true;
  1039. } else if(strcmp(szCommand,"ansicpg") == 0) {
  1040. if(HrGetCharsetByCP(lArg, &szANSICharset) != hrSuccess)
  1041. szANSICharset = "us-ascii";
  1042. sState[ulState].szCharset = szANSICharset;
  1043. } else if(strcmp(szCommand,"fcharset") == 0) {
  1044. if(sState[ulState].bInFontTbl) {
  1045. mapFontToCharset.insert(pair<int, int>(sState[ulState].ulFont, lArg));
  1046. }
  1047. } else if(strcmp(szCommand,"htmltag") == 0) {
  1048. } else if(strcmp(szCommand,"mhtmltag") == 0) {
  1049. } else if(strcmp(szCommand,"par") == 0 || strcmp(szCommand,"line") == 0) {
  1050. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl) {
  1051. sState[ulState].output.append("\r\n");
  1052. }
  1053. } else if(strcmp(szCommand,"tab") == 0) {
  1054. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl) {
  1055. sState[ulState].output.append("\t");
  1056. }
  1057. } else if (strcmp(szCommand,"uc") == 0) {
  1058. sState[ulState].ulUnicodeSkip = lArg;
  1059. } else if(strcmp(szCommand,"f") == 0) {
  1060. sState[ulState].ulFont = lArg;
  1061. if(!sState[ulState].bInFontTbl) {
  1062. fontmap_t::const_iterator i = mapFontToCharset.find(lArg);
  1063. if (i == mapFontToCharset.cend())
  1064. continue;
  1065. // Output any data before this point
  1066. strBodyOut += RTFFlushStateOutput(convertContext, sState, ulState);
  1067. // Set new charset
  1068. HrGetCharsetByRTFID(i->second, &sState[ulState].szCharset);
  1069. if(sState[ulState].szCharset == NULL) {
  1070. sState[ulState].szCharset = "us-ascii";
  1071. } else if(sState[ulState].szCharset[0] == 0) {
  1072. sState[ulState].szCharset = szANSICharset;
  1073. }
  1074. }
  1075. // ignore error
  1076. }
  1077. else if (strcmp(szCommand,"u") == 0) {
  1078. // unicode character, in signed short WCHAR
  1079. strBodyOut += RTFFlushStateOutput(convertContext, sState, ulState);
  1080. if (!sState[ulState].bRTFOnly)
  1081. strBodyOut.append(1, (unsigned short)lArg); // add as literal character
  1082. sState[ulState].ulSkipChars += sState[ulState].ulUnicodeSkip;
  1083. }
  1084. else if(strcmp(szCommand,"htmlrtf") == 0) {
  1085. sState[ulState].bRTFOnly = lArg != 0;
  1086. }
  1087. else if(strcmp(szCommand,"generator") == 0){
  1088. while (*szInput != ';' && *szInput != '}' && *szInput)
  1089. ++szInput;
  1090. if(*szInput == ';')
  1091. ++szInput;
  1092. }
  1093. else if(strcmp(szCommand,"bkmkstart") == 0 || strcmp(szCommand,"bkmkend") == 0){
  1094. // skip bookmark name
  1095. while (*szInput && isalnum(*szInput))
  1096. ++szInput;
  1097. sState[ulState].bInSkipTbl = true;
  1098. } else if(isRTFIgnoreCommand(szCommand)) {
  1099. sState[ulState].bInSkipTbl = true;
  1100. }
  1101. }
  1102. // Non-alnum after '\'
  1103. else if(*szInput == '\\') {
  1104. ++szInput;
  1105. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl)
  1106. sState[ulState].output.append(1,'\\');
  1107. }
  1108. else if(*szInput == '{') {
  1109. ++szInput;
  1110. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl)
  1111. sState[ulState].output.append(1,'{');
  1112. }
  1113. else if(*szInput == '}') {
  1114. ++szInput;
  1115. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl)
  1116. sState[ulState].output.append(1,'}');
  1117. }
  1118. else if(*szInput == '\'') {
  1119. unsigned int ulChar = 0;
  1120. ++szInput;
  1121. if(*szInput) {
  1122. ulChar = (unsigned int) (strchr(szHex, toupper(*szInput)) == NULL ? 0 : (strchr(szHex, toupper(*szInput)) - szHex));
  1123. ulChar = ulChar << 4;
  1124. ++szInput;
  1125. }
  1126. if(*szInput) {
  1127. ulChar += (unsigned int) (strchr(szHex, toupper(*szInput)) == NULL ? 0 : (strchr(szHex, toupper(*szInput)) - szHex));
  1128. ++szInput;
  1129. }
  1130. if(!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl && !sState[ulState].ulSkipChars) {
  1131. if(ulChar > 255)
  1132. sState[ulState].output.append(1, '?');
  1133. else
  1134. sState[ulState].output.append(1, ulChar);
  1135. } else if (sState[ulState].ulSkipChars)
  1136. --sState[ulState].ulSkipChars;
  1137. } else {
  1138. ++szInput; // skip single character after '\'
  1139. }
  1140. } // Non-command
  1141. else if(*szInput == '{') {
  1142. // Dump output data
  1143. strBodyOut += RTFFlushStateOutput(convertContext, sState, ulState);
  1144. ++ulState;
  1145. if (ulState >= RTF_MAXSTATE)
  1146. return MAPI_E_NOT_ENOUGH_MEMORY;
  1147. sState[ulState] = sState[ulState-1];
  1148. ++szInput;
  1149. } else if(*szInput == '}') {
  1150. // Dump output data
  1151. strBodyOut += RTFFlushStateOutput(convertContext, sState, ulState);
  1152. if(ulState > 0)
  1153. --ulState;
  1154. ++szInput;
  1155. } else if(*szInput == '\r' || *szInput == '\n') {
  1156. ++szInput;
  1157. } else {
  1158. if (!sState[ulState].bInFontTbl && !sState[ulState].bRTFOnly && !sState[ulState].bInColorTbl && !sState[ulState].bInSkipTbl && !sState[ulState].ulSkipChars)
  1159. sState[ulState].output.append(1,*szInput);
  1160. else if (sState[ulState].ulSkipChars)
  1161. --sState[ulState].ulSkipChars;
  1162. ++szInput;
  1163. }
  1164. }
  1165. strBodyOut += RTFFlushStateOutput(convertContext, sState, ulState);
  1166. return hrSuccess;
  1167. }
  1168. } /* namespace */