librosie.cpp 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. #ifndef _GNU_SOURCE
  2. #define _GNU_SOURCE
  3. #endif
  4. #include <map>
  5. #include <set>
  6. #include <string>
  7. #include <vector>
  8. #include <cstdio>
  9. #include <pthread.h>
  10. #include <tidy.h>
  11. #include <tidybuffio.h>
  12. #include <kopano/platform.h>
  13. #include <kopano/stringutil.h>
  14. #include "librosie.h"
  15. namespace KC {
  /*
   * Allow-list of element names. rosie_reject_tag() treats every name that
   * is absent from this set as unwanted. The set is deliberately non-const:
   * rosie_init() may add to it at run time.
   */
  static std::set<std::string> rosie_good_tags = {
      /* a */
      "a", "abbr", "address", "area", "article", "aside",
      /* b - c */
      "b", "blockquote", "body", "br",
      "caption", "center", "cite", "code", "col",
      /* d - f */
      "datalist", "details", "div",
      "em",
      "figcaption", "figure", "font", "footer", "form",
      /* h - i */
      "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html",
      "i", "img", "input", "ins",
      /* l - o */
      "label", "legend", "li",
      "link", // FIXME css?
      "mark", "meter",
      "nav", "noscript",
      "ol", "optgroup", "output",
      /* p - s */
      "p", "pre",
      "q",
      "samp", "section", "small", "span", "strong", "style", "sub",
      "summary", "sup",
      /* t - w */
      "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "time",
      "title", "tr",
      "u", "ul",
      "var",
      "wbr",
  };
  /*
   * Per-element allow-list of attribute names, keyed by element name.
   * rosie_reject_attr() rejects every attribute of an element that has no
   * entry here, and every attribute not named in the element's set.
   *
   * Each element must appear exactly once: std::map keeps only the first of
   * several initializer entries with the same key, so a second "li" row
   * would be dropped silently.
   */
  static std::map<std::string, std::set<std::string> > rosie_good_attrs = {
      {"a", {"class", "href", "name", "title"}},
      {"div", {"class", "id"}},
      {"font", {"size"}},
      {"form", {"id"}},
      {"img", {"alt", "border", "height", "src", "width"}},
      {"input", {"class", "id", "size", "type", "width", "for"}},
      {"label", {"for"}},
      {"li", {"class", "id"}},
      {"link", {"href", "media", "rel", "type"}},
      {"meta", {"charset"}},
      {"table", {"cellpadding", "cellspacing", "class", "height", "width"}},
      {"td", {"align", "height", "valign", "width"}},
      {"tr", {"align", "height", "valign", "width"}},
  };
  /* Set once rosie_init() has been invoked; see rosie_clean_html(). */
  static bool rosie_inited = false;
  /* Held by rosie_clean_html() while it tests and sets rosie_inited. */
  static pthread_mutex_t rosie_initlock = PTHREAD_MUTEX_INITIALIZER;
  46. static void rosie_init(void)
  47. {
  48. for (const auto &p : rosie_good_attrs) {
  49. auto it = rosie_good_attrs.find(p.first);
  50. if (it != rosie_good_attrs.end())
  51. continue;
  52. rosie_good_tags.insert(p.first);
  53. }
  54. }
  55. static bool rosie_reject_tag(const char *const tag)
  56. {
  57. bool r = rosie_good_tags.find(tag) == rosie_good_tags.end();
  58. #ifdef _DEBUG
  59. if (r)
  60. fprintf(stderr, "%s ", tag);
  61. #endif
  62. return r;
  63. }
  64. static bool rosie_reject_attr(const char *tag, const char *const attr)
  65. {
  66. auto it = rosie_good_attrs.find(tag);
  67. if (it == rosie_good_attrs.end())
  68. return true;
  69. bool r = it->second.find(attr) == it->second.end();
  70. #ifdef _DEBUG
  71. if (r)
  72. fprintf(stderr, "%s(%s) ", tag, attr);
  73. #endif
  74. return r;
  75. }
  76. static void rosie_strip_attrs(TidyDoc tdoc, TidyNode tnod)
  77. {
  78. ctmbstr tname = tidyNodeGetName(tnod);
  79. for (TidyAttr attribute = tidyAttrFirst(tnod); attribute != NULL; ) {
  80. ctmbstr aname = tidyAttrName(attribute);
  81. if (aname != NULL && rosie_reject_attr(tname, aname)) {
  82. TidyAttr next = tidyAttrNext(attribute);
  83. tidyAttrDiscard(tdoc, tnod, attribute);
  84. attribute = next;
  85. } else {
  86. attribute = tidyAttrNext(attribute);
  87. }
  88. }
  89. }
  90. static bool rosie_strip_nodes(TidyDoc tdoc, TidyNode tnod)
  91. {
  92. for (TidyNode child = tidyGetChild(tnod); child != NULL; ) {
  93. ctmbstr name = tidyNodeGetName(child);
  94. if (name != NULL && rosie_reject_tag(name)) {
  95. child = tidyDiscardElement(tdoc, child);
  96. } else {
  97. rosie_strip_attrs(tdoc, tnod);
  98. rosie_strip_nodes(tdoc, child);
  99. child = tidyGetNext(child);
  100. }
  101. }
  102. return true;
  103. }
  104. static bool rosie_strip_nodes(TidyDoc tdoc)
  105. {
  106. return rosie_strip_nodes(tdoc, tidyGetRoot(tdoc));
  107. }
  108. bool rosie_clean_html(const std::string &in, std::string *const out,
  109. std::vector<std::string> *const errors)
  110. {
  111. pthread_mutex_lock(&rosie_initlock);
  112. if (!rosie_inited) {
  113. rosie_inited = true;
  114. rosie_init();
  115. }
  116. pthread_mutex_unlock(&rosie_initlock);
  117. TidyBuffer output;
  118. TidyBuffer errbuf;
  119. int rc = -1;
  120. out->clear();
  121. tidyBufInit(&output);
  122. tidyBufInit(&errbuf);
  123. TidyDoc tdoc = tidyCreate();
  124. tidyOptSetBool(tdoc, TidyHideComments, yes); /* they don't help */
  125. tidyOptSetBool(tdoc, TidyReplaceColor, yes);
  126. tidyOptSetBool(tdoc, TidyPreserveEntities, yes);
  127. rc = tidySetErrorBuffer(tdoc, &errbuf); /* capture diagnostics */
  128. if (rc != 0 && errors != NULL)
  129. errors->push_back(format("tidySetErrorBuffer(%d) ", rc));
  130. if (rc >= 0)
  131. rc = tidyParseString(tdoc, in.c_str());
  132. if (rc != 0 && errors != NULL)
  133. errors->push_back(format("tidyParseString(%d) ", rc));
  134. if (rc >= 0)
  135. rc = tidyCleanAndRepair(tdoc);
  136. if (rc != 0 && errors != NULL)
  137. errors->push_back(format("tidyCleanAndRepair(%d) ", rc));
  138. if (rc >= 0)
  139. rc = rosie_strip_nodes(tdoc) ? 0 : -1;
  140. if (rc != 0 && errors != NULL)
  141. errors->push_back(format("RemoveBadHtml(%d) ", rc));
  142. if (rc >= 0)
  143. rc = tidyRunDiagnostics(tdoc); /* kvetch */
  144. if (rc != 0 && errors != NULL)
  145. errors->push_back(format("tidyRunDiagnostics(%d) ", rc));
  146. tidyOptSetBool(tdoc, TidyForceOutput, yes);
  147. if (rc >= 0)
  148. rc = tidySaveBuffer(tdoc, &output); /* pretty print */
  149. if (rc != 0 && errors != NULL)
  150. errors->push_back(format("tidySaveBuffer(%d) ", rc));
  151. out->assign(reinterpret_cast<const char *>(output.bp));
  152. if (rc == 0 || rc == 1) {
  153. /* rc==1: warnings emitted */
  154. if (rc == 1 && errors != nullptr)
  155. errors->push_back(format("%s: libtidy warning: %s",
  156. __PRETTY_FUNCTION__,
  157. reinterpret_cast<const char *>(errbuf.bp)));
  158. } else if (errors != nullptr) {
  159. errors->push_back(format("%s: libtidy failed: %s",
  160. __PRETTY_FUNCTION__,
  161. reinterpret_cast<const char *>(errbuf.bp)));
  162. }
  163. if (rc >= 0)
  164. tidyBufFree(&output);
  165. tidyBufFree(&errbuf);
  166. tidyRelease(tdoc);
  167. return rc >= 0;
  168. }
  169. } /* namespace */