parser.cpp 19 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800
  1. /*
  2. * Timeless dependency
  3. * Copyright (C) <2019> <alkeon> [alkeon@autistici.org]
  4. *
  5. * Texdi is free software: you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation, either version 3 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * Texdi is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with timeless. If not, see <http://www.gnu.org/licenses/>.
  17. *
  18. */
  19. #include <iostream>
  20. #include <string>
  21. #include <fstream>
  22. #include <sstream>
  23. #include <map>
  24. #include <curl/curl.h>
  25. #include "parser.h"
  26. #include "datalite.h"
  27. using namespace std;
  // Tag kinds reported by parser::next_tag() / parser::next_tag_atom().
  // Typed constants instead of object-like macros: they obey scoping rules and
  // remain usable as `case` labels.
  constexpr int TITLE = 0;
  constexpr int LINK = 1;
  constexpr int DESCRIPTION = 2;   // returned for both the opening and the closing description/content tag
  constexpr int START_ITEM = 3;
  constexpr int END_ITEM = 4;

  // Value handed to libcurl through CURLOPT_USERAGENT in parser::download_news().
  const char * user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
  /**
   * Lookup table from a named character reference (the text between '&' and
   * ';', without either delimiter) to its UTF-8 replacement text.
   * Consulted by parser::decode_html_chars().
   */
  std::map<std::string, std::string> ENTITIES_VALUES = {
      {"AElig", "Æ"},
      {"Aacute", "Á"},
      {"Acirc", "Â"},
      {"Agrave", "À"},
      {"Alpha", "Α"},
      {"Aring", "Å"},
      {"Atilde", "Ã"},
      {"Auml", "Ä"},
      {"Beta", "Β"},
      {"Ccedil", "Ç"},
      {"Chi", "Χ"},
      {"Dagger", "‡"},
      {"Delta", "Δ"},
      {"ETH", "Ð"},
      {"Eacute", "É"},
      {"Ecirc", "Ê"},
      {"Egrave", "È"},
      {"Epsilon", "Ε"},
      {"Eta", "Η"},
      {"Euml", "Ë"},
      {"Gamma", "Γ"},
      {"Iacute", "Í"},
      {"Icirc", "Î"},
      {"Igrave", "Ì"},
      {"Iota", "Ι"},
      {"Iuml", "Ï"},
      {"Kappa", "Κ"},
      {"Lambda", "Λ"},
      {"Mu", "Μ"},
      {"Ntilde", "Ñ"},
      {"Nu", "Ν"},
      {"OElig", "Œ"},
      {"Oacute", "Ó"},
      {"Ocirc", "Ô"},
      {"Ograve", "Ò"},
      {"Omega", "Ω"},
      {"Omicron", "Ο"},
      {"Oslash", "Ø"},
      {"Otilde", "Õ"},
      {"Ouml", "Ö"},
      {"Phi", "Φ"},
      {"Pi", "Π"},
      {"Prime", "″"},
      {"Psi", "Ψ"},
      {"Rho", "Ρ"},
      {"Scaron", "Š"},
      {"Sigma", "Σ"},
      {"THORN", "Þ"},
      {"Tau", "Τ"},
      {"Theta", "Θ"},
      {"Uacute", "Ú"},
      {"Ucirc", "Û"},
      {"Ugrave", "Ù"},
      {"Upsilon", "Υ"},
      {"Uuml", "Ü"},
      {"Xi", "Ξ"},
      {"Yacute", "Ý"},
      {"Yuml", "Ÿ"},
      {"Zeta", "Ζ"},
      {"aacute", "á"},
      {"acirc", "â"},
      {"acute", "´"},
      {"aelig", "æ"},
      {"agrave", "à"},
      {"alefsym", "ℵ"},
      {"alpha", "α"},
      {"amp", "&"},
      {"and", "∧"},
      {"ang", "∠"},
      {"apos", "'"},
      {"aring", "å"},
      {"asymp", "≈"},
      {"atilde", "ã"},
      {"auml", "ä"},
      {"bdquo", "„"},
      {"beta", "β"},
      {"brvbar", "¦"},
      {"bull", "•"},
      {"cap", "∩"},
      {"ccedil", "ç"},
      {"cedil", "¸"},
      {"cent", "¢"},
      {"chi", "χ"},
      {"circ", "ˆ"},
      {"clubs", "♣"},
      {"cong", "≅"},
      {"copy", "©"},
      {"crarr", "↵"},
      {"cup", "∪"},
      {"curren", "¤"},
      {"dArr", "⇓"},
      {"dagger", "†"},
      {"darr", "↓"},
      {"deg", "°"},
      {"delta", "δ"},
      {"diams", "♦"},
      {"divide", "÷"},
      {"eacute", "é"},
      {"ecirc", "ê"},
      {"egrave", "è"},
      {"empty", "∅"},
      {"emsp", "\xE2\x80\x83"},
      {"ensp", "\xE2\x80\x82"},
      {"epsilon", "ε"},
      {"equiv", "≡"},
      {"eta", "η"},
      {"eth", "ð"},
      {"euml", "ë"},
      {"euro", "€"},
      {"exist", "∃"},
      {"fnof", "ƒ"},
      {"forall", "∀"},
      {"frac12", "½"},
      {"frac14", "¼"},
      {"frac34", "¾"},
      {"frasl", "⁄"},
      {"gamma", "γ"},
      {"ge", "≥"},
      {"gt", ">"},
      {"hArr", "⇔"},
      {"harr", "↔"},
      {"hearts", "♥"},
      {"hellip", "…"},
      {"iacute", "í"},
      {"icirc", "î"},
      {"iexcl", "¡"},
      {"igrave", "ì"},
      {"image", "ℑ"},
      {"infin", "∞"},
      {"int", "∫"},
      {"iota", "ι"},
      {"iquest", "¿"},
      {"isin", "∈"},
      {"iuml", "ï"},
      {"kappa", "κ"},
      {"lArr", "⇐"},
      {"lambda", "λ"},
      {"lang", "〈"},
      {"laquo", "«"},
      {"larr", "←"},
      {"lceil", "⌈"},
      {"ldquo", "“"},
      {"le", "≤"},
      {"lfloor", "⌊"},
      {"lowast", "∗"},
      {"loz", "◊"},
      {"lrm", "\xE2\x80\x8E"},
      {"lsaquo", "‹"},
      {"lsquo", "‘"},
      {"lt", "<"},
      {"macr", "¯"},
      {"mdash", "—"},
      {"micro", "µ"},
      {"middot", "·"},
      {"minus", "−"},
      {"mu", "μ"},
      {"nabla", "∇"},
      {"nbsp", "\xC2\xA0"},
      {"ndash", "–"},
      {"ne", "≠"},
      {"ni", "∋"},
      {"not", "¬"},
      {"notin", "∉"},
      {"nsub", "⊄"},
      {"ntilde", "ñ"},
      {"nu", "ν"},
      {"oacute", "ó"},
      {"ocirc", "ô"},
      {"oelig", "œ"},
      {"ograve", "ò"},
      {"oline", "‾"},
      {"omega", "ω"},
      {"omicron", "ο"},
      {"oplus", "⊕"},
      {"or", "∨"},
      {"ordf", "ª"},
      {"ordm", "º"},
      {"oslash", "ø"},
      {"otilde", "õ"},
      {"otimes", "⊗"},
      {"ouml", "ö"},
      {"para", "¶"},
      {"part", "∂"},
      {"permil", "‰"},
      {"perp", "⊥"},
      {"phi", "φ"},
      {"pi", "π"},
      {"piv", "ϖ"},
      {"plusmn", "±"},
      {"pound", "£"},
      {"prime", "′"},
      {"prod", "∏"},
      {"prop", "∝"},
      {"psi", "ψ"},
      {"quot", "\""},
      {"rArr", "⇒"},
      {"radic", "√"},
      {"rang", "〉"},
      {"raquo", "»"},
      {"rarr", "→"},
      {"rceil", "⌉"},
      {"rdquo", "”"},
      {"real", "ℜ"},
      {"reg", "®"},
      {"rfloor", "⌋"},
      {"rho", "ρ"},
      {"rlm", "\xE2\x80\x8F"},
      {"rsaquo", "›"},
      {"rsquo", "’"},
      {"sbquo", "‚"},
      {"scaron", "š"},
      {"sdot", "⋅"},
      {"sect", "§"},
      {"shy", "\xC2\xAD"},
      {"sigma", "σ"},
      {"sigmaf", "ς"},
      {"sim", "∼"},
      {"spades", "♠"},
      {"sub", "⊂"},
      {"sube", "⊆"},
      {"sum", "∑"},
      {"sup1", "¹"},
      {"sup2", "²"},
      {"sup3", "³"},
      {"sup", "⊃"},
      {"supe", "⊇"},
      {"szlig", "ß"},
      {"tau", "τ"},
      {"there4", "∴"},
      {"theta", "θ"},
      {"thetasym", "ϑ"},
      {"thinsp", "\xE2\x80\x89"},
      {"thorn", "þ"},
      {"tilde", "˜"},
      {"times", "×"},
      {"trade", "™"},
      {"uArr", "⇑"},
      {"uacute", "ú"},
      {"uarr", "↑"},
      {"ucirc", "û"},
      {"ugrave", "ù"},
      {"uml", "¨"},
      {"upsih", "ϒ"},
      {"upsilon", "υ"},
      {"uuml", "ü"},
      {"weierp", "℘"},
      {"xi", "ξ"},
      {"yacute", "ý"},
      {"yen", "¥"},
      {"yuml", "ÿ"},
      {"zeta", "ζ"},
      {"zwj", "\xE2\x80\x8D"},
      {"zwnj", "\xE2\x80\x8C"},
  };
  /**
   * libcurl write callback: appends one received chunk to a std::string.
   *
   * libcurl invokes the callback with the CURLOPT_WRITEDATA pointer as the last
   * argument, so that argument has to be received as a pointer. Declaring it as
   * a std::string by value does not match what libcurl passes.
   *
   * @param ptr    start of the received bytes (not NUL-terminated)
   * @param size   size of one element
   * @param nmemb  number of elements; the chunk is size * nmemb bytes long
   * @param stream the CURLOPT_WRITEDATA pointer; must point to a std::string
   * @return number of bytes consumed; a value different from size * nmemb makes
   *         libcurl abort the transfer, which is what happens when stream is null
   */
  size_t write_to_string(void *ptr, size_t size, size_t nmemb, void *stream){
      const size_t realsize = size * nmemb;
      if(stream == nullptr)
          return 0;
      std::string *target = static_cast<std::string*>(stream);
      target->append(static_cast<const char*>(ptr), realsize);
      return realsize;
  }
  295. void parser::get_news(){
  296. datalite news;
  297. news.set_file("news");
  298. cout << "Downloading news" << endl;
  299. while(news.is_valid_channel()){
  300. string channel = news.get_new_channel();
  301. cout << channel << endl;
  302. cout << "\tDownloading" << endl;
  303. this->download_news(channel);
  304. }
  305. }
  306. void parser::download_news(const string& xml){
  307. CURL * curl;
  308. curl = curl_easy_init();
  309. if (curl) {
  310. string response;
  311. curl_easy_setopt(curl, CURLOPT_URL, xml.c_str());
  312. curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
  313. curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 3L);
  314. curl_easy_setopt(curl, CURLOPT_TIMEOUT, 20L);
  315. curl_easy_setopt(curl, CURLOPT_USERAGENT, user_agent);
  316. curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 10L);
  317. curl_easy_setopt(curl, CURLOPT_USE_SSL, CURLUSESSL_TRY);
  318. curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_to_string);
  319. curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
  320. curl_easy_perform(curl);
  321. curl_easy_cleanup(curl);
  322. if(response.length() > 0){
  323. cout << "\tParsing" << endl;
  324. parser p;
  325. p.detect_rss_standard(istringstream(response).str());
  326. }
  327. }else
  328. curl_easy_cleanup(curl);
  329. }
  330. int parser::next_tag(const string& line){
  331. int tag = -1;
  332. size_t size = line.size();
  333. size_t title = line.find("<title>");
  334. size_t link = line.find("<link>");
  335. size_t description = line.find("<description>");
  336. size_t end_description = line.find("</description>");
  337. size_t start_item = line.find("<item");
  338. size_t end_item = line.find("</item>");
  339. if(title != string::npos && title < size){
  340. size = title;
  341. tag = TITLE;
  342. }
  343. if(link != string::npos && link < size){
  344. size = link;
  345. tag = LINK;
  346. }
  347. if(description != string::npos && description < size){
  348. size = description;
  349. tag = DESCRIPTION;
  350. }
  351. if(end_description != string::npos && end_description < size){
  352. size = end_description;
  353. tag = DESCRIPTION;
  354. }
  355. if(start_item != string::npos && start_item < size){
  356. size = start_item;
  357. tag = START_ITEM;
  358. }
  359. if(end_item != string::npos && end_item < size){
  360. size = end_item;
  361. tag = END_ITEM;
  362. }
  363. return tag;
  364. }
  365. int parser::next_tag_atom(const string& line){
  366. int tag = -1;
  367. size_t size = line.size();
  368. size_t title = line.find("<title>");
  369. size_t link = line.find("<link");
  370. size_t description = line.find("<content type");
  371. size_t end_description = line.find("</content>");
  372. size_t start_item = line.find("<entry>");
  373. size_t end_item = line.find("</entry>");
  374. if(title != string::npos && title < size){
  375. size = title;
  376. tag = TITLE;
  377. }
  378. if(link != string::npos && link < size){
  379. size = link;
  380. tag = LINK;
  381. }
  382. if(description != string::npos && description < size){
  383. size = description;
  384. tag = DESCRIPTION;
  385. }
  386. if(end_description != string::npos && end_description < size){
  387. size = end_description;
  388. tag = DESCRIPTION;
  389. }
  390. if(start_item != string::npos && start_item < size){
  391. size = start_item;
  392. tag = START_ITEM;
  393. }
  394. if(end_item != string::npos && end_item < size){
  395. size = end_item;
  396. tag = END_ITEM;
  397. }
  398. return tag;
  399. }
  400. void parser::detect_rss_standard(const string& rss_text){
  401. stringstream index(rss_text);
  402. string line;
  403. bool rss = false;
  404. bool atom = false;
  405. while(getline(index, line)){
  406. size_t rss_position = line.find("<rss version");
  407. if(rss_position != string::npos) rss = true;
  408. else{
  409. size_t atom_position = line.find("<feed");
  410. if(atom_position != string::npos){
  411. atom = true;
  412. _item = false;
  413. }
  414. size_t channel_position = line.find("<channel");
  415. if(channel_position != string::npos) rss = true;
  416. }
  417. if(atom)
  418. this->atom(line);
  419. else if(rss)
  420. this->rss(line);
  421. }
  422. datalite d;
  423. d.set_file("news");
  424. d.bulk_insert(_links, _titles, _descriptions);
  425. }
  426. void parser::atom(string line){
  427. int tag = next_tag_atom(line);
  428. if(tag == -1 && _content){
  429. atom_description(line);
  430. }
  431. while(tag != -1){
  432. switch(tag){
  433. case TITLE: title(line);break;
  434. case LINK: atom_link(line);break;
  435. case DESCRIPTION: atom_description(line);break;
  436. case START_ITEM: atom_start_item(line);break;
  437. case END_ITEM: atom_end_item(line);break;
  438. }
  439. tag = next_tag_atom(line);
  440. }
  441. atom_description(line);
  442. }
  443. void parser::atom_link(string& line){
  444. size_t link = line.find("<link");
  445. if(link != string::npos){
  446. line = line.substr(link + 5);
  447. link = line.find("href=\"");
  448. if(link != string::npos){
  449. line = line.substr(link + 6);
  450. size_t less_than = line.find("\"");
  451. if(less_than != string::npos){
  452. if(_item_link.size() == 0)
  453. ++_item_tags;
  454. _item_link = line.substr(0, less_than);
  455. line = line.substr(less_than);
  456. }
  457. }
  458. }
  459. }
  460. void parser::atom_description(string& line){
  461. size_t content = line.find("<content type");
  462. if(content != string::npos and !_content){
  463. content = line.find(">", content);
  464. line = line.substr(content + 1);
  465. _item_description = line;
  466. _content = true;
  467. atom_end_description(line);
  468. delete_tags(_item_description);
  469. }
  470. if(_content && content == string::npos){
  471. _item_description += " " + line;
  472. atom_end_description(line);
  473. delete_tags(_item_description);
  474. }
  475. atom_end_description(line);
  476. }
  477. void parser::atom_end_description(string& line){
  478. size_t content_end_item = _item_description.find("</content>");
  479. if(content_end_item != string::npos)
  480. _item_description = _item_description.substr(0, content_end_item);
  481. size_t content_end = line.find("</content>");
  482. if(content_end != string::npos && _content){
  483. line = line.substr(content_end + 10);
  484. while(line[0]==' ') line.erase(0,1);
  485. _content = false;
  486. ++_item_tags;
  487. }
  488. }
  489. void parser::atom_start_item(string& line){
  490. size_t start_entry = line.find("<entry>");
  491. if(start_entry != string::npos){
  492. _item_tags = 0;
  493. _item_link = string();
  494. _item = true;
  495. _content = false;
  496. line = line.substr(start_entry + 7);
  497. }
  498. }
  499. void parser::atom_end_item(string& line){
  500. size_t end_entry = line.find("</entry>");
  501. if(end_entry != string::npos){
  502. if(_item && _item_tags == 3){
  503. clean_string(_item_description);
  504. escape_character_sql(_item_title);
  505. escape_character_sql(_item_description);
  506. delete_tags(_item_description);
  507. _links.push_back(_item_link);
  508. _titles.push_back(_item_title);
  509. _descriptions.push_back(_item_description);
  510. }
  511. _item = false;
  512. _item_tags = 0;
  513. _item_link = string();
  514. _content = false;
  515. line = line.substr(end_entry + 8);
  516. }
  517. }
  518. void parser::rss(string line){
  519. int tag = next_tag(line);
  520. if(tag == -1 && _content){
  521. rss_description(line);
  522. }
  523. while(tag != -1){
  524. switch(tag){
  525. case TITLE: title(line);break;
  526. case LINK: rss_link(line);break;
  527. case DESCRIPTION: rss_description(line);break;
  528. case START_ITEM: rss_start_item(line);break;
  529. case END_ITEM: rss_end_item(line);break;
  530. }
  531. tag = next_tag(line);
  532. }
  533. }
  /**
   * Encodes a Unicode code point as a UTF-8 byte sequence.
   *
   * Values that cannot be encoded as well-formed UTF-8 (the surrogate range
   * U+D800..U+DFFF and anything above U+10FFFF) are replaced by U+FFFD
   * REPLACEMENT CHARACTER instead of producing an invalid byte sequence.
   *
   * @param codepoint code point to encode
   * @return one to four bytes of UTF-8
   */
  std::string UnicodeToUTF8(unsigned int codepoint)
  {
      const bool is_surrogate = codepoint >= 0xd800 && codepoint <= 0xdfff;
      if (is_surrogate || codepoint > 0x10ffff)
          codepoint = 0xfffd;

      std::string out;
      if (codepoint <= 0x7f)
      {
          out.push_back(static_cast<char>(codepoint));
      }
      else if (codepoint <= 0x7ff)
      {
          out.push_back(static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
          out.push_back(static_cast<char>(0x80 | (codepoint & 0x3f)));
      }
      else if (codepoint <= 0xffff)
      {
          out.push_back(static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
          out.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
          out.push_back(static_cast<char>(0x80 | (codepoint & 0x3f)));
      }
      else
      {
          out.push_back(static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
          out.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f)));
          out.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
          out.push_back(static_cast<char>(0x80 | (codepoint & 0x3f)));
      }
      return out;
  }
  559. string parser::decode_html_chars(string line){
  560. size_t amp = line.find("&");
  561. size_t semicolon = line.find(";");
  562. while(amp != string::npos && semicolon != string::npos){
  563. if(amp + 1 < line.length() && line[amp + 1] == '#'){
  564. string encoded_value;
  565. if(amp + 2 < line.length() && line[amp + 2] == 'x'){
  566. try{
  567. encoded_value = line.substr(amp + 3, semicolon - (amp + 3));
  568. line = line.erase(amp, semicolon + 1 - amp);
  569. line = line.insert(amp, UnicodeToUTF8(stoul(encoded_value, nullptr, 16)));
  570. }catch(...){
  571. amp += 1;
  572. }
  573. }else{
  574. try{
  575. encoded_value = line.substr(amp + 2, semicolon - (amp + 2));
  576. line = line.erase(amp, semicolon + 1 - amp);
  577. line = line.insert(amp, UnicodeToUTF8(stoi(encoded_value)));
  578. }catch(...){
  579. amp += 1;
  580. }
  581. }
  582. } else {
  583. string encoded_value = line.substr(amp + 1, semicolon - (amp + 1));
  584. map<string, string>::iterator encoded_value_iterator = ENTITIES_VALUES.find(encoded_value);
  585. if(encoded_value_iterator != ENTITIES_VALUES.end()){
  586. line = line.erase(amp, semicolon + 1 - amp);
  587. line = line.insert(amp, encoded_value_iterator->second);
  588. }else{
  589. amp += 1;
  590. }
  591. }
  592. amp = line.find("&", amp);
  593. semicolon = line.find(";", amp);
  594. }
  595. return line;
  596. }
  597. void parser::clean_string(string& line){
  598. size_t cdata = line.find("<![CDATA[");
  599. if(cdata != string::npos)
  600. line = line.erase(cdata, 9);
  601. cdata = line.find("]]>");
  602. if(cdata != string::npos)
  603. line = line.erase(cdata, 3);
  604. line = decode_html_chars(line);
  605. }
  606. void parser::title(string& line){
  607. size_t title = line.find("<title>");
  608. if(title != string::npos){
  609. line = line.substr(title + 7);
  610. size_t less_than = line.find("</");
  611. if(less_than != string::npos){
  612. _item_title = line.substr(0, less_than);
  613. clean_string(_item_title);
  614. ++_item_tags;
  615. }
  616. }
  617. }
  618. void parser::rss_link(string& line){
  619. size_t link = line.find("<link>");
  620. if(link != string::npos){
  621. line = line.substr(link + 6);
  622. size_t less_than = line.find("</");
  623. if(less_than != string::npos){
  624. _item_link = line.substr(0, less_than);
  625. clean_string(_item_link);
  626. ++_item_tags;
  627. }
  628. }
  629. }
  630. void parser::rss_description(string& line){
  631. size_t content = line.find("<description>");
  632. if(content != string::npos && !_content){
  633. line = line.substr(content + 13);
  634. _item_description = line;
  635. _content = true;
  636. clean_string(_item_description);
  637. rss_end_description(_item_description);
  638. delete_tags(_item_description);
  639. }
  640. if(_content && content == string::npos){
  641. _item_description += " " + line;
  642. clean_string(_item_description);
  643. rss_end_description(line);
  644. delete_tags(_item_description);
  645. }
  646. rss_end_description(line);
  647. }
  648. void parser::rss_end_description(string& line){
  649. size_t content_end_item = _item_description.find("</description>");
  650. if(content_end_item != string::npos)
  651. _item_description = _item_description.substr(0, content_end_item);
  652. size_t content_end = line.find("</description>");
  653. if(content_end != string::npos && _content){
  654. line = line.substr(content_end + 14);
  655. while(_item_description[0] == ' ') _item_description.erase(0,1);
  656. _content = false;
  657. ++_item_tags;
  658. }
  659. }
  660. void parser::escape_character_sql(string& line){
  661. if(line.size() > 100) line = line.substr(0, 100);
  662. for(unsigned i = 0; i < line.size(); ++i){
  663. if(line[i] == '\''){
  664. line = line.insert(i, "\'");
  665. ++i;
  666. }
  667. }
  668. }
  669. void parser::rss_end_item(string& line){
  670. size_t end_entry = line.find("</item>");
  671. if(end_entry != string::npos){
  672. if(_item && _item_tags == 3){
  673. clean_string(_item_description);
  674. escape_character_sql(_item_title);
  675. escape_character_sql(_item_description);
  676. delete_tags(_item_description);
  677. _links.push_back(_item_link);
  678. _titles.push_back(_item_title);
  679. _descriptions.push_back(_item_description);
  680. }
  681. _item = false;
  682. _content = false;
  683. _item_tags = 0;
  684. line = line.substr(end_entry + 7);
  685. }
  686. }
  687. void parser::rss_start_item(string& line){
  688. size_t start_entry = line.find("<item");
  689. if(start_entry != string::npos){
  690. _item_tags = 0;
  691. _item_link = string();
  692. _item = true;
  693. _content = false;
  694. line = line.substr(start_entry + 5);
  695. }
  696. }
  697. void parser::delete_tags(string& line){
  698. size_t less_than = _item_description.find("<");
  699. size_t great_than = _item_description.find(">");
  700. while(less_than != string::npos && great_than != string::npos){
  701. if(less_than > great_than)
  702. _item_description.erase(great_than, 1);
  703. else{
  704. _item_description =
  705. _item_description.erase(less_than, great_than + 1 - less_than);
  706. }
  707. less_than = _item_description.find("<");
  708. great_than = _item_description.find(">");
  709. }
  710. }