12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308 |
- #include <sstream>
- #include <stdlib.h>
- #include <string.h>
- #include <unistd.h>
- #include "BBS2chProxyHTML2Dat.h"
- #include "stringEncodingConverter.h"
// Settings and helpers defined elsewhere in the project.
// Upstream proxy: when proxy_server is non-NULL these three values are passed to
// CURLOPT_PROXY, CURLOPT_PROXYPORT and CURLOPT_PROXYTYPE by getHtmlFromURL().
- extern char *proxy_server;
- extern long proxy_port;
- extern long proxy_type;
// Passed to CURLOPT_TIMEOUT.
- extern long timeout;
// When non-NULL, sent as the User-Agent in preference to the one captured from request headers.
- extern char *user_agent;
// When non-zero, name resolution is restricted with CURL_IPRESOLVE_V4.
- extern int force_ipv4;
// When non-NULL, attached to each transfer through CURLOPT_SHARE.
- extern CURLSH *curl_share;
// printf-style logger taking a level as its first argument.
- extern void log_printf(int level, const char *format ...);
// Searches the l_len bytes at l for the s_len bytes at s; callers in this file treat a
// NULL result as "not found" and a non-NULL result as a pointer to the match.
- extern void *memmem_priv(const void *l, size_t l_len, const void *s, size_t s_len);
// On Windows, route gmtime_r() calls to gmtime_s(), which takes its two arguments in the opposite order.
- #ifdef _WIN32
- #define gmtime_r(a, b) gmtime_s(b, a)
- #endif
// strptime() format used to parse the "Y/M/D H:M:S JST" string built for the last post
// when curl_getdate() is not available.
- static const char threadTimestampFmt[] = "%Y/%m/%d %H:%M:%S %Z";
// Japanese weekday abbreviations (Sunday first); indexed with tm_wday when a date field is formatted.
- static const char *wdays[7] = {
- "日",
- "月",
- "火",
- "水",
- "木",
- "金",
- "土"
- };
/**
 * Decodes the hex string that follows "/cdn-cgi/l/email-protection#" or that is
 * stored in a data-cfemail attribute.
 *
 * The first two hex digits form an XOR key; every following pair of hex digits
 * is one byte that is XORed with that key. The result is NUL-terminated.
 *
 * @param decrypted  Output buffer; needs room for strlen(encrypted)/2 + 1 bytes.
 * @param encrypted  NUL-terminated hex string. Characters that are not hex digits
 *                   are handled the way strtol() handles them (parsing stops there).
 * @return Number of decoded bytes, not counting the terminator.
 */
static int decryptMail(unsigned char *decrypted, char *encrypted)
{
	unsigned char *out = decrypted;

	// An empty input carries no key byte. Bail out before touching encrypted[1],
	// which would lie beyond the terminator.
	if (encrypted[0] == '\0') {
		*out = 0;
		return 0;
	}

	// "0x" + two digits + NUL; the trailing NUL comes from the array initializer.
	char hexPair[5] = "0x";
	hexPair[2] = encrypted[0];
	hexPair[3] = encrypted[1];
	const unsigned int key = strtol(hexPair, NULL, 16);

	const size_t len = strlen(encrypted);
	for (size_t n = 2; n < len; n += 2) {
		hexPair[2] = encrypted[n];
		hexPair[3] = encrypted[n+1]; // for an odd length this is the terminator itself
		const unsigned int value = strtol(hexPair, NULL, 16);
		*out++ = value ^ key;
	}
	*out = 0;
	//fprintf(stderr,"%s->%s\n",encrypted,decrypted);
	return static_cast<int>(out - decrypted);
}
/**
 * Replaces every occurrence of oldValue inside input with newValue, in place.
 * The search resumes right after each inserted replacement, so text that was
 * just inserted is never scanned again. An empty oldValue leaves input untouched.
 */
static void replaceAll(std::string &input, const std::string &oldValue, const std::string &newValue)
{
	if (oldValue.empty()) return;

	for (std::string::size_type at = input.find(oldValue, 0);
	     at != std::string::npos;
	     at = input.find(oldValue, at + newValue.size())) {
		input.replace(at, oldValue.size(), newValue);
	}
}
- static void escapeForHTML(std::string &input)
- {
- replaceAll(input, "&", "&");
- replaceAll(input, "<", "<");
- replaceAll(input, ">", ">");
- replaceAll(input, "\"", """);
- replaceAll(input, "'", "'");
- }
/**
 * Write callback for libcurl: appends the received chunk to the
 * std::vector<char> that userdata points to.
 *
 * @return size * nitems, i.e. the whole chunk is always reported as consumed.
 */
static size_t write_callback_download(char *buffer, size_t size, size_t nitems, void *userdata)
{
	const size_t byteCount = size * nitems;
	std::vector<char> &sink = *static_cast<std::vector<char> *>(userdata);
	sink.insert(sink.end(), buffer, buffer + byteCount);
	return byteCount;
}
- BBS2chProxyHTML2Dat5ch::BBS2chProxyHTML2Dat5ch(BBS2chProxyThreadCache *cache, const BBS2chThreadIdentifier &identifier, bool useHttps, CURL *curl)
- : IBBS2chProxyHTML2Dat(cache, identifier, curl)
- {
- _url = useHttps ? "https://" : "http://";
- _url += identifier.host;
- _url += "/test/read.cgi/";
- _url += identifier.board;
- _url += '/';
- _url += identifier.key;
- _url += '/';
- }
- BBS2chProxyHTML2DatTalk::BBS2chProxyHTML2DatTalk(BBS2chProxyThreadCache *cache, const BBS2chThreadIdentifier &identifier, CURL *curl)
- : IBBS2chProxyHTML2Dat(cache, identifier, curl), _cachedJson(NULL)
- {
- _url = "https://talk.jp/api/boards/";
- _url += identifier.board;
- _url += "/threads/";
- _url += identifier.key;
- }
- BBS2chProxyHTML2DatTalkHTML::BBS2chProxyHTML2DatTalkHTML(BBS2chProxyThreadCache *cache, const BBS2chThreadIdentifier &identifier, CURL *curl)
- : BBS2chProxyHTML2DatTalk(cache, identifier, curl)
- {
- _url = "https://talk.jp/boards/";
- _url += identifier.board;
- _url += '/';
- _url += identifier.key;
- _url += '/';
- }
- BBS2chProxyHTML2Dat5chItest::BBS2chProxyHTML2Dat5chItest(BBS2chProxyThreadCache *cache, const BBS2chThreadIdentifier &identifier, CURL *curl)
- : BBS2chProxyHTML2DatTalk(cache, identifier, curl)
- {
- _url = "https://itest.5ch.net/public/newapi/client.php?subdomain=";
- _url += identifier.hostPrefix;
- _url += "&board=";
- _url += identifier.board;
- _url += "&dat=";
- _url += identifier.key;
- }
- std::vector<char> IBBS2chProxyHTML2Dat::getHtmlFromURL(const std::string &url, long *outStatusCode)
- {
- CURLcode res;
- long statusCode = 0;
- std::vector<char> html;
- if (curl_share) curl_easy_setopt(_curl, CURLOPT_SHARE, curl_share);
- curl_easy_setopt(_curl, CURLOPT_URL, url.c_str());
- curl_easy_setopt(_curl, CURLOPT_NOSIGNAL, 1L);
- curl_easy_setopt(_curl, CURLOPT_TIMEOUT, timeout);
- curl_easy_setopt(_curl, CURLOPT_ENCODING, "");
- curl_easy_setopt(_curl, CURLOPT_WRITEFUNCTION, write_callback_download);
- curl_easy_setopt(_curl, CURLOPT_WRITEDATA, &html);
- curl_easy_setopt(_curl, CURLOPT_FOLLOWLOCATION, 1L);
- curl_easy_setopt(_curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
- curl_easy_setopt(_curl, CURLOPT_SSL_VERIFYHOST, 0L);
- curl_easy_setopt(_curl, CURLOPT_SSL_VERIFYPEER, 0L);
- if (force_ipv4) curl_easy_setopt(_curl, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4);
- if (proxy_server) {
- curl_easy_setopt(_curl, CURLOPT_PROXY, proxy_server);
- curl_easy_setopt(_curl, CURLOPT_PROXYPORT, proxy_port);
- curl_easy_setopt(_curl, CURLOPT_PROXYTYPE, proxy_type);
- }
- if (user_agent) {
- curl_easy_setopt(_curl, CURLOPT_USERAGENT, user_agent);
- }
- else if (!_userAgent.empty()) {
- curl_easy_setopt(_curl, CURLOPT_USERAGENT, _userAgent.c_str());
- }
- res = curl_easy_perform(_curl);
- if (res == CURLE_OK) {
- curl_easy_getinfo(_curl, CURLINFO_RESPONSE_CODE, &statusCode);
- if (statusCode != 200) html.clear();
- } else {
- log_printf(0, "curl error: %s (%s)\n", curl_easy_strerror(res), url.c_str());
- }
- curl_easy_reset(_curl);
- if (outStatusCode) *outStatusCode = statusCode;
- return html;
- }
- void IBBS2chProxyHTML2Dat::setRequestHeaders(BBS2chProxyHttpHeaders &headers)
- {
- if (headers.has("User-Agent")) {
- _userAgent = headers.get("User-Agent");
- }
- }
- const std::string& IBBS2chProxyHTML2Dat::getKey()
- {
- return _threadKey;
- }
- std::string BBS2chProxyHTML2Dat5ch::generateDatFrom(int startFrom, time_t *lastModifiedOut, bool useCache, long *outStatusCode)
- {
- std::string tmpURL(_url);
- if (startFrom > 1) {
- std::ostringstream ss;
- ss << startFrom << "-n";
- tmpURL += ss.str();
- } else {
- tmpURL += "1-";
- }
- curl_easy_setopt(_curl, CURLOPT_COOKIE, "5chClassic=on");
- std::vector<char> html = getHtmlFromURL(tmpURL, outStatusCode);
- return html2dat(html, startFrom, lastModifiedOut, useCache);
- }
- std::string BBS2chProxyHTML2DatTalk::generateDatFrom(int startFrom, time_t *lastModifiedOut, bool useCache, long *outStatusCode)
- {
- if (!_cachedJson) {
- std::vector<char> json = getHtmlFromURL(_url, outStatusCode);
- if (json.empty()) return "";
- json.push_back(0);
- _cachedJson = json_parse_string(&json.front());
- }
- if (!_cachedJson) return "";
- return json2dat(_cachedJson, startFrom, lastModifiedOut, useCache);
- }
- std::string BBS2chProxyHTML2DatTalkHTML::generateDatFrom(int startFrom, time_t *lastModifiedOut, bool useCache, long *outStatusCode)
- {
- if (!_cachedJson) {
- std::vector<char> html = getHtmlFromURL(_url, outStatusCode);
- if (html.empty()) return "";
- html.push_back(0);
- const char *ptr = strstr(&html.front(), "id=\"__NEXT_DATA__\"");
- if (ptr) {
- ptr += strlen("id=\"__NEXT_DATA__\"");
- while (*ptr != '>' && *ptr != 0) ptr++;
- if (*ptr) {
- const char *end = strstr(++ptr, "</script>");
- if (end) {
- std::string jsonStr(ptr, end-ptr);
- _cachedJson = json_parse_string(jsonStr.c_str());
- }
- }
- }
- }
- if (!_cachedJson) return "";
- JSON_Value *threadData = json_object_dotget_value(json_object(_cachedJson), "props.pageProps.threadData");
- return json2dat(threadData, startFrom, lastModifiedOut, useCache);
- }
- std::string BBS2chProxyHTML2Dat5chItest::generateDatFrom(int startFrom, time_t *lastModifiedOut, bool useCache, long *outStatusCode)
- {
- if (!_cachedJson) {
- std::vector<char> json = getHtmlFromURL(_url, outStatusCode);
- if (json.empty()) return "";
- json.push_back(0);
- _cachedJson = json_parse_string(&json.front());
- }
- if (!_cachedJson) return "";
- return json2dat(_cachedJson, startFrom, lastModifiedOut, useCache);
- }
/*
 * Converts a page whose posts are marked up as "<dt>N ..." followed by "<dd>..."
 * into dat lines of the form  name<>mail<>date<>body<>title\n  (the title is only
 * filled in for post 1).
 *
 * html          Page bytes. The buffer is modified in place while parsing. Many scans
 *               use strstr()/strchr()/unbounded loops, so the buffer has to be
 *               NUL-terminated and well formed; this function does not check either.
 * startResNum   Number of the first post to extract.
 * lastModified  If non-NULL, receives the time parsed from the date field of the last
 *               extracted post; it stays 0 when that field cannot be parsed.
 * useCache      If true, the first extracted line must be identical to the line stored
 *               in _threadCache for _threadKey; otherwise "" is returned.
 *
 * Returns the concatenated dat lines, or "" when the title or the first post cannot be
 * found, when memory cannot be allocated, or when the cache check fails.
 */
- std::string BBS2chProxyHTML2Dat5ch::html2dat_old(std::vector<char> &html, int startResNum, time_t *lastModified, bool useCache)
- {
- char *ptr = &html.front();
- char *end = &html.back();
- std::string txt;
- int res = startResNum, i=0;
- char signature[32];
- char title[1024];
- int cachedSize = 0;
// On bbspink threads the <br> / double-space normalisation of the body is skipped (see below).
- bool bbspink = strstr(_threadKey.c_str(),"bbspink.com") ? true : false;
- 
// --- thread title: everything between <title> and </title> ---
- ptr = (char *)memmem_priv(ptr, end-ptr+1, "<title>", 7);
- if(!ptr) {
- return "";
- }
- ptr += 7;
// NOTE(review): the copy is not bounded by sizeof(title) or by the end of the buffer.
- while(1) {
- if(*ptr == '<') {
- if(!strncasecmp(ptr,"</title>",8)) {
- ptr += 8;
- break;
- }
- else title[i++] = *ptr++;
- }
- else title[i++] = *ptr++;
- }
- title[i] = 0;
- 
// Each post starts with "<dt>N "; locate the first requested one.
- snprintf(signature,32,"<dt>%d ",res);
- ptr = (char *)memmem_priv(ptr, end-ptr+1, signature, strlen(signature));
- if(!ptr) {
- return "";
- }
- 
// One allocation holds all scratch areas: body (65536), mail, name, date (1024 each)
// and encrypted (2048). NOTE(review): none of the copies below checks these sizes.
- unsigned char *buffer = (unsigned char *)malloc(65536+1024+1024+1024+2048);
- if(!buffer) {
- return "";
- }
- 
- unsigned char *body = buffer;
- char *mail = (char *)body + 65536;
- char *name = mail + 1024;
- char *date = name + 1024;
- char *encrypted = date + 1024;
- 
// --- one iteration per post ---
- while(ptr < end) {
- //fprintf(stderr,"%s\n",signature);
- std::string resData;
- i=0;
- mail[0] = 0;
// NOTE(review): the strstr() result is used without a NULL check.
- ptr = strstr(ptr,signature);
- ptr += strlen(signature);
// Move to the first tag after the post number and look at its first letter.
- while(*ptr != '<') ptr++;
- ptr++;
- const char *endStr;
- if(*ptr == 'a' || *ptr == 'A') {
- replay:
- // has mail
- while(*ptr != '"') ptr++;
- ptr++;
- if(!strncmp(ptr,"/cdn-cgi/l/email-protection#",28)) {
// The href is an obfuscated address: decode it, overwrite the bytes just before the
// current position with  <a href="mailto:<decoded>  and parse that link again.
- ptr += 28;
- while(*ptr != '"' && *ptr != 'X') encrypted[i++] = *ptr++;
- encrypted[i] = 0;
- i = decryptMail((unsigned char *)mail,encrypted);
- int reconstruct_len = *ptr == 'X' ? i + 15 : i + 16;
- ptr -= reconstruct_len;
- char *start = ptr;
- memcpy(ptr, "<a href=\"mailto:", 16);
- ptr += 16;
- memcpy(ptr, mail, i);
- ptr = start;
- i=0;
- goto replay;
- }
- else {
// Plain link: the mail field is the href value without a leading "mailto:".
- if(!strncmp(ptr,"mailto:",7)) ptr += 7;
- while(*ptr != '"') mail[i++] = *ptr++;
- mail[i] = 0;
- }
- endStr = "</a>";
- }
- else if(*ptr == 'b') {
// The name starts directly with a tag beginning with 'b'; nothing follows "</b>".
- endStr = NULL;
- }
- else {
// Any other wrapper is expected to close with </font> after the name.
- endStr = "</font>";
- }
- 
// Position ptr on the first character of the name.
- if(endStr) {
- ptr = strstr(ptr,"<b>");
- ptr += 3;
- }
- else {
- ptr = strchr(ptr,'>');
- ptr++;
- }
- 
// --- name: copied up to "</b>" (followed by endStr when there is one) ---
- i=0;
- while(1) {
- if(*ptr == '<') {
- if(!strncasecmp(ptr,"</b>",4) && (!endStr || !strncasecmp(ptr+4,endStr,strlen(endStr)))) {
- ptr += 4;
- if(endStr) ptr += strlen(endStr);
- break;
- }
- else if(!strncmp(ptr,"<span class=\"__cf_email__\"",26)) {
// An obfuscated address inside the name: decode data-cfemail straight into name
// and continue after the following </script>.
- int j=0;
- ptr = strstr(ptr,"data-cfemail=\"");
- ptr += 14;
- while(*ptr != '"') encrypted[j++] = *ptr++;
- encrypted[j] = 0;
- j = decryptMail((unsigned char *)name+i,encrypted);
- i += j;
- ptr = strstr(ptr,"</script>");
- ptr += 9;
- }
- else name[i++] = *ptr++;
- }
- else name[i++] = *ptr++;
- }
- 
- resData.append(name, i);
- resData.append("<>");
- if(mail[0]) resData.append(mail);
- resData.append("<>");
- 
// --- date: the two bytes after the name markup are skipped, then everything up to <dd> is copied ---
- ptr += 2;
- i=0;
- while(1) {
- if(*ptr == '<') {
- if(!strncasecmp(ptr,"<dd>",4)) {
- ptr += 4;
- break;
- }
- else if(!strncmp(ptr,"<a href=\"javascript:be(",23)) {
// A link of the form <a href="javascript:be(ID)...?RANK</a> is rewritten to "BE:ID-RANK".
- memcpy(date+i,"BE:",3);
- ptr += 23;
- i += 3;
- while(*ptr != ')') date[i++] = *ptr++;
- date[i++] = '-';
- ptr = strchr(ptr,'?');
- ptr++;
- char *tmp = strstr(ptr,"</a>");
- memcpy(date+i,ptr,tmp-ptr);
- i += tmp-ptr;
- ptr = tmp + 4;
- }
- else date[i++] = *ptr++;
- }
- else date[i++] = *ptr++;
- }
- 
- resData.append(date, i);
- resData.append("<>");
- 
// --- body ---
- i=0;
- while(1) {
- if(*ptr == '<') {
- if(!strncasecmp(ptr,"<br><br>\n",9)) {
// Normal end of a post.
- ptr += 9;
- break;
- }
- else if(!strncasecmp(ptr,"<dt>",4) || !strncasecmp(ptr,"</dl>",5)) {
// The next post or the end of the list was reached first: drop trailing newlines and stop.
- while(i>0 &&body[i-1] == '\n') i--;
- break;
- }
- else if(!strncmp(ptr,"<span class=\"__cf_email__\"",26) || !strncmp(ptr,"<a class=\"__cf_email__\"",23)) {
// Obfuscated address inside the body: decode it in place of the markup.
- int j=0;
- ptr = strstr(ptr,"data-cfemail=\"");
- ptr += 14;
- while(*ptr != '"') encrypted[j++] = *ptr++;
- encrypted[j] = 0;
- j = decryptMail(body+i,encrypted);
- i += j;
- ptr = strstr(ptr,"</script>");
- ptr += 9;
- }
- else if(!strncmp(ptr,"<a href=\"http",13)) {
// http links are reduced to their link text.
- ptr = strchr(ptr,'>');
- ptr++;
- char *link = ptr;
- ptr = strstr(link,"</a>");
- memcpy(body+i,link,ptr-link);
- i += ptr-link;
- ptr += 4;
- }
- else if(!strncmp(ptr,"<img src=\"",10)) {
// Images are replaced by their src value. For the hosts listed below the part before
// the first '/' is replaced by "sssp:".
- ptr += 10;
- char *img = ptr;
- ptr = strstr(img,"\">");
- memcpy(body+i,img,ptr-img);
- if(memmem_priv(img,ptr-img,"/img.2ch.net",12) || memmem_priv(img,ptr-img,"/img.5ch.net",12) || memmem_priv(img,ptr-img,"/o.8ch.net",10) || memmem_priv(img,ptr-img,"/o.5ch.net",10)) {
- int length = ptr-img;
- while(*img != '/') {
- img++;
- length--;
- }
- memcpy(body+i,"sssp:",5);
- memcpy(body+i+5,img,length);
- i += length + 5;
- }
- else i += ptr-img;
- ptr += 2;
- }
- else if(!bbspink && !strncmp(ptr,"<br>",4)) {
// A <br> that directly follows "<br> " is written with a leading space, so that
// consecutive line breaks stay separated by spaces.
- if(i>5 && !strncmp((char *)body+i-5,"<br> ",5)) {
- memcpy(body+i," <br>",5);
- i += 5;
- }
- else {
- memcpy(body+i,"<br>",4);
- i += 4;
- }
- ptr += 4;
- }
- else body[i++] = *ptr++;
- }
- else if(!bbspink && *ptr == ' ') {
// Of two consecutive spaces the first one is dropped.
- if(*(ptr+1) == ' ') ptr++;
- else body[i++] = *ptr++;
- }
- else body[i++] = *ptr++;
- }
- 
- resData.append((const char *)body ,i);
- resData.append("<>");
- if(res == 1) resData.append(title);
- resData.append("\n");
- 
// --- cache validation, done once for the first extracted post ---
// The entry for this thread is taken out of the cache; the line just built must match
// the stored one exactly, otherwise the conversion is abandoned.
- if(useCache && res == startResNum) {
- PBBS2chProxyThreadInfo info = _threadCache->pop(_threadKey);
- bool hit = false;
- if(info) {
- log_printf(5,"cache hit");
- if(info->cachedData.size() == resData.size()) {
- log_printf(5,"... size match");
- if(info->cachedData == resData) {
- log_printf(5,"... content match");
- hit = true;
- cachedSize = info->cachedSize - resData.size();
- }
- }
- log_printf(5,"\n");
- }
- if(!hit) {
- free(buffer);
- return "";
- }
- }
- 
- txt += resData;
- res++;
- while(*ptr == '\n' || *ptr == '\r') ptr++;
- snprintf(signature,32,"<dt>%d ",res);
- if(!memmem_priv(ptr, end-ptr+1, signature, strlen(signature))) {
// No further post: store the last post number, the accumulated size and the last line
// in the cache, then derive lastModified from the last post's date field.
- PBBS2chProxyThreadInfo info(new BBS2chProxyThreadInfo());
- info->lastResNum = res-1;
- info->cachedSize = txt.size()+cachedSize;
- info->cachedData = resData;
- _threadCache->set(_threadKey, info);
- log_printf(5,"cached thread %s (%ld bytes)\n",_threadKey.c_str(),resData.size());
- 
- if(lastModified) {
// Expected layout: year/month/day, any text up to a space, then hour:minute:second.
// Every "break" below leaves the post loop with *lastModified still 0.
// This local ptr shadows the parsing cursor of the enclosing function.
- *lastModified = 0;
- char formattedDate[256];
- char *ptr;
- ptr = date;
- int year = strtol(ptr,&ptr,10);
- if(*ptr != '/') break;
- ptr++;
- int month = strtol(ptr,&ptr,10);
- if(*ptr != '/') break;
- ptr++;
- int day = strtol(ptr,&ptr,10);
- if(!*ptr) break;
- while(*ptr != ' ' && *ptr != 0) ptr++;
- if(!*ptr) break;
- ptr++;
- int hour = strtol(ptr,&ptr,10);
- if(*ptr != ':') break;
- ptr++;
- int minutes = strtol(ptr,&ptr,10);
- if(*ptr != ':') break;
- ptr++;
- int seconds = strtol(ptr,&ptr,10);
- if(!(month>0 && month<13) || !(day>0 && day<32)) break;
// Two-digit years are taken to be in the 2000s.
- if(year < 100) year += 2000;
// The timestamp is interpreted as +0900 (JST).
- #if LIBCURL_VERSION_NUM >= 0x070c02 /* curl 7.12.2 or later */
- snprintf(formattedDate, 256, "%d%02d%02d %02d:%02d:%02d +0900", year, month, day, hour, minutes, seconds);
- *lastModified = curl_getdate(formattedDate, NULL);
- #else
- snprintf(formattedDate,256,"%d/%d/%d %02d:%02d:%02d JST",year,month,day,hour,minutes,seconds);
- struct tm time = {0};
- strptime(formattedDate,threadTimestampFmt,&time);
- *lastModified = mktime(&time);
- #endif
- }
- //fprintf(stderr,"not found,%ld\n",end-ptr+1);
- break;
- }
- }
- 
- free(buffer);
- return txt;
- }
/*
 * Converts a read.cgi HTML page into dat lines of the form
 * name<>mail<>date<>body<>title\n  (the title is only filled in for post 1).
 *
 * Three page layouts are handled:
 *   - pages containing  id="threadtitle">  (isNewHTML; posts are <article id="N"> or <div id="N">),
 *   - pages containing  <h1 class="title">  (posts carry class="post" id="N"),
 *   - anything else is handed to html2dat_old().
 *
 * html          Page bytes. The buffer is modified in place while parsing (NUL bytes are
 *               written into it and some links are rewritten). Many scans use
 *               strstr()/strchr()/unbounded loops, so the buffer has to be NUL-terminated
 *               and well formed; this function does not check either.
 * startResNum   Number of the first post to extract.
 * lastModified  If non-NULL, receives the time parsed from the date field of the last
 *               extracted post; it stays 0 when that field cannot be parsed.
 * useCache      If true, the first extracted line must be identical to the line stored
 *               in _threadCache for _threadKey; otherwise "" is returned.
 *
 * Returns the concatenated dat lines; missing post numbers are filled with "broken"
 * placeholder lines. Returns "" when the page is empty, the first post cannot be found,
 * memory cannot be allocated, or the cache check fails.
 */
- std::string BBS2chProxyHTML2Dat5ch::html2dat(std::vector<char> &html, int startResNum, time_t *lastModified, bool useCache)
- {
// NOTE(review): front()/back() are taken before the empty() check a few lines below.
- char *ptr = &html.front();
- char *end = &html.back();
- std::string txt;
- int res = startResNum, i=0;
- char signature[64];
- char title[1024];
- int cachedSize = 0;
- char signatureTag[32];
- char closeTag[48];
- int closeTagLen;
- bool isNewHTML = false;
- if (html.empty()) return "";
- 
// --- layout detection and thread title ---
- ptr = (char *)memmem_priv(ptr, end-ptr+1, " id=\"threadtitle\">", 18);
- if (ptr) {
- isNewHTML = true;
// Posts are <article id="N"> when such a tag exists on the page, <div id="N"> otherwise.
- char *ptr2 = (char *)memmem_priv(ptr, end-ptr+1, "<article id=\"", 13);
- if (!ptr2) {
- strcpy(signatureTag, "div");
- } else {
- strcpy(signatureTag, "article");
- }
// Walk back to the '<' of the title element and build its closing tag "</NAME>" from the
// text between that '<' and the id attribute.
- const char *tmp = ptr;
- while (*tmp != '<') tmp--;
- memcpy(closeTag+2, tmp+1, ptr-tmp-1);
- closeTag[0] = '<';
- closeTag[1] = '/';
- closeTag[ptr-tmp+1] = '>';
- closeTag[ptr-tmp+2] = 0;
- ptr += 18;
// The title runs up to the closing tag or the first newline.
// NOTE(review): the copy is not bounded by sizeof(title).
- while (1) {
- if (*ptr == '<') {
- if (!strncasecmp(ptr, closeTag, strlen(closeTag))) {
- ptr += strlen(closeTag);
- break;
- }
- else title[i++] = *ptr++;
- }
- else if(*ptr == '\n') break;
- else title[i++] = *ptr++;
- }
- title[i] = 0;
- snprintf(signature, 32, "<%s id=\"%d\"", signatureTag, res);
- }
- else {
- ptr = &html.front();
- ptr = (char *)memmem_priv(ptr, end-ptr+1, "<h1 class=\"title\">", 18);
- if(!ptr) {
- return html2dat_old(html, startResNum, lastModified, useCache);
- }
- else {
// Find the tag that carries class="post": the attribute's leading space is temporarily
// overwritten with NUL so that the text from the preceding '<' (for example "<div",
// including the '<') can be copied into signatureTag; the space is then restored.
- char *ptr2 = (char *)memmem_priv(ptr, end-ptr+1, " class=\"post\"", 13);
- if(ptr2) {
- char *tmp = ptr2;
- *ptr2 = 0;
- while(*ptr2 != '<') ptr2--;
- strcpy(signatureTag, ptr2);
- *tmp = ' ';
- }
- else {
- return "";
- }
- /*char *ptr2 = (char *)memmem_priv(ptr, end-ptr+1, "<dl class=\"post\"", 16);
- if(ptr2) {
- return html2dat_pink(html, startResNum, lastModified, useCache);
- }*/
- }
- 
- ptr += 18;
// The title runs up to </h1> or the first newline.
- while(1) {
- if(*ptr == '<') {
- if(!strncasecmp(ptr,"</h1>",5)) {
- ptr += 5;
- break;
- }
- else title[i++] = *ptr++;
- }
- else if(*ptr == '\n') break;
- else title[i++] = *ptr++;
- }
- title[i] = 0;
- 
- snprintf(signature,32,"%s class=\"post\" id=\"%d\"",signatureTag,res);
- }
// Locate the opening tag of the first requested post.
- ptr = (char *)memmem_priv(ptr, end-ptr+1, signature, strlen(signature));
- if(!ptr) {
- return "";
- }
- 
// One allocation holds all scratch areas: body (65536), mail, name, date (1024 each)
// and encrypted (2048). NOTE(review): none of the copies below checks these sizes.
- unsigned char *buffer = (unsigned char *)malloc(65536+1024+1024+1024+2048);
- if(!buffer) {
- return "";
- }
- 
- unsigned char *body = buffer;
- char *mail = (char *)body + 65536;
- char *name = mail + 1024;
- char *date = name + 1024;
- char *encrypted = date + 1024;
- 
// --- one iteration per post ---
- while(ptr < end) {
- //fprintf(stderr,"%s\n",signature);
- std::string resData;
- i=0;
- mail[0] = 0;
// Find the element holding the poster name.
- if (isNewHTML) ptr = strstr(ptr," class=\"postusername\"><b>");
- else ptr = strstr(ptr," class=\"name\"><b>");
- if(ptr) {
// Cut the buffer at the class attribute (a NUL is written and not restored), walk back to
// '<' and use the tag name to build the closing tag that ends the name element.
- char *tmp = ptr;
- *ptr = 0;
- while(*ptr != '<') ptr--;
- snprintf(closeTag, 48, "</%s>", ptr+1);
- closeTagLen = strlen(closeTag);
- if (isNewHTML) ptr = tmp + 25;
- else ptr = tmp + 17;
- }
- else {
- break;
- }
- 
// endStr is the exact text that terminates the name.
- char endStr[64];
- if(!strncmp(ptr,"<a ", 3)) {
// The name is wrapped in a link; its href provides the mail field.
- char *tmp = ptr;
- while (*tmp != '>') tmp++;
- ptr = (char *)memmem_priv(ptr, tmp-ptr, "href=\"", 6);
- if (!ptr) {
// A link without href: there is no mail field, continue after the tag.
- ptr = tmp;
- goto mailEnd;
- }
- replay:
- // has mail
- while(*ptr != '"') ptr++;
- ptr++;
- if(!strncmp(ptr,"/cdn-cgi/l/email-protection#",28)) {
// The href is an obfuscated address: decode it, overwrite the bytes just before the
// current position with  <a href="mailto:<decoded>  and parse that link again.
- ptr += 28;
- while(*ptr != '"' && *ptr != 'X') encrypted[i++] = *ptr++;
- encrypted[i] = 0;
- i = decryptMail((unsigned char *)mail,encrypted);
- int reconstruct_len = *ptr == 'X' ? i + 15 : i + 16;
- ptr -= reconstruct_len;
- char *start = ptr;
- memcpy(ptr, "<a href=\"mailto:", 16);
- ptr += 16;
- memcpy(ptr, mail, i);
- ptr = start;
- i=0;
- goto replay;
- }
- else {
// Plain href: the mail field is its value without a leading "mailto:". Links nested
// inside the value are reduced to their link text.
- if(!strncmp(ptr,"mailto:",7)) ptr += 7;
- while(1) {
- if(*ptr == '<' && !strncmp(ptr,"<a href=\"",9)) {
- ptr = strchr(ptr,'>');
- ptr++;
- char *link = ptr;
- ptr = strstr(link,"</a>");
- memcpy(mail+i,link,ptr-link);
- i += ptr-link;
- ptr += 4;
- }
- else if(*ptr == '"') break;
- else mail[i++] = *ptr++;
- }
- //while(*ptr != '"') mail[i++] = *ptr++;
- mail[i] = 0;
- }
- mailEnd:
- snprintf(endStr,64,"</a></b>%s",closeTag);
// Skip the rest of the opening <a ...> tag.
- while(*ptr != '>') ptr++;
- ptr++;
- }
- /* we do not have to handle this special case because read.cgi on bbspink doesn't
- emit font tags anymore and it conflicts with text decorations using "melon point" */
- /*else if(!strncmp(ptr,"<font",5)) {
- snprintf(endStr,64,"</font></b>%s",closeTag);
- while(*ptr != '>') ptr++;
- ptr++;
- }*/
- else {
- snprintf(endStr,64,"</b>%s",closeTag);
- }
- 
// --- name: copied up to endStr ---
- i=0;
- while(1) {
- if(*ptr == '<') {
- if(!strncmp(ptr,endStr,strlen(endStr))) {
- ptr += strlen(endStr);
- break;
- }
- else if(!strncmp(ptr,"<span class=\"__cf_email__\"",26)) {
// Obfuscated address inside the name: decode data-cfemail straight into name and
// continue after the following </script>.
- int j=0;
- ptr = strstr(ptr,"data-cfemail=\"");
- ptr += 14;
- while(*ptr != '"') encrypted[j++] = *ptr++;
- encrypted[j] = 0;
- j = decryptMail((unsigned char *)name+i,encrypted);
- i += j;
- ptr = strstr(ptr,"</script>");
- ptr += 9;
- }
- else if(!strncmp(ptr,"<a href=\"",9)) {
// Links inside the name are reduced to their link text.
- ptr = strchr(ptr,'>');
- ptr++;
- char *link = ptr;
- ptr = strstr(link,"</a>");
- memcpy(name+i,link,ptr-link);
- i += ptr-link;
- ptr += 4;
- }
- else name[i++] = *ptr++;
- }
- else name[i++] = *ptr++;
- }
- 
- resData.append(name, i);
- resData.append("<>");
- if(mail[0]) resData.append(mail);
- resData.append("<>");
- 
// --- date: contents of the element with class="date" ---
- ptr = strstr(ptr," class=\"date\">");
- if(ptr) {
// Same technique as for the name: cut at the attribute, walk back, build the closing tag.
- char *tmp = ptr;
- *ptr = 0;
- while(*ptr != '<') ptr--;
- snprintf(closeTag, 48, "</%s>", ptr+1);
- closeTagLen = strlen(closeTag);
- ptr = tmp + 14;
- }
- else {
- break;
- }
- 
- i=0;
- while(1) {
- if(*ptr == '<') {
- if(!strncasecmp(ptr,closeTag,closeTagLen)) {
- ptr += closeTagLen;
- break;
- }
- else date[i++] = *ptr++;
- }
- else date[i++] = *ptr++;
- }
- 
// An element with a class starting with "uid" directly after the date is appended to the
// date field, separated by one space.
- if(!strncmp(ptr,"<div class=\"uid",15) || !strncmp(ptr,"<span class=\"uid",16)) {
- char *tmp = ptr+1;
- while(*ptr != ' ') ptr++;
- *ptr = 0;
- snprintf(closeTag, 48, "</%s>", tmp);
- closeTagLen = strlen(closeTag);
- ptr += 11;
- while(*ptr != '>') ptr++;
- ptr++;
- date[i++] = ' ';
- while(1) {
- if(*ptr == '<') {
- if(!strncasecmp(ptr,closeTag,closeTagLen)) {
- ptr += closeTagLen;
- break;
- }
- else date[i++] = *ptr++;
- }
- else date[i++] = *ptr++;
- }
- }
- if (isNewHTML && !strncmp(ptr, "</span>", 7)) ptr += 7;
- 
// An element with a class starting with "be" that links to //be.2ch.net/user/ID or
// //be.5ch.net/user/ID is appended as " BE:ID-RANK", where RANK is the link text after '?'.
- if(!strncmp(ptr,"<div class=\"be",14) || !strncmp(ptr,"<span class=\"be",15)) {
- ptr += 14;
- while(*ptr != '>') ptr++;
- ptr++;
- if(!strncmp(ptr,"<a href=\"",9)) {
- ptr += 9;
- while(*ptr != '/' && *ptr != '"') ptr++;
- if(*ptr == '/' && (!strncmp(ptr,"//be.2ch.net/user/",18) || !strncmp(ptr,"//be.5ch.net/user/",18))) {
- memcpy(date+i," BE:",4);
- i += 4;
- ptr += 18;
- while(*ptr != '"') date[i++] = *ptr++;
- date[i++] = '-';
- ptr = strchr(ptr,'?');
- ptr++;
- char *tmp = strstr(ptr,"</a>");
- memcpy(date+i,ptr,tmp-ptr);
- i += tmp-ptr;
- ptr = tmp + 4;
- }
- }
- }
- 
- resData.append(date, i);
- resData.append("<>");
- 
// --- locate the body and decide which closing text ends it (closeTag / closeTagLen) ---
- if (isNewHTML) {
- ptr = strstr(ptr," class=\"post-content\">");
- if (!ptr) {
- break;
- }
- else {
// The tag name of the post-content element is read by walking back to '<'.
- char *tmp = ptr;
- char postContentTag[32];
- while (*tmp != '<') tmp--;
- memcpy(postContentTag, tmp+1, ptr-tmp-1);
- postContentTag[ptr-tmp-1] = 0;
- ptr += 22;
// A body wrapped in <span class="AA"> ends with the span's closing tag as well.
- if (!strncasecmp(ptr, "<span class=\"AA\">", 17)) {
- snprintf(closeTag, 48, "</span></%s>", postContentTag);
- closeTagLen = strlen(closeTag);
- ptr += 17;
- }
- else {
- snprintf(closeTag, 48, "</%s>", postContentTag);
- closeTagLen = strlen(closeTag);
- }
- }
- }
- else if(!strcmp(signatureTag,"<div")) {
// <div>-based posts keep the body in <div class="message">, optionally wrapped in
// <span class="escaped"> and <span class="AA">.
- ptr = strstr(ptr,"<div class=\"message\">");
- if(!ptr) {
- break;
- }
- else {
- ptr += 21;
- if(!strncasecmp(ptr,"<span class=\"escaped\">",22)) {
- if(!strncasecmp(ptr+22,"<span class=\"AA\">",17)) {
- strcpy(closeTag,"</span></span></div>");
- closeTagLen = 20;
- ptr += 22+17;
- }
- else {
- strcpy(closeTag,"</span></div>");
- closeTagLen = 13;
- ptr += 22;
- }
- }
- else {
- strcpy(closeTag,"</div>");
- closeTagLen = 6;
- }
- }
- }
- else {
// Remaining layout: the body is in <dd class="thread_in">.
- ptr = strstr(ptr,"<dd class=\"thread_in\">");
- if(!ptr) {
- break;
- }
- strcpy(closeTag,"</dd>");
- closeTagLen = 5;
- ptr += 22;
- }
// --- body: copied up to closeTag ---
- i=0;
- while(1) {
- if(*ptr == '<') {
- if(!strncasecmp(ptr,closeTag,closeTagLen)) {
- ptr += closeTagLen;
- break;
- }
- else if(!strncmp(ptr,"<span class=\"__cf_email__\"",26) || !strncmp(ptr,"<a class=\"__cf_email__\"",23)) {
// Obfuscated address inside the body: decode it in place of the markup.
- int j=0;
- ptr = strstr(ptr,"data-cfemail=\"");
- ptr += 14;
- while(*ptr != '"') encrypted[j++] = *ptr++;
- encrypted[j] = 0;
- j = decryptMail(body+i,encrypted);
- i += j;
- ptr = strstr(ptr,"</script>");
- ptr += 9;
- }
- else if(!strncmp(ptr,"<a ",3)) {
- char *tmp = strchr(ptr,'>');
- char *href = (char *)memmem_priv(ptr,tmp-ptr,"href=\"",6);
- char *link = tmp+1;
// Links into test/read.cgi/ whose text starts with the anchor prefix keep their opening
// tag (minus any class attribute); every other link is reduced to its link text.
// NOTE(review): the literal below is compared over 8 bytes but is only 2 characters long
// here; it was presumably "&gt;&gt;" before HTML entities were decoded during extraction.
// Confirm against the upstream source.
- if(href && !strncmp(link,">>",8) && memmem_priv(href,link-href,"test/read.cgi/",14)) {
- while(ptr < link) {
- if(!strncmp(ptr," class=\"",8)) {
- ptr += 8;
- while(*ptr != '"' && *ptr != '>') ptr++;
- if(*ptr == '"') ptr++;
- }
- else body[i++] = *ptr++;
- }
- }
- else {
- ptr = strstr(link,"</a>");
- memcpy(body+i,link,ptr-link);
- i += ptr-link;
- ptr += 4;
- }
- }
- else if(!strncmp(ptr,"<img src=\"",10)) {
// Images are replaced by their src value. For the hosts listed below the part before
// the first '/' is replaced by "sssp:".
- ptr += 10;
- char *img = ptr;
- ptr = strstr(img,"\">");
- memcpy(body+i,img,ptr-img);
- if(memmem_priv(img,ptr-img,"/img.2ch.net",12) || memmem_priv(img,ptr-img,"/img.5ch.net",12) || memmem_priv(img,ptr-img,"/o.8ch.net",10) || memmem_priv(img,ptr-img,"/o.5ch.net",10)) {
- int length = ptr-img;
- while(*img != '/') {
- img++;
- length--;
- }
- memcpy(body+i,"sssp:",5);
- memcpy(body+i+5,img,length);
- i += length + 5;
- }
- else i += ptr-img;
- ptr += 2;
- }
- else if(!strncmp(ptr,"<br>",4)) {
// A <br> that directly follows "<br> " is written with a leading space, so that
// consecutive line breaks stay separated by spaces.
- if(i>5 && !strncmp((char *)body+i-5,"<br> ",5)) {
- memcpy(body+i," <br>",5);
- i += 5;
- }
- else {
- memcpy(body+i,"<br>",4);
- i += 4;
- }
- ptr += 4;
- }
- else body[i++] = *ptr++;
- }
- else body[i++] = *ptr++;
- }
- 
- resData.append((const char *)body, i);
- resData.append("<>");
- if(res == 1) resData.append(title);
- resData.append("\n");
- 
// --- cache validation, done once for the first extracted post ---
// The entry for this thread is taken out of the cache; the line just built must match
// the stored one exactly, otherwise the conversion is abandoned.
- if(useCache && res == startResNum) {
- PBBS2chProxyThreadInfo info = _threadCache->pop(_threadKey);
- bool hit = false;
- if(info) {
- log_printf(5,"cache hit");
- if(info->cachedData.size() == resData.size()) {
- log_printf(5,"... size match");
- if(info->cachedData == resData) {
- log_printf(5,"... content match");
- hit = true;
- cachedSize = info->cachedSize - resData.size();
- }
- }
- log_printf(5,"\n");
- }
- if(!hit) {
- free(buffer);
- return "";
- }
- }
- 
- txt += resData;
- res++;
- while(*ptr == '\n' || *ptr == '\r') ptr++;
// --- find the opening tag of the next post (any id, the number is read afterwards) ---
- if (isNewHTML) snprintf(signature, 64, "<%s id=\"", signatureTag);
- else snprintf(signature,64,"%s class=\"post\" id=\"",signatureTag);
- while (1) {
- ptr = (char *)memmem_priv(ptr, end-ptr+1, signature, strlen(signature));
- if (!isNewHTML || !ptr) break;
// In the new layout only tags that also carry a data-date attribute count as posts;
// other matches are skipped.
- char *tmp = ptr;
- while (*tmp != '>') tmp++;
- tmp = (char *)memmem_priv(ptr, tmp-ptr, "data-date", 9);
- if (tmp) break;
- ptr++;
- }
- if(ptr) {
// If post numbers were skipped, emit one placeholder line per missing number. A number
// lower than the expected one is treated as the end of the thread.
- int next = atoi(ptr+strlen(signature));
- if(next >= res) {
- while(next > res) {
- txt += "broken<><>broken<> broken <>\n";
- res++;
- }
- }
- else ptr = NULL;
- }
- if(!ptr) {
// No further post: store the last post number, the accumulated size and the last line
// in the cache, then derive lastModified from the last post's date field.
- PBBS2chProxyThreadInfo info(new BBS2chProxyThreadInfo());
- info->lastResNum = res-1;
- info->cachedSize = txt.size()+cachedSize;
- info->cachedData = resData;
- _threadCache->set(_threadKey, info);
- log_printf(5,"cached thread %s (%ld bytes)\n",_threadKey.c_str(),resData.size());
- 
- if(lastModified) {
// Expected layout: year/month/day, any text up to a space, then hour:minute:second.
// Every "break" below leaves the post loop with *lastModified still 0.
// This local ptr shadows the parsing cursor of the enclosing function.
- *lastModified = 0;
- char formattedDate[256];
- char *ptr;
- ptr = date;
- int year = strtol(ptr,&ptr,10);
- if(*ptr != '/') break;
- ptr++;
- int month = strtol(ptr,&ptr,10);
- if(*ptr != '/') break;
- ptr++;
- int day = strtol(ptr,&ptr,10);
- if(!*ptr) break;
- while(*ptr != ' ' && *ptr != 0) ptr++;
- if(!*ptr) break;
- ptr++;
- int hour = strtol(ptr,&ptr,10);
- if(*ptr != ':') break;
- ptr++;
- int minutes = strtol(ptr,&ptr,10);
- if(*ptr != ':') break;
- ptr++;
- int seconds = strtol(ptr,&ptr,10);
- if(!(month>0 && month<13) || !(day>0 && day<32)) break;
// Two-digit years are taken to be in the 2000s.
- if(year < 100) year += 2000;
// The timestamp is interpreted as +0900 (JST).
- #if LIBCURL_VERSION_NUM >= 0x070c02 /* curl 7.12.2 or later */
- snprintf(formattedDate, 256, "%d%02d%02d %02d:%02d:%02d +0900", year, month, day, hour, minutes, seconds);
- *lastModified = curl_getdate(formattedDate, NULL);
- #else
- snprintf(formattedDate,256,"%d/%d/%d %02d:%02d:%02d JST",year,month,day,hour,minutes,seconds);
- struct tm time = {0};
- strptime(formattedDate,threadTimestampFmt,&time);
- *lastModified = mktime(&time);
- #endif
- }
- //fprintf(stderr,"not found,%ld\n",end-ptr+1);
- break;
- }
- }
- 
- free(buffer);
- return txt;
- }
- std::string BBS2chProxyHTML2DatTalk::json2dat(JSON_Value *json, int startFrom, time_t *lastModifiedOut, bool useCache)
- {
- std::string out;
- if (!json || json_type(json) != JSONObject) {
- return "";
- }
- JSON_Object *root = json_object(json);
- const char *title = json_object_dotget_string(root, "data.title");
- const char *quoteSource = json_object_dotget_string(root, "data.quote_source");
- JSON_Array *comments = json_object_dotget_array(root, "data.comments");
- if (!title || !comments) {
- return "";
- }
- if (startFrom < 1) startFrom = 1;
- int prevNumber = startFrom - 1;
- time_t lastModified = 0;
- size_t cachedSize = 0;
- std::string lastLine;
- for (size_t i=0, length=json_array_get_count(comments); i<length; i++) {
- std::stringstream line;
- JSON_Object *comment = json_array_get_object(comments, i);
- if (!comment) continue;
- int number = json_object_get_number(comment, "number");
- if (number < startFrom) continue;
- const char *name = json_object_dotget_string(comment, "writer.name");
- const char *trip = json_object_dotget_string(comment, "writer.trip");
- const char *slip = json_object_dotget_string(comment, "writer.slip");
- const char *id = json_object_dotget_string(comment, "writer.id");
- time_t timestamp = json_object_get_number(comment, "timestamp");
- const char *body = json_object_get_string(comment, "body");
- if (timestamp > lastModified) lastModified = timestamp;
- for (int j=prevNumber+1; j<number; j++) {
- out += "broken<><>broken<> broken <>\n";
- }
- if (name) {
- std::string tmp(name);
- escapeForHTML(tmp);
- line << tmp;
- if (trip) line << "</b>◆" << trip << "<b>";
- if (slip) line << " </b>(" << slip << ")<b>";
- }
- else line << "削除";
- line << "<><>"; //mail cannot be obtained from json!
- if (timestamp) {
- char dateStr[256] = "";
- struct tm timestamp_tm = {0};
- timestamp += 32400;
- gmtime_r(×tamp, ×tamp_tm);
- strftime(dateStr, 256, "%Y/%m/%d(", ×tamp_tm);
- line << dateStr << wdays[timestamp_tm.tm_wday] << ") ";
- strftime(dateStr, 256, "%H:%M:%S", ×tamp_tm);
- line << dateStr;
- if (id) {
- line << " ID:" << id;
- }
- }
- else line << "削除";
- line << "<>";
- if (body) {
- std::string tmp(body);
- escapeForHTML(tmp);
- replaceAll(tmp, "\n", " <br> ");
- line << " " << tmp;
- if (number == 1 && quoteSource) {
- line << " <br> <br> 出典 " << quoteSource;
- }
- line << " ";
- }
- else line << "削除";
- line << "<>";
- if (number == 1) {
- std::string tmp(title);
- escapeForHTML(tmp);
- line << tmp;
- }
- line << "\n";
- prevNumber = number;
- char *lineSJIS = convertUTF8ToShiftJISWithNCR(line.str().c_str(), line.str().size());
- if (lineSJIS) {
- lastLine = lineSJIS;
- out += lastLine;
- free(lineSJIS);
- } else {
- lastLine = "broken<><>broken<> broken <>\n";
- out += lastLine;
- }
- if (useCache && startFrom == number) {
- PBBS2chProxyThreadInfo info = _threadCache->pop(_threadKey);
- bool hit = false;
- if (info) {
- log_printf(5, "cache hit");
- if (info->cachedData.size() == lastLine.size()) {
- log_printf(5, "... size match");
- if (info->cachedData == lastLine) {
- log_printf(5, "... content match");
- hit = true;
- cachedSize = info->cachedSize - lastLine.size();
- }
- }
- log_printf(5, "\n");
- }
- if (!hit) {
- return "";
- }
- }
- }
- if (!lastLine.empty()) {
- PBBS2chProxyThreadInfo info(new BBS2chProxyThreadInfo());
- info->lastResNum = prevNumber;
- info->cachedSize = out.size() + cachedSize;
- info->cachedData = lastLine;
- _threadCache->set(_threadKey, info);
- log_printf(5, "cached thread %s (%ld bytes)\n", _threadKey.c_str(), lastLine.size());
- }
- if (lastModifiedOut) *lastModifiedOut = lastModified;
- return out;
- }
- std::string BBS2chProxyHTML2Dat5chItest::json2dat(JSON_Value *json, int startFrom, time_t *lastModifiedOut, bool useCache)
- {
- std::string out;
- if (!json || json_type(json) != JSONObject) {
- return "";
- }
- JSON_Object *root = json_object(json);
- JSON_Array *threadMeta = json_object_get_array(root, "thread");
- time_t lastModified = json_array_get_number(threadMeta, 0);
- const char *boardAndKey = json_array_get_string(threadMeta, 3);
- const char *title = json_array_get_string(threadMeta, 5);
- JSON_Array *comments = json_object_get_array(root, "comments");
- if (!title || !*title || !comments) {
- return "";
- }
- if (startFrom < 1) startFrom = 1;
- int prevNumber = startFrom - 1;
- size_t cachedSize = 0;
- std::string lastLine;
- for (size_t i=0, length=json_array_get_count(comments); i<length; i++) {
- std::stringstream line;
- JSON_Array *comment = json_array_get_array(comments, i);
- if (!comment) continue;
- int number = json_array_get_number(comment, 0);
- if (number < startFrom) continue;
- const char *name = json_array_get_string(comment, 1);
- const char *mail = json_array_get_string(comment, 2);
- const char *date = json_array_get_string(comment, 3);
- const char *id = json_array_get_string(comment, 4);
- const char *be = json_array_get_string(comment, 5);
- const char *body = json_array_get_string(comment, 6);
- for (int j=prevNumber+1; j<number; j++) {
- out += "broken<><>broken<> broken <>\n";
- }
- if (name) line << name;
- else line << "削除";
- line << "<>";
- if (mail) line << mail;
- else line << "削除";
- line << "<>";
- if (date) {
- line << date;
- if (id && *id) {
- line << " ID:" << id;
- }
- if (be && *be) {
- line << " BE:" << be;
- }
- }
- else line << "削除";
- line << "<>";
- if (body) {
- const char *ptr = strstr(body, ">>");
- const char *start = body;
- while (ptr) {
- const char *tmp = ptr;
- unsigned int num = strtoul(ptr+8, (char **)&ptr, 10);
- if (num > 0) {
- if (tmp != start) line << std::string(start, tmp-start);
- line << "<a href=\"../test/read.cgi/" << boardAndKey << "/" << num << "\" rel=\"noopener noreferrer\" target=\"_blank\">";
- line << std::string(tmp, ptr-tmp);
- line << "</a>";
- }
- else line << std::string(start, ptr-start);
- start = ptr;
- ptr = strstr(start, ">>");
- }
- line << start;
- }
- else line << "削除";
- line << "<>";
- if (number == 1) {
- line << title;
- }
- line << "\n";
- prevNumber = number;
- char *lineSJIS = convertUTF8ToShiftJISWithNCR(line.str().c_str(), line.str().size());
- if (lineSJIS) {
- lastLine = lineSJIS;
- out += lastLine;
- free(lineSJIS);
- } else {
- lastLine = "broken<><>broken<> broken <>\n";
- out += lastLine;
- }
- if (useCache && startFrom == number) {
- PBBS2chProxyThreadInfo info = _threadCache->pop(_threadKey);
- bool hit = false;
- if (info) {
- log_printf(5, "cache hit");
- if (info->cachedData.size() == lastLine.size()) {
- log_printf(5, "... size match");
- if (info->cachedData == lastLine) {
- log_printf(5, "... content match");
- hit = true;
- cachedSize = info->cachedSize - lastLine.size();
- }
- }
- log_printf(5, "\n");
- }
- if (!hit) {
- return "";
- }
- }
- }
- if (!lastLine.empty()) {
- PBBS2chProxyThreadInfo info(new BBS2chProxyThreadInfo());
- info->lastResNum = prevNumber;
- info->cachedSize = out.size() + cachedSize;
- info->cachedData = lastLine;
- _threadCache->set(_threadKey, info);
- log_printf(5, "cached thread %s (%ld bytes)\n", _threadKey.c_str(), lastLine.size());
- }
- if (lastModifiedOut) *lastModifiedOut = lastModified;
- return out;
- }
|