BBS2chProxyHTML2Dat.cpp 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142
  1. #include <sstream>
  2. #include <stdlib.h>
  3. #include <string.h>
  4. #include <unistd.h>
  5. #include "BBS2chProxyHTML2Dat.h"
  6. #include "stringEncodingConverter.h"
  // Settings and helpers that are only declared here (extern); their definitions
  // live outside this file. proxy_server/proxy_port/proxy_type, timeout,
  // user_agent, force_ipv4 and curl_share are applied to the curl handle in
  // getHtmlFromURL().
  7. extern char *proxy_server;
  8. extern long proxy_port;
  9. extern long proxy_type;
  10. extern long timeout;
  11. extern char *user_agent;
  12. extern int force_ipv4;
  13. extern CURLSH *curl_share;
  14. extern void log_printf(int level, const char *format ...);
  15. extern void *memmem_priv(const void *l, size_t l_len, const void *s, size_t s_len);
  // On Windows, map gmtime_r(a, b) onto gmtime_s(b, a); note the swapped argument order.
  16. #ifdef _WIN32
  17. #define gmtime_r(a, b) gmtime_s(b, a)
  18. #endif
  // strptime() format used by the fallback timestamp conversion in html2dat()/html2dat_old().
  19. static const char threadTimestampFmt[] = "%Y/%m/%d %H:%M:%S %Z";
  // Japanese single-character day-of-week names, Sunday first.
  // NOTE(review): not referenced in the visible part of the file - presumably used further down.
  20. static const char *wdays[7] = {
  21. "日",
  22. "月",
  23. "火",
  24. "水",
  25. "木",
  26. "金",
  27. "土"
  28. };
  // Decodes a hex string as found after "/cdn-cgi/l/email-protection#" or in a
  // data-cfemail attribute.
  //
  // The first two hex digits form a key byte; every following pair of hex digits
  // is one output byte XORed with that key.
  //
  //   decrypted  output buffer; receives the decoded bytes plus a terminating NUL.
  //              The caller must provide at least strlen(encrypted)/2 bytes.
  //   encrypted  NUL-terminated hex string (not modified).
  //
  // Returns the number of decoded bytes, not counting the terminating NUL.
  // Inputs shorter than one full key (fewer than two characters) decode to an
  // empty string instead of reading past the end of `encrypted`.
  static int decryptMail(unsigned char *decrypted, char *encrypted)
  {
      unsigned char *out = decrypted;
      const size_t len = strlen(encrypted);

      if (len < 2) {
          // No complete key byte: nothing can be decoded.
          *out = 0;
          return 0;
      }

      // "0x" + two digits + NUL; only the two digit slots are rewritten.
      char pair[5] = "0x";
      pair[2] = encrypted[0];
      pair[3] = encrypted[1];
      const unsigned int key = (unsigned int)strtol(pair, NULL, 16);

      for (size_t n = 2; n < len; n += 2) {
          pair[2] = encrypted[n];
          // For an odd-length input this is the terminating NUL, so the trailing
          // lone digit is parsed on its own.
          pair[3] = encrypted[n + 1];
          const unsigned int value = (unsigned int)strtol(pair, NULL, 16);
          *out++ = (unsigned char)(value ^ key);
      }
      *out = 0;
      return (int)(out - decrypted);
  }
  // Replaces every occurrence of `oldValue` in `input` with `newValue`, in place.
  // Scanning resumes right after each inserted replacement, so text introduced by
  // `newValue` is never matched again. An empty `oldValue` leaves `input` untouched.
  static void replaceAll(std::string &input, const std::string &oldValue, const std::string &newValue)
  {
      if (oldValue.empty()) return;

      size_t cursor = input.find(oldValue, 0);
      while (cursor != std::string::npos) {
          input.replace(cursor, oldValue.size(), newValue);
          cursor = input.find(oldValue, cursor + newValue.size());
      }
  }
  58. static void escapeForHTML(std::string &input)
  59. {
  60. replaceAll(input, "&", "&amp;");
  61. replaceAll(input, "<", "&lt;");
  62. replaceAll(input, ">", "&gt;");
  63. replaceAll(input, "\"", "&quot;");
  64. replaceAll(input, "'", "&#39;");
  65. }
  // Write callback installed via CURLOPT_WRITEFUNCTION: appends the received
  // chunk to the std::vector<char> passed as `userdata`.
  // Returns the number of bytes consumed (size * nitems).
  static size_t write_callback_download(char *buffer, size_t size, size_t nitems, void *userdata)
  {
      const size_t byteCount = size * nitems;
      std::vector<char> &sink = *static_cast<std::vector<char> *>(userdata);
      sink.insert(sink.end(), buffer, buffer + byteCount);
      return byteCount;
  }
  73. BBS2chProxyHTML2Dat5ch::BBS2chProxyHTML2Dat5ch(BBS2chProxyThreadCache *cache, const BBS2chThreadIdentifier &identifier, bool useHttps, CURL *curl)
  74. : IBBS2chProxyHTML2Dat(cache, identifier, curl)
  75. {
  76. _url = useHttps ? "https://" : "http://";
  77. _url += identifier.host;
  78. _url += "/test/read.cgi/";
  79. _url += identifier.board;
  80. _url += '/';
  81. _url += identifier.key;
  82. _url += '/';
  83. }
  84. BBS2chProxyHTML2DatTalk::BBS2chProxyHTML2DatTalk(BBS2chProxyThreadCache *cache, const BBS2chThreadIdentifier &identifier, CURL *curl)
  85. : IBBS2chProxyHTML2Dat(cache, identifier, curl), _cachedJson(NULL)
  86. {
  87. _url = "https://talk.jp/api/boards/";
  88. _url += identifier.board;
  89. _url += "/threads/";
  90. _url += identifier.key;
  91. }
  92. BBS2chProxyHTML2DatTalkHTML::BBS2chProxyHTML2DatTalkHTML(BBS2chProxyThreadCache *cache, const BBS2chThreadIdentifier &identifier, CURL *curl)
  93. : BBS2chProxyHTML2DatTalk(cache, identifier, curl)
  94. {
  95. _url = "https://talk.jp/boards/";
  96. _url += identifier.board;
  97. _url += '/';
  98. _url += identifier.key;
  99. _url += '/';
  100. }
  101. std::vector<char> IBBS2chProxyHTML2Dat::getHtmlFromURL(const std::string &url, long *outStatusCode)
  102. {
  103. CURLcode res;
  104. long statusCode = 0;
  105. std::vector<char> html;
  106. if (curl_share) curl_easy_setopt(_curl, CURLOPT_SHARE, curl_share);
  107. curl_easy_setopt(_curl, CURLOPT_URL, url.c_str());
  108. curl_easy_setopt(_curl, CURLOPT_NOSIGNAL, 1L);
  109. curl_easy_setopt(_curl, CURLOPT_TIMEOUT, timeout);
  110. curl_easy_setopt(_curl, CURLOPT_ENCODING, "");
  111. curl_easy_setopt(_curl, CURLOPT_WRITEFUNCTION, write_callback_download);
  112. curl_easy_setopt(_curl, CURLOPT_WRITEDATA, &html);
  113. curl_easy_setopt(_curl, CURLOPT_FOLLOWLOCATION, 1L);
  114. curl_easy_setopt(_curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
  115. curl_easy_setopt(_curl, CURLOPT_SSL_VERIFYHOST, 0L);
  116. curl_easy_setopt(_curl, CURLOPT_SSL_VERIFYPEER, 0L);
  117. if (force_ipv4) curl_easy_setopt(_curl, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4);
  118. if (proxy_server) {
  119. curl_easy_setopt(_curl, CURLOPT_PROXY, proxy_server);
  120. curl_easy_setopt(_curl, CURLOPT_PROXYPORT, proxy_port);
  121. curl_easy_setopt(_curl, CURLOPT_PROXYTYPE, proxy_type);
  122. }
  123. if (user_agent) {
  124. curl_easy_setopt(_curl, CURLOPT_USERAGENT, user_agent);
  125. }
  126. else if (!_userAgent.empty()) {
  127. curl_easy_setopt(_curl, CURLOPT_USERAGENT, _userAgent.c_str());
  128. }
  129. res = curl_easy_perform(_curl);
  130. if (res == CURLE_OK) {
  131. curl_easy_getinfo(_curl, CURLINFO_RESPONSE_CODE, &statusCode);
  132. if (statusCode != 200) html.clear();
  133. } else {
  134. log_printf(0, "curl error: %s (%s)\n", curl_easy_strerror(res), url.c_str());
  135. }
  136. curl_easy_reset(_curl);
  137. if (outStatusCode) *outStatusCode = statusCode;
  138. return html;
  139. }
  140. void IBBS2chProxyHTML2Dat::setRequestHeaders(BBS2chProxyHttpHeaders &headers)
  141. {
  142. if (headers.has("User-Agent")) {
  143. _userAgent = headers.get("User-Agent");
  144. }
  145. }
  146. const std::string& IBBS2chProxyHTML2Dat::getKey()
  147. {
  148. return _threadKey;
  149. }
  150. std::string BBS2chProxyHTML2Dat5ch::generateDatFrom(int startFrom, time_t *lastModifiedOut, bool useCache, long *outStatusCode)
  151. {
  152. std::string tmpURL(_url);
  153. if (startFrom > 1) {
  154. std::ostringstream ss;
  155. ss << startFrom << "-n";
  156. tmpURL += ss.str();
  157. } else {
  158. tmpURL += "1-";
  159. }
  160. std::vector<char> html = getHtmlFromURL(tmpURL, outStatusCode);
  161. return html2dat(html, startFrom, lastModifiedOut, useCache);
  162. }
  163. std::string BBS2chProxyHTML2DatTalk::generateDatFrom(int startFrom, time_t *lastModifiedOut, bool useCache, long *outStatusCode)
  164. {
  165. if (!_cachedJson) {
  166. std::vector<char> json = getHtmlFromURL(_url, outStatusCode);
  167. if (json.empty()) return "";
  168. json.push_back(0);
  169. _cachedJson = json_parse_string(&json.front());
  170. }
  171. if (!_cachedJson) return "";
  172. return json2dat(_cachedJson, startFrom, lastModifiedOut, useCache);
  173. }
  174. std::string BBS2chProxyHTML2DatTalkHTML::generateDatFrom(int startFrom, time_t *lastModifiedOut, bool useCache, long *outStatusCode)
  175. {
  176. if (!_cachedJson) {
  177. std::vector<char> html = getHtmlFromURL(_url, outStatusCode);
  178. if (html.empty()) return "";
  179. html.push_back(0);
  180. const char *ptr = strstr(&html.front(), "id=\"__NEXT_DATA__\"");
  181. if (ptr) {
  182. ptr += strlen("id=\"__NEXT_DATA__\"");
  183. while (*ptr != '>' && *ptr != 0) ptr++;
  184. if (*ptr) {
  185. const char *end = strstr(++ptr, "</script>");
  186. if (end) {
  187. std::string jsonStr(ptr, end-ptr);
  188. _cachedJson = json_parse_string(jsonStr.c_str());
  189. }
  190. }
  191. }
  192. }
  193. if (!_cachedJson) return "";
  194. JSON_Value *threadData = json_object_dotget_value(json_object(_cachedJson), "props.pageProps.threadData");
  195. return json2dat(threadData, startFrom, lastModifiedOut, useCache);
  196. }
  // Converts a page in the "<title>...</title>" / "<dt>N ...<dd>..." layout into
  // dat-style text: one line per post of the form
  //   name<>mail<>date<>body<>title\n
  // where the title is only appended to post number 1.
  //
  //   html         page bytes; parsed in place and partially overwritten (see the
  //                e-mail reconstruction below).
  //   startResNum  number of the first post to convert.
  //   lastModified optional; set to 0 and then, if the date field of the last
  //                converted post parses, to that timestamp.
  //   useCache     when true, the first converted post must equal the entry popped
  //                from _threadCache for _threadKey, otherwise "" is returned.
  //
  // Returns the converted lines, or "" when the page does not match.
  //
  // NOTE(review): title/name/mail/date/body/encrypted are fixed-size buffers that
  // are filled without bounds checks, most strstr()/strchr() results are used
  // without a NULL check, and the scanning loops do not test for the end of the
  // buffer. strstr() also assumes the vector data is NUL-terminated - confirm
  // what the caller passes.
  197. std::string BBS2chProxyHTML2Dat5ch::html2dat_old(std::vector<char> &html, int startResNum, time_t *lastModified, bool useCache)
  198. {
  199. char *ptr = &html.front();
  200. char *end = &html.back();
  201. std::string txt;
  202. int res = startResNum, i=0;
  203. char signature[32];
  204. char title[1024];
  205. int cachedSize = 0;
  // Thread keys containing "bbspink.com" skip the whitespace/<br> normalisation below.
  206. bool bbspink = strstr(_threadKey.c_str(),"bbspink.com") ? true : false;
  // Copy the text between <title> and </title>.
  207. ptr = (char *)memmem_priv(ptr, end-ptr+1, "<title>", 7);
  208. if(!ptr) {
  209. return "";
  210. }
  211. ptr += 7;
  212. while(1) {
  213. if(*ptr == '<') {
  214. if(!strncasecmp(ptr,"</title>",8)) {
  215. ptr += 8;
  216. break;
  217. }
  218. else title[i++] = *ptr++;
  219. }
  220. else title[i++] = *ptr++;
  221. }
  222. title[i] = 0;
  // Each post starts with "<dt>N "; give up if the first wanted post is absent.
  223. snprintf(signature,32,"<dt>%d ",res);
  224. ptr = (char *)memmem_priv(ptr, end-ptr+1, signature, strlen(signature));
  225. if(!ptr) {
  226. return "";
  227. }
  // One allocation carved into the work buffers:
  // body (65536) | mail (1024) | name (1024) | date (1024) | encrypted (2048).
  228. unsigned char *buffer = (unsigned char *)malloc(65536+1024+1024+1024+2048);
  229. if(!buffer) {
  230. return "";
  231. }
  232. unsigned char *body = buffer;
  233. char *mail = (char *)body + 65536;
  234. char *name = mail + 1024;
  235. char *date = name + 1024;
  236. char *encrypted = date + 1024;
  // One iteration per post.
  237. while(ptr < end) {
  238. //fprintf(stderr,"%s\n",signature);
  239. std::string resData;
  240. i=0;
  241. mail[0] = 0;
  242. ptr = strstr(ptr,signature);
  243. ptr += strlen(signature);
  244. while(*ptr != '<') ptr++;
  245. ptr++;
  // The tag after the post number decides how the name is wrapped:
  // <a ...> carries a mail address, <b> is a bare name, anything else is
  // treated as a <font> wrapper.
  246. const char *endStr;
  247. if(*ptr == 'a' || *ptr == 'A') {
  248. replay:
  249. // has mail
  250. while(*ptr != '"') ptr++;
  251. ptr++;
  252. if(!strncmp(ptr,"/cdn-cgi/l/email-protection#",28)) {
  // Obfuscated address: decode it, overwrite the HTML just before the current
  // position with <a href="mailto:DECODED and parse the link again from there.
  253. ptr += 28;
  254. while(*ptr != '"' && *ptr != 'X') encrypted[i++] = *ptr++;
  255. encrypted[i] = 0;
  256. i = decryptMail((unsigned char *)mail,encrypted);
  257. int reconstruct_len = *ptr == 'X' ? i + 15 : i + 16;
  258. ptr -= reconstruct_len;
  259. char *start = ptr;
  260. memcpy(ptr, "<a href=\"mailto:", 16);
  261. ptr += 16;
  262. memcpy(ptr, mail, i);
  263. ptr = start;
  264. i=0;
  265. goto replay;
  266. }
  267. else {
  268. if(!strncmp(ptr,"mailto:",7)) ptr += 7;
  269. while(*ptr != '"') mail[i++] = *ptr++;
  270. mail[i] = 0;
  271. }
  272. endStr = "</a>";
  273. }
  274. else if(*ptr == 'b') {
  275. endStr = NULL;
  276. }
  277. else {
  278. endStr = "</font>";
  279. }
  280. if(endStr) {
  281. ptr = strstr(ptr,"<b>");
  282. ptr += 3;
  283. }
  284. else {
  285. ptr = strchr(ptr,'>');
  286. ptr++;
  287. }
  // Name: copy up to "</b>" (followed by endStr when set), decoding
  // __cf_email__ spans on the way.
  288. i=0;
  289. while(1) {
  290. if(*ptr == '<') {
  291. if(!strncasecmp(ptr,"</b>",4) && (!endStr || !strncasecmp(ptr+4,endStr,strlen(endStr)))) {
  292. ptr += 4;
  293. if(endStr) ptr += strlen(endStr);
  294. break;
  295. }
  296. else if(!strncmp(ptr,"<span class=\"__cf_email__\"",26)) {
  297. int j=0;
  298. ptr = strstr(ptr,"data-cfemail=\"");
  299. ptr += 14;
  300. while(*ptr != '"') encrypted[j++] = *ptr++;
  301. encrypted[j] = 0;
  302. j = decryptMail((unsigned char *)name+i,encrypted);
  303. i += j;
  304. ptr = strstr(ptr,"</script>");
  305. ptr += 9;
  306. }
  307. else name[i++] = *ptr++;
  308. }
  309. else name[i++] = *ptr++;
  310. }
  311. resData.append(name, i);
  312. resData.append("<>");
  313. if(mail[0]) resData.append(mail);
  314. resData.append("<>");
  // Date field: everything up to <dd>. A javascript:be(ID) link is rewritten to
  // "BE:ID-" followed by the link text after its first '?'.
  315. ptr += 2;
  316. i=0;
  317. while(1) {
  318. if(*ptr == '<') {
  319. if(!strncasecmp(ptr,"<dd>",4)) {
  320. ptr += 4;
  321. break;
  322. }
  323. else if(!strncmp(ptr,"<a href=\"javascript:be(",23)) {
  324. memcpy(date+i,"BE:",3);
  325. ptr += 23;
  326. i += 3;
  327. while(*ptr != ')') date[i++] = *ptr++;
  328. date[i++] = '-';
  329. ptr = strchr(ptr,'?');
  330. ptr++;
  331. char *tmp = strstr(ptr,"</a>");
  332. memcpy(date+i,ptr,tmp-ptr);
  333. i += tmp-ptr;
  334. ptr = tmp + 4;
  335. }
  336. else date[i++] = *ptr++;
  337. }
  338. else date[i++] = *ptr++;
  339. }
  340. resData.append(date, i);
  341. resData.append("<>");
  // Body: ends at "<br><br>\n", or at the next <dt> / </dl> (trailing newlines
  // are dropped in that case).
  342. i=0;
  343. while(1) {
  344. if(*ptr == '<') {
  345. if(!strncasecmp(ptr,"<br><br>\n",9)) {
  346. ptr += 9;
  347. break;
  348. }
  349. else if(!strncasecmp(ptr,"<dt>",4) || !strncasecmp(ptr,"</dl>",5)) {
  350. while(i>0 &&body[i-1] == '\n') i--;
  351. break;
  352. }
  353. else if(!strncmp(ptr,"<span class=\"__cf_email__\"",26) || !strncmp(ptr,"<a class=\"__cf_email__\"",23)) {
  // Obfuscated address inside the body: emit the decoded text and skip to
  // just after the following </script>.
  354. int j=0;
  355. ptr = strstr(ptr,"data-cfemail=\"");
  356. ptr += 14;
  357. while(*ptr != '"') encrypted[j++] = *ptr++;
  358. encrypted[j] = 0;
  359. j = decryptMail(body+i,encrypted);
  360. i += j;
  361. ptr = strstr(ptr,"</script>");
  362. ptr += 9;
  363. }
  364. else if(!strncmp(ptr,"<a href=\"http",13)) {
  // http(s) links are replaced by their link text.
  365. ptr = strchr(ptr,'>');
  366. ptr++;
  367. char *link = ptr;
  368. ptr = strstr(link,"</a>");
  369. memcpy(body+i,link,ptr-link);
  370. i += ptr-link;
  371. ptr += 4;
  372. }
  373. else if(!strncmp(ptr,"<img src=\"",10)) {
  // Images are replaced by their src URL; for the listed image hosts the part
  // before the first '/' is replaced by "sssp:".
  374. ptr += 10;
  375. char *img = ptr;
  376. ptr = strstr(img,"\">");
  377. memcpy(body+i,img,ptr-img);
  378. if(memmem_priv(img,ptr-img,"/img.2ch.net",12) || memmem_priv(img,ptr-img,"/img.5ch.net",12) || memmem_priv(img,ptr-img,"/o.8ch.net",10) || memmem_priv(img,ptr-img,"/o.5ch.net",10)) {
  379. int length = ptr-img;
  380. while(*img != '/') {
  381. img++;
  382. length--;
  383. }
  384. memcpy(body+i,"sssp:",5);
  385. memcpy(body+i+5,img,length);
  386. i += length + 5;
  387. }
  388. else i += ptr-img;
  389. ptr += 2;
  390. }
  391. else if(!bbspink && !strncmp(ptr,"<br>",4)) {
  // A <br> directly after "<br> " is written as " <br>", otherwise as "<br>".
  392. if(i>5 && !strncmp((char *)body+i-5,"<br> ",5)) {
  393. memcpy(body+i," <br>",5);
  394. i += 5;
  395. }
  396. else {
  397. memcpy(body+i,"<br>",4);
  398. i += 4;
  399. }
  400. ptr += 4;
  401. }
  402. else body[i++] = *ptr++;
  403. }
  404. else if(!bbspink && *ptr == ' ') {
  // Drop a space that is immediately followed by another space.
  405. if(*(ptr+1) == ' ') ptr++;
  406. else body[i++] = *ptr++;
  407. }
  408. else body[i++] = *ptr++;
  409. }
  410. resData.append((const char *)body ,i);
  411. resData.append("<>");
  412. if(res == 1) resData.append(title);
  413. resData.append("\n");
  // Cache validation, first post only: the entry stored for this thread must be
  // identical to the line just produced, otherwise the conversion is abandoned.
  414. if(useCache && res == startResNum) {
  415. PBBS2chProxyThreadInfo info = _threadCache->pop(_threadKey);
  416. bool hit = false;
  417. if(info) {
  418. log_printf(5,"cache hit");
  419. if(info->cachedData.size() == resData.size()) {
  420. log_printf(5,"... size match");
  421. if(info->cachedData == resData) {
  422. log_printf(5,"... content match");
  423. hit = true;
  424. cachedSize = info->cachedSize - resData.size();
  425. }
  426. }
  427. log_printf(5,"\n");
  428. }
  429. if(!hit) {
  430. free(buffer);
  431. return "";
  432. }
  433. }
  434. txt += resData;
  435. res++;
  436. while(*ptr == '\n' || *ptr == '\r') ptr++;
  // If the next consecutive post is missing, this was the last one: store it in
  // the thread cache, derive lastModified from its date field and stop.
  437. snprintf(signature,32,"<dt>%d ",res);
  438. if(!memmem_priv(ptr, end-ptr+1, signature, strlen(signature))) {
  439. PBBS2chProxyThreadInfo info(new BBS2chProxyThreadInfo());
  440. info->lastResNum = res-1;
  441. info->cachedSize = txt.size()+cachedSize;
  442. info->cachedData = resData;
  443. _threadCache->set(_threadKey, info);
  444. log_printf(5,"cached thread %s (%ld bytes)\n",_threadKey.c_str(),resData.size());
  445. if(lastModified) {
  // Expected shape: Y/M/D<anything> H:M:S. Every `break` below leaves the post
  // loop with *lastModified still 0.
  446. *lastModified = 0;
  447. char formattedDate[256];
  448. char *ptr;
  449. ptr = date;
  450. int year = strtol(ptr,&ptr,10);
  451. if(*ptr != '/') break;
  452. ptr++;
  453. int month = strtol(ptr,&ptr,10);
  454. if(*ptr != '/') break;
  455. ptr++;
  456. int day = strtol(ptr,&ptr,10);
  457. if(!*ptr) break;
  458. while(*ptr != ' ' && *ptr != 0) ptr++;
  459. if(!*ptr) break;
  460. ptr++;
  461. int hour = strtol(ptr,&ptr,10);
  462. if(*ptr != ':') break;
  463. ptr++;
  464. int minutes = strtol(ptr,&ptr,10);
  465. if(*ptr != ':') break;
  466. ptr++;
  467. int seconds = strtol(ptr,&ptr,10);
  468. if(!(month>0 && month<13) || !(day>0 && day<32)) break;
  469. if(year < 100) year += 2000;
  // The timestamp is converted with a +0900 (or, in the fallback, JST) zone suffix.
  470. #if LIBCURL_VERSION_NUM >= 0x070c02 /* curl 7.12.2 or later */
  471. snprintf(formattedDate, 256, "%d%02d%02d %02d:%02d:%02d +0900", year, month, day, hour, minutes, seconds);
  472. *lastModified = curl_getdate(formattedDate, NULL);
  473. #else
  474. snprintf(formattedDate,256,"%d/%d/%d %02d:%02d:%02d JST",year,month,day,hour,minutes,seconds);
  475. struct tm time = {0};
  476. strptime(formattedDate,threadTimestampFmt,&time);
  477. *lastModified = mktime(&time);
  478. #endif
  479. }
  480. //fprintf(stderr,"not found,%ld\n",end-ptr+1);
  481. break;
  482. }
  483. }
  484. free(buffer);
  485. return txt;
  486. }
  // Converts a read.cgi page into dat-style text: one line per post of the form
  //   name<>mail<>date<>body<>title\n
  // where the title is only appended to post number 1.
  //
  // Three page layouts are recognised:
  //   - pages containing  id="threadtitle">  with posts in <article id="N" ...>
  //     (isNewHTML);
  //   - pages containing <h1 class="title"> with posts in <TAG class="post" id="N">;
  //   - anything else is handed to html2dat_old().
  //
  //   html         page bytes; parsed in place and partially overwritten (NUL bytes
  //                are written into it while extracting tag names, and obfuscated
  //                mail links are rewritten).
  //   startResNum  number of the first post to convert.
  //   lastModified optional; set to 0 and then, if the date field of the last
  //                converted post parses, to that timestamp.
  //   useCache     when true, the first converted post must equal the entry popped
  //                from _threadCache for _threadKey, otherwise "" is returned.
  //
  // Returns the converted lines, or "" when the page does not match.
  //
  // NOTE(review): &html.front()/&html.back() are evaluated before the
  // html.empty() check. The fixed-size buffers (title, signatureTag, closeTag,
  // name, mail, date, body, encrypted) are filled without bounds checks, several
  // strstr()/strchr() results are used without a NULL check, and strstr() assumes
  // the vector data is NUL-terminated - confirm what the caller passes.
  487. std::string BBS2chProxyHTML2Dat5ch::html2dat(std::vector<char> &html, int startResNum, time_t *lastModified, bool useCache)
  488. {
  489. char *ptr = &html.front();
  490. char *end = &html.back();
  491. std::string txt;
  492. int res = startResNum, i=0;
  493. char signature[64];
  494. char title[1024];
  495. int cachedSize = 0;
  496. char signatureTag[32];
  497. char closeTag[32];
  498. int closeTagLen;
  499. bool isNewHTML = false;
  500. if (html.empty()) return "";
  501. ptr = (char *)memmem_priv(ptr, end-ptr+1, " id=\"threadtitle\">", 18);
  502. if (ptr) {
  // New layout. Walk back to the '<' that opens the title element and build the
  // matching closing tag "</NAME>" from the text in between.
  503. isNewHTML = true;
  504. char *ptr2 = (char *)memmem_priv(ptr, end-ptr+1, "<article id=\"", 13);
  505. if (!ptr2) {
  506. return "";
  507. }
  508. const char *tmp = ptr;
  509. while (*tmp != '<') tmp--;
  510. memcpy(closeTag+2, tmp+1, ptr-tmp-1);
  511. closeTag[0] = '<';
  512. closeTag[1] = '/';
  513. closeTag[ptr-tmp+1] = '>';
  514. closeTag[ptr-tmp+2] = 0;
  515. ptr += 18;
  // Title text runs up to that closing tag or the first newline.
  516. while (1) {
  517. if (*ptr == '<') {
  518. if (!strncasecmp(ptr, closeTag, strlen(closeTag))) {
  519. ptr += strlen(closeTag);
  520. break;
  521. }
  522. else title[i++] = *ptr++;
  523. }
  524. else if(*ptr == '\n') break;
  525. else title[i++] = *ptr++;
  526. }
  527. title[i] = 0;
  528. snprintf(signature, 32, "<article id=\"%d\"", res);
  529. }
  530. else {
  531. ptr = &html.front();
  532. ptr = (char *)memmem_priv(ptr, end-ptr+1, "<h1 class=\"title\">", 18);
  533. if(!ptr) {
  534. return html2dat_old(html, startResNum, lastModified, useCache);
  535. }
  536. else {
  // Find the first  class="post"  and remember the tag that carries it
  // ("<" plus tag name) in signatureTag. The byte at the match is temporarily
  // set to NUL so strcpy() stops there, then restored to a space.
  537. char *ptr2 = (char *)memmem_priv(ptr, end-ptr+1, " class=\"post\"", 13);
  538. if(ptr2) {
  539. char *tmp = ptr2;
  540. *ptr2 = 0;
  541. while(*ptr2 != '<') ptr2--;
  542. strcpy(signatureTag, ptr2);
  543. *tmp = ' ';
  544. }
  545. else {
  546. return "";
  547. }
  548. /*char *ptr2 = (char *)memmem_priv(ptr, end-ptr+1, "<dl class=\"post\"", 16);
  549. if(ptr2) {
  550. return html2dat_pink(html, startResNum, lastModified, useCache);
  551. }*/
  552. }
  553. ptr += 18;
  // Title text runs up to </h1> or the first newline.
  554. while(1) {
  555. if(*ptr == '<') {
  556. if(!strncasecmp(ptr,"</h1>",5)) {
  557. ptr += 5;
  558. break;
  559. }
  560. else title[i++] = *ptr++;
  561. }
  562. else if(*ptr == '\n') break;
  563. else title[i++] = *ptr++;
  564. }
  565. title[i] = 0;
  566. snprintf(signature,32,"%s class=\"post\" id=\"%d\"",signatureTag,res);
  567. }
  // Give up if the first wanted post is not on the page.
  568. ptr = (char *)memmem_priv(ptr, end-ptr+1, signature, strlen(signature));
  569. if(!ptr) {
  570. return "";
  571. }
  // One allocation carved into the work buffers:
  // body (65536) | mail (1024) | name (1024) | date (1024) | encrypted (2048).
  572. unsigned char *buffer = (unsigned char *)malloc(65536+1024+1024+1024+2048);
  573. if(!buffer) {
  574. return "";
  575. }
  576. unsigned char *body = buffer;
  577. char *mail = (char *)body + 65536;
  578. char *name = mail + 1024;
  579. char *date = name + 1024;
  580. char *encrypted = date + 1024;
  // One iteration per post.
  581. while(ptr < end) {
  582. //fprintf(stderr,"%s\n",signature);
  583. std::string resData;
  584. i=0;
  585. mail[0] = 0;
  // Name element: locate its class marker, derive the element's closing tag
  // from the tag name in front of it (a NUL is written at the marker and not
  // restored), then continue right after the "<b>".
  586. if (isNewHTML) ptr = strstr(ptr," class=\"postusername\"><b>");
  587. else ptr = strstr(ptr," class=\"name\"><b>");
  588. if(ptr) {
  589. char *tmp = ptr;
  590. *ptr = 0;
  591. while(*ptr != '<') ptr--;
  592. snprintf(closeTag,32,"</%s>",ptr+1);
  593. closeTagLen = strlen(closeTag);
  594. if (isNewHTML) ptr = tmp + 25;
  595. else ptr = tmp + 17;
  596. }
  597. else {
  598. break;
  599. }
  600. char endStr[64];
  601. if(!strncmp(ptr,"<a href=\"mailto:",16)) {
  602. replay:
  603. // has mail
  604. while(*ptr != '"') ptr++;
  605. ptr++;
  606. if(!strncmp(ptr,"/cdn-cgi/l/email-protection#",28)) {
  // Obfuscated address: decode it, overwrite the HTML just before the current
  // position with <a href="mailto:DECODED and parse the link again from there.
  607. ptr += 28;
  608. while(*ptr != '"' && *ptr != 'X') encrypted[i++] = *ptr++;
  609. encrypted[i] = 0;
  610. i = decryptMail((unsigned char *)mail,encrypted);
  611. int reconstruct_len = *ptr == 'X' ? i + 15 : i + 16;
  612. ptr -= reconstruct_len;
  613. char *start = ptr;
  614. memcpy(ptr, "<a href=\"mailto:", 16);
  615. ptr += 16;
  616. memcpy(ptr, mail, i);
  617. ptr = start;
  618. i=0;
  619. goto replay;
  620. }
  621. else {
  622. if(!strncmp(ptr,"mailto:",7)) ptr += 7;
  // Copy the address up to the closing quote; an <a href="..."> found inside
  // the value is replaced by its link text.
  623. while(1) {
  624. if(*ptr == '<' && !strncmp(ptr,"<a href=\"",9)) {
  625. ptr = strchr(ptr,'>');
  626. ptr++;
  627. char *link = ptr;
  628. ptr = strstr(link,"</a>");
  629. memcpy(mail+i,link,ptr-link);
  630. i += ptr-link;
  631. ptr += 4;
  632. }
  633. else if(*ptr == '"') break;
  634. else mail[i++] = *ptr++;
  635. }
  636. //while(*ptr != '"') mail[i++] = *ptr++;
  637. mail[i] = 0;
  638. }
  639. snprintf(endStr,64,"</a></b>%s",closeTag);
  640. while(*ptr != '>') ptr++;
  641. ptr++;
  642. }
  643. /* we do not have to handle this special case because read.cgi on bbspink doesn't
  644. emit font tags anymore and it conflicts with text decorations using "melon point" */
  645. /*else if(!strncmp(ptr,"<font",5)) {
  646. snprintf(endStr,64,"</font></b>%s",closeTag);
  647. while(*ptr != '>') ptr++;
  648. ptr++;
  649. }*/
  650. else {
  651. snprintf(endStr,64,"</b>%s",closeTag);
  652. }
  // Name: copy up to endStr, decoding __cf_email__ spans and replacing links by
  // their text.
  653. i=0;
  654. while(1) {
  655. if(*ptr == '<') {
  656. if(!strncmp(ptr,endStr,strlen(endStr))) {
  657. ptr += strlen(endStr);
  658. break;
  659. }
  660. else if(!strncmp(ptr,"<span class=\"__cf_email__\"",26)) {
  661. int j=0;
  662. ptr = strstr(ptr,"data-cfemail=\"");
  663. ptr += 14;
  664. while(*ptr != '"') encrypted[j++] = *ptr++;
  665. encrypted[j] = 0;
  666. j = decryptMail((unsigned char *)name+i,encrypted);
  667. i += j;
  668. ptr = strstr(ptr,"</script>");
  669. ptr += 9;
  670. }
  671. else if(!strncmp(ptr,"<a href=\"",9)) {
  672. ptr = strchr(ptr,'>');
  673. ptr++;
  674. char *link = ptr;
  675. ptr = strstr(link,"</a>");
  676. memcpy(name+i,link,ptr-link);
  677. i += ptr-link;
  678. ptr += 4;
  679. }
  680. else name[i++] = *ptr++;
  681. }
  682. else name[i++] = *ptr++;
  683. }
  684. resData.append(name, i);
  685. resData.append("<>");
  686. if(mail[0]) resData.append(mail);
  687. resData.append("<>");
  // Date element: same closing-tag derivation as for the name element.
  688. ptr = strstr(ptr," class=\"date\">");
  689. if(ptr) {
  690. char *tmp = ptr;
  691. *ptr = 0;
  692. while(*ptr != '<') ptr--;
  693. snprintf(closeTag,32,"</%s>",ptr+1);
  694. closeTagLen = strlen(closeTag);
  695. ptr = tmp + 14;
  696. }
  697. else {
  698. break;
  699. }
  700. i=0;
  701. while(1) {
  702. if(*ptr == '<') {
  703. if(!strncasecmp(ptr,closeTag,closeTagLen)) {
  704. ptr += closeTagLen;
  705. break;
  706. }
  707. else date[i++] = *ptr++;
  708. }
  709. else date[i++] = *ptr++;
  710. }
  // Optional "uid" element directly after the date: its contents are appended
  // to the date field after a single space.
  711. if(!strncmp(ptr,"<div class=\"uid",15) || !strncmp(ptr,"<span class=\"uid",16)) {
  712. char *tmp = ptr+1;
  713. while(*ptr != ' ') ptr++;
  714. *ptr = 0;
  715. snprintf(closeTag,32,"</%s>",tmp);
  716. closeTagLen = strlen(closeTag);
  717. ptr += 11;
  718. while(*ptr != '>') ptr++;
  719. ptr++;
  720. date[i++] = ' ';
  721. while(1) {
  722. if(*ptr == '<') {
  723. if(!strncasecmp(ptr,closeTag,closeTagLen)) {
  724. ptr += closeTagLen;
  725. break;
  726. }
  727. else date[i++] = *ptr++;
  728. }
  729. else date[i++] = *ptr++;
  730. }
  731. }
  // Optional "be" element: a link to //be.2ch.net/user/ID or //be.5ch.net/user/ID
  // is appended as " BE:ID-" followed by the link text after its first '?'.
  732. if(!strncmp(ptr,"<div class=\"be",14) || !strncmp(ptr,"<span class=\"be",15)) {
  733. ptr += 14;
  734. while(*ptr != '>') ptr++;
  735. ptr++;
  736. if(!strncmp(ptr,"<a href=\"",9)) {
  737. ptr += 9;
  738. while(*ptr != '/' && *ptr != '"') ptr++;
  739. if(*ptr == '/' && (!strncmp(ptr,"//be.2ch.net/user/",18) || !strncmp(ptr,"//be.5ch.net/user/",18))) {
  740. memcpy(date+i," BE:",4);
  741. i += 4;
  742. ptr += 18;
  743. while(*ptr != '"') date[i++] = *ptr++;
  744. date[i++] = '-';
  745. ptr = strchr(ptr,'?');
  746. ptr++;
  747. char *tmp = strstr(ptr,"</a>");
  748. memcpy(date+i,ptr,tmp-ptr);
  749. i += tmp-ptr;
  750. ptr = tmp + 4;
  751. }
  752. }
  753. }
  754. resData.append(date, i);
  755. resData.append("<>");
  // Locate the message container for the active layout and choose the closing
  // sequence that ends the body (longer when the text is wrapped in extra spans).
  756. if (isNewHTML) {
  757. ptr = strstr(ptr,"<section class=\"post-content\">");
  758. if (!ptr) {
  759. break;
  760. }
  761. else {
  762. ptr += 30;
  763. if (!strncasecmp(ptr, "<span class=\"AA\">", 17)) {
  764. strcpy(closeTag, "</span></section>");
  765. closeTagLen = 17;
  766. ptr += 17;
  767. }
  768. else {
  769. strcpy(closeTag, "</section>");
  770. closeTagLen = 10;
  771. }
  772. }
  773. }
  774. else if(!strcmp(signatureTag,"<div")) {
  775. ptr = strstr(ptr,"<div class=\"message\">");
  776. if(!ptr) {
  777. break;
  778. }
  779. else {
  780. ptr += 21;
  781. if(!strncasecmp(ptr,"<span class=\"escaped\">",22)) {
  782. if(!strncasecmp(ptr+22,"<span class=\"AA\">",17)) {
  783. strcpy(closeTag,"</span></span></div>");
  784. closeTagLen = 20;
  785. ptr += 22+17;
  786. }
  787. else {
  788. strcpy(closeTag,"</span></div>");
  789. closeTagLen = 13;
  790. ptr += 22;
  791. }
  792. }
  793. else {
  794. strcpy(closeTag,"</div>");
  795. closeTagLen = 6;
  796. }
  797. }
  798. }
  799. else {
  800. ptr = strstr(ptr,"<dd class=\"thread_in\">");
  801. if(!ptr) {
  802. break;
  803. }
  804. strcpy(closeTag,"</dd>");
  805. closeTagLen = 5;
  806. ptr += 22;
  807. }
  // Body: copy up to the closing sequence, rewriting selected tags on the way.
  808. i=0;
  809. while(1) {
  810. if(*ptr == '<') {
  811. if(!strncasecmp(ptr,closeTag,closeTagLen)) {
  812. ptr += closeTagLen;
  813. break;
  814. }
  815. else if(!strncmp(ptr,"<span class=\"__cf_email__\"",26) || !strncmp(ptr,"<a class=\"__cf_email__\"",23)) {
  // Obfuscated address: emit the decoded text and skip to just after the
  // following </script>.
  816. int j=0;
  817. ptr = strstr(ptr,"data-cfemail=\"");
  818. ptr += 14;
  819. while(*ptr != '"') encrypted[j++] = *ptr++;
  820. encrypted[j] = 0;
  821. j = decryptMail(body+i,encrypted);
  822. i += j;
  823. ptr = strstr(ptr,"</script>");
  824. ptr += 9;
  825. }
  826. else if(!strncmp(ptr,"<a ",3)) {
  // Anchors: a link whose text starts with "&gt;&gt;" and whose tag mentions
  // test/read.cgi/ keeps its opening tag, minus any class attribute (the text
  // and </a> are then copied by the generic branches). Every other anchor is
  // replaced by its link text.
  827. char *tmp = strchr(ptr,'>');
  828. char *href = (char *)memmem_priv(ptr,tmp-ptr,"href=\"",6);
  829. char *link = tmp+1;
  830. if(href && !strncmp(link,"&gt;&gt;",8) && memmem_priv(href,link-href,"test/read.cgi/",14)) {
  831. while(ptr < link) {
  832. if(!strncmp(ptr," class=\"",8)) {
  833. ptr += 8;
  834. while(*ptr != '"' && *ptr != '>') ptr++;
  835. if(*ptr == '"') ptr++;
  836. }
  837. else body[i++] = *ptr++;
  838. }
  839. }
  840. else {
  841. ptr = strstr(link,"</a>");
  842. memcpy(body+i,link,ptr-link);
  843. i += ptr-link;
  844. ptr += 4;
  845. }
  846. }
  847. else if(!strncmp(ptr,"<img src=\"",10)) {
  // Images are replaced by their src URL; for the listed image hosts the part
  // before the first '/' is replaced by "sssp:".
  848. ptr += 10;
  849. char *img = ptr;
  850. ptr = strstr(img,"\">");
  851. memcpy(body+i,img,ptr-img);
  852. if(memmem_priv(img,ptr-img,"/img.2ch.net",12) || memmem_priv(img,ptr-img,"/img.5ch.net",12) || memmem_priv(img,ptr-img,"/o.8ch.net",10) || memmem_priv(img,ptr-img,"/o.5ch.net",10)) {
  853. int length = ptr-img;
  854. while(*img != '/') {
  855. img++;
  856. length--;
  857. }
  858. memcpy(body+i,"sssp:",5);
  859. memcpy(body+i+5,img,length);
  860. i += length + 5;
  861. }
  862. else i += ptr-img;
  863. ptr += 2;
  864. }
  865. else if(!strncmp(ptr,"<br>",4)) {
  // A <br> directly after "<br> " is written as " <br>", otherwise as "<br>".
  866. if(i>5 && !strncmp((char *)body+i-5,"<br> ",5)) {
  867. memcpy(body+i," <br>",5);
  868. i += 5;
  869. }
  870. else {
  871. memcpy(body+i,"<br>",4);
  872. i += 4;
  873. }
  874. ptr += 4;
  875. }
  876. else body[i++] = *ptr++;
  877. }
  878. else body[i++] = *ptr++;
  879. }
  880. resData.append((const char *)body, i);
  881. resData.append("<>");
  882. if(res == 1) resData.append(title);
  883. resData.append("\n");
  // Cache validation, first post only: the entry stored for this thread must be
  // identical to the line just produced, otherwise the conversion is abandoned.
  884. if(useCache && res == startResNum) {
  885. PBBS2chProxyThreadInfo info = _threadCache->pop(_threadKey);
  886. bool hit = false;
  887. if(info) {
  888. log_printf(5,"cache hit");
  889. if(info->cachedData.size() == resData.size()) {
  890. log_printf(5,"... size match");
  891. if(info->cachedData == resData) {
  892. log_printf(5,"... content match");
  893. hit = true;
  894. cachedSize = info->cachedSize - resData.size();
  895. }
  896. }
  897. log_printf(5,"\n");
  898. }
  899. if(!hit) {
  900. free(buffer);
  901. return "";
  902. }
  903. }
  904. txt += resData;
  905. res++;
  906. while(*ptr == '\n' || *ptr == '\r') ptr++;
  // Look for the next post of any number. If its id is ahead of the expected
  // one, a placeholder line is emitted for every skipped number; if it is lower
  // than expected, the page is treated as finished.
  907. if (isNewHTML) strcpy(signature, "<article id=\"");
  908. else snprintf(signature,64,"%s class=\"post\" id=\"",signatureTag);
  909. ptr = (char *)memmem_priv(ptr, end-ptr+1, signature, strlen(signature));
  910. if(ptr) {
  911. int next = atoi(ptr+strlen(signature));
  912. if(next >= res) {
  913. while(next > res) {
  914. txt += "broken<><>broken<> broken <>\n";
  915. res++;
  916. }
  917. }
  918. else ptr = NULL;
  919. }
  // No further post: store the last line in the thread cache, derive
  // lastModified from its date field and stop.
  920. if(!ptr) {
  921. PBBS2chProxyThreadInfo info(new BBS2chProxyThreadInfo());
  922. info->lastResNum = res-1;
  923. info->cachedSize = txt.size()+cachedSize;
  924. info->cachedData = resData;
  925. _threadCache->set(_threadKey, info);
  926. log_printf(5,"cached thread %s (%ld bytes)\n",_threadKey.c_str(),resData.size());
  927. if(lastModified) {
  // Expected shape: Y/M/D<anything> H:M:S. Every `break` below leaves the post
  // loop with *lastModified still 0.
  928. *lastModified = 0;
  929. char formattedDate[256];
  930. char *ptr;
  931. ptr = date;
  932. int year = strtol(ptr,&ptr,10);
  933. if(*ptr != '/') break;
  934. ptr++;
  935. int month = strtol(ptr,&ptr,10);
  936. if(*ptr != '/') break;
  937. ptr++;
  938. int day = strtol(ptr,&ptr,10);
  939. if(!*ptr) break;
  940. while(*ptr != ' ' && *ptr != 0) ptr++;
  941. if(!*ptr) break;
  942. ptr++;
  943. int hour = strtol(ptr,&ptr,10);
  944. if(*ptr != ':') break;
  945. ptr++;
  946. int minutes = strtol(ptr,&ptr,10);
  947. if(*ptr != ':') break;
  948. ptr++;
  949. int seconds = strtol(ptr,&ptr,10);
  950. if(!(month>0 && month<13) || !(day>0 && day<32)) break;
  951. if(year < 100) year += 2000;
  // The timestamp is converted with a +0900 (or, in the fallback, JST) zone suffix.
  952. #if LIBCURL_VERSION_NUM >= 0x070c02 /* curl 7.12.2 or later */
  953. snprintf(formattedDate, 256, "%d%02d%02d %02d:%02d:%02d +0900", year, month, day, hour, minutes, seconds);
  954. *lastModified = curl_getdate(formattedDate, NULL);
  955. #else
  956. snprintf(formattedDate,256,"%d/%d/%d %02d:%02d:%02d JST",year,month,day,hour,minutes,seconds);
  957. struct tm time = {0};
  958. strptime(formattedDate,threadTimestampFmt,&time);
  959. *lastModified = mktime(&time);
  960. #endif
  961. }
  962. //fprintf(stderr,"not found,%ld\n",end-ptr+1);
  963. break;
  964. }
  965. }
  966. free(buffer);
  967. return txt;
  968. }
  969. std::string BBS2chProxyHTML2DatTalk::json2dat(JSON_Value *json, int startFrom, time_t *lastModifiedOut, bool useCache)
  970. {
  971. std::string out;
  972. if (!json || json_type(json) != JSONObject) {
  973. return "";
  974. }
  975. JSON_Object *root = json_object(json);
  976. const char *title = json_object_dotget_string(root, "data.title");
  977. const char *quoteSource = json_object_dotget_string(root, "data.quote_source");
  978. JSON_Array *comments = json_object_dotget_array(root, "data.comments");
  979. if (!title || !comments) {
  980. return "";
  981. }
  982. if (startFrom < 1) startFrom = 1;
  983. int prevNumber = startFrom - 1;
  984. time_t lastModified = 0;
  985. size_t cachedSize = 0;
  986. std::string lastLine;
  987. for (size_t i=0, length=json_array_get_count(comments); i<length; i++) {
  988. std::stringstream line;
  989. JSON_Object *comment = json_array_get_object(comments, i);
  990. if (!comment) continue;
  991. int number = json_object_get_number(comment, "number");
  992. if (number < startFrom) continue;
  993. const char *name = json_object_dotget_string(comment, "writer.name");
  994. const char *trip = json_object_dotget_string(comment, "writer.trip");
  995. const char *slip = json_object_dotget_string(comment, "writer.slip");
  996. const char *id = json_object_dotget_string(comment, "writer.id");
  997. time_t timestamp = json_object_get_number(comment, "timestamp");
  998. const char *body = json_object_get_string(comment, "body");
  999. if (timestamp > lastModified) lastModified = timestamp;
  1000. for (int j=prevNumber+1; j<number; j++) {
  1001. out += "broken<><>broken<> broken <>\n";
  1002. }
  1003. if (name) {
  1004. std::string tmp(name);
  1005. escapeForHTML(tmp);
  1006. line << tmp;
  1007. if (trip) line << "</b>◆" << trip << "<b>";
  1008. if (slip) line << " </b>(" << slip << ")<b>";
  1009. }
  1010. else line << "削除";
  1011. line << "<><>"; //mail cannot be obtained from json!
  1012. if (timestamp) {
  1013. char dateStr[256] = "";
  1014. struct tm timestamp_tm = {0};
  1015. timestamp += 32400;
  1016. gmtime_r(&timestamp, &timestamp_tm);
  1017. strftime(dateStr, 256, "%Y/%m/%d(", &timestamp_tm);
  1018. line << dateStr << wdays[timestamp_tm.tm_wday] << ") ";
  1019. strftime(dateStr, 256, "%H:%M:%S", &timestamp_tm);
  1020. line << dateStr;
  1021. if (id) {
  1022. line << " ID:" << id;
  1023. }
  1024. }
  1025. else line << "削除";
  1026. line << "<>";
  1027. if (body) {
  1028. std::string tmp(body);
  1029. escapeForHTML(tmp);
  1030. replaceAll(tmp, "\n", " <br> ");
  1031. line << " " << tmp;
  1032. if (number == 1 && quoteSource) {
  1033. line << " <br> <br> 出典 " << quoteSource;
  1034. }
  1035. line << " ";
  1036. }
  1037. else line << "削除";
  1038. line << "<>";
  1039. if (number == 1) {
  1040. std::string tmp(title);
  1041. escapeForHTML(tmp);
  1042. line << tmp;
  1043. }
  1044. line << "\n";
  1045. prevNumber = number;
  1046. char *lineSJIS = convertUTF8ToShiftJISWithNCR(line.str().c_str(), line.str().size());
  1047. if (lineSJIS) {
  1048. lastLine = lineSJIS;
  1049. out += lastLine;
  1050. free(lineSJIS);
  1051. } else {
  1052. lastLine = "broken<><>broken<> broken <>\n";
  1053. out += lastLine;
  1054. }
  1055. if (useCache && startFrom == number) {
  1056. PBBS2chProxyThreadInfo info = _threadCache->pop(_threadKey);
  1057. bool hit = false;
  1058. if (info) {
  1059. log_printf(5, "cache hit");
  1060. if (info->cachedData.size() == lastLine.size()) {
  1061. log_printf(5, "... size match");
  1062. if (info->cachedData == lastLine) {
  1063. log_printf(5, "... content match");
  1064. hit = true;
  1065. cachedSize = info->cachedSize - lastLine.size();
  1066. }
  1067. }
  1068. log_printf(5, "\n");
  1069. }
  1070. if (!hit) {
  1071. return "";
  1072. }
  1073. }
  1074. }
  1075. if (!lastLine.empty()) {
  1076. PBBS2chProxyThreadInfo info(new BBS2chProxyThreadInfo());
  1077. info->lastResNum = prevNumber;
  1078. info->cachedSize = out.size() + cachedSize;
  1079. info->cachedData = lastLine;
  1080. _threadCache->set(_threadKey, info);
  1081. log_printf(5, "cached thread %s (%ld bytes)\n", _threadKey.c_str(), lastLine.size());
  1082. }
  1083. if (lastModifiedOut) *lastModifiedOut = lastModified;
  1084. return out;
  1085. }