htmlParser.js 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300
  1. import * as cheerio from 'cheerio';
  2. // import jsonpath from 'jsonpath';
  3. import {urljoin} from "../utils/utils.js";
  4. import '../libs_drpy/jsonpathplus.min.js'
  /**
   * Minimal `jsonpath`-style facade: exposes a single `query(json, path)` call
   * and forwards it to the global `JSONPath.JSONPath` function.
   * NOTE(review): the `JSONPath` global is presumably installed by the
   * side-effect import of `jsonpathplus.min.js` — confirm.
   */
  export const jsonpath = {
    /**
     * @param {*} jsonObject - Data to search.
     * @param {string} path - JSONPath expression.
     * @returns {*} Whatever `JSONPath.JSONPath` returns for this query.
     */
    query: (jsonObject, path) => JSONPath.JSONPath({json: jsonObject, path}),
  };
  /** Parse-cache switch consulted by the parsing methods. */
  const PARSE_CACHE = true;
  /** Regex source: selector tokens matching it do not get an automatic `:eq` index appended. */
  const NOADD_INDEX = ':eq|:lt|:gt|:first|:last|:not|:even|:odd|:has|:contains|:matches|:empty|^body$|^#';
  /** Regex source: attribute names whose values are automatically passed through urljoin. */
  const URLJOIN_ATTR = '(url|src|href|-original|-src|-play|-url|style)$|^(data-|url-|src-)';
  /** Regex source: special link schemes that are filtered out and never passed through urljoin. */
  const SPECIAL_URL = '^(ftp|magnet|thunder|ws):';
  /**
   * Rule-driven extractor for HTML (through cheerio) and JSON (through the
   * module's `jsonpath` facade).
   *
   * HTML rules are strings in which `&&` separates nesting levels, `:eq(n)`
   * selects one match by index and `--selector` removes matching descendants
   * from the result. For `pdfh`/`pd` the last `&&` segment is the output
   * option: `Text`, `Html`, or one or more attribute names joined with `||`.
   */
  class Jsoup {
    /**
     * @param {string} [MY_URL=''] - Base URL used by `pd` and `pj` when no
     *   explicit base is supplied.
     */
    constructor(MY_URL = '') {
      this.MY_URL = MY_URL;
      // Most recent HTML input and its parsed document, one slot per entry point.
      this.pdfh_html = '';
      this.pdfa_html = '';
      this.pdfh_doc = null;
      this.pdfa_doc = null;
    }

    /**
     * Case-insensitive, multiline regex test.
     * @param {string} text - Regex source.
     * @param {string} string - Subject to search.
     * @returns {boolean} True when the pattern matches anywhere in `string`.
     */
    test(text, string) {
      return new RegExp(text, 'mi').test(string);
    }

    /**
     * @param {string} text - Haystack.
     * @param {string} match - Needle.
     * @returns {boolean} True when `match` occurs in `text`.
     */
    contains(text, match) {
      return text.indexOf(match) !== -1;
    }

    /**
     * Converts a `&&`-separated rule into a space-separated selector chain,
     * appending `:eq(0)` to levels whose last token does not match NOADD_INDEX.
     * @param {string} parse - Rule text.
     * @param {boolean} [first=false] - When true the final level is also pinned
     *   to its first match; when false the final level is left as a list.
     * @returns {string} The converted selector chain.
     */
    parseHikerToJq(parse, first = false) {
      if (!this.contains(parse, '&&')) {
        const lastToken = parse.split(' ').pop();
        return first && !this.test(NOADD_INDEX, lastToken) ? `${parse}:eq(0)` : parse;
      }
      const levels = parse.split('&&');
      const lastLevel = levels.length - 1;
      return levels
        .map((level, i) => {
          const lastToken = level.split(' ').pop();
          if (this.test(NOADD_INDEX, lastToken)) return level;
          // Only the final level may stay un-indexed, and only in list mode.
          const keepAsList = !first && i >= lastLevel;
          return keepAsList ? level : `${level}:eq(0)`;
        })
        .join(' ');
    }

    /**
     * Splits one selector token into its base selector, `:eq` index and
     * `--` exclusions.
     * @param {string} nparse - A single token such as `li:eq(2)--span`.
     * @returns {{nparse_rule: string, nparse_index: number, excludes: string[]}}
     *   `nparse_index` is 0 when there is no `:eq(...)` or its argument is not
     *   an integer.
     */
    getParseInfo(nparse) {
      let excludes = [];
      let nparse_index = 0;
      let nparse_rule = nparse;
      if (this.contains(nparse, ':eq')) {
        const eqParts = nparse.split(':eq');
        nparse_rule = eqParts[0];
        let nparse_pos = eqParts[1];
        // Exclusions may be written before or after the :eq(...) part.
        if (this.contains(nparse_rule, '--')) {
          [nparse_rule, ...excludes] = nparse_rule.split('--');
        } else if (this.contains(nparse_pos, '--')) {
          [nparse_pos, ...excludes] = nparse_pos.split('--');
        }
        const rawIndex = nparse_pos.split('(')[1]?.split(')')[0];
        const parsedIndex = Number.parseInt(rawIndex, 10);
        // A missing or non-numeric index keeps the default of 0 instead of leaking NaN.
        if (!Number.isNaN(parsedIndex)) nparse_index = parsedIndex;
      } else if (this.contains(nparse, '--')) {
        [nparse_rule, ...excludes] = nparse.split('--');
      }
      return {nparse_rule, nparse_index, excludes};
    }

    /**
     * Rewrites every adjacent `:gt(a):lt(b)` pair as `:lt(b):gt(a)`.
     * @param {string} selector - Selector text.
     * @returns {string} Selector with all such pairs swapped.
     */
    reorderAdjacentLtAndGt(selector) {
      const adjacentPattern = /:gt\((\d+)\):lt\((\d+)\)/;
      let reordered = selector;
      // Each pass swaps the first remaining pair; a swap can expose a new pair
      // (e.g. `:gt(1):lt(5):lt(3)`), hence the loop.
      while (adjacentPattern.test(reordered)) {
        reordered = reordered.replace(adjacentPattern, ':lt($2):gt($1)');
      }
      return reordered;
    }

    /**
     * Applies one selector token to the document or to a previous selection.
     * @param {Function} doc - Loaded cheerio document.
     * @param {string} nparse - Selector token (see `getParseInfo`).
     * @param {object|null} ret - Previous selection, or a falsy value to start
     *   from the document root.
     * @returns {object} The resulting selection. When exclusions are present
     *   it is a clone, so the source document is not modified.
     */
    parseOneRule(doc, nparse, ret) {
      const {nparse_rule, nparse_index, excludes} = this.getParseInfo(nparse);
      const selector = this.reorderAdjacentLtAndGt(nparse_rule);
      let matched = ret ? ret.find(selector) : doc(selector);
      if (this.contains(nparse, ':eq')) matched = matched.eq(nparse_index);
      if (excludes.length > 0 && matched) {
        matched = matched.clone();
        for (const exclude of excludes) {
          matched.find(exclude).remove();
        }
      }
      return matched;
    }

    /**
     * Collapses every whitespace run to a single space and drops leading
     * whitespace. Trailing whitespace is kept as one space.
     * @param {string} text
     * @returns {string}
     */
    parseText(text) {
      return text.replace(/\s+/g, ' ').trimStart();
    }

    /**
     * Returns the outer HTML of every element matched by a list rule.
     * With PARSE_CACHE on, the parsed document of the latest `html` is reused.
     * @param {string} html - HTML source.
     * @param {string} parse - List rule.
     * @returns {string[]} One HTML string per match; `[]` for empty input.
     */
    pdfa(html, parse) {
      if (!html || !parse) return [];
      const selector = this.parseHikerToJq(parse);
      let doc = PARSE_CACHE && this.pdfa_html === html ? this.pdfa_doc : null;
      if (!doc) {
        doc = cheerio.load(html);
        if (PARSE_CACHE) {
          this.pdfa_html = html;
          this.pdfa_doc = doc;
        }
      }
      let ret = null;
      for (const nparse of selector.split(' ')) {
        ret = this.parseOneRule(doc, nparse, ret);
        if (!ret) return [];
      }
      return ret.toArray().map((item) => `${doc(item)}`);
    }

    /**
     * Returns the outer HTML of every element matched by a list rule, without
     * touching the parse cache.
     * @param {string} html - HTML source.
     * @param {string} parse - List rule.
     * @param {*} [list_text] - Unused; kept for call compatibility.
     * @param {*} [list_url] - Unused; kept for call compatibility.
     * @param {*} [url_key] - Unused; kept for call compatibility.
     * @returns {string[]} One HTML string per match; `[]` for empty input.
     */
    pdfl(html, parse, list_text, list_url, url_key) {
      if (!html || !parse) return [];
      const selector = this.parseHikerToJq(parse, false);
      const doc = cheerio.load(html);
      let ret = null;
      for (const nparse of selector.split(' ')) {
        ret = this.parseOneRule(doc, nparse, ret);
        if (!ret) return [];
      }
      return ret.toArray().map((element) => `${doc(element)}`);
    }

    /**
     * Extracts a single value (text, inner HTML, attribute or outer HTML) from
     * the first element matched by a rule.
     * With PARSE_CACHE on, the parsed document of the latest `html` is reused.
     * @param {string} html - HTML source.
     * @param {string} parse - Rule; the last `&&` segment is the option
     *   (`Text`, `Html`, or attribute names joined with `||`, first non-empty
     *   one wins). Without an option the outer HTML is returned.
     * @param {string} [baseUrl=''] - When set, URL-like attribute values are
     *   made absolute against it.
     * @returns {string} The extracted value; `''` for empty input.
     *   NOTE(review): the `Html` branches return cheerio's `.html()` result
     *   as-is, which may be null when nothing matched — confirm with callers.
     */
    pdfh(html, parse, baseUrl = '') {
      if (!html || !parse) return '';
      let doc = PARSE_CACHE && this.pdfh_html === html ? this.pdfh_doc : null;
      if (!doc) {
        doc = cheerio.load(html);
        if (PARSE_CACHE) {
          this.pdfh_html = html;
          this.pdfh_doc = doc;
        }
      }
      if (parse === 'body&&Text' || parse === 'Text') {
        return this.parseText(doc.text());
      }
      if (parse === 'body&&Html' || parse === 'Html') {
        return doc.html();
      }

      let option;
      let rule = parse;
      if (this.contains(rule, '&&')) {
        const parts = rule.split('&&');
        option = parts.pop();
        rule = parts.join('&&');
      }
      const selector = this.parseHikerToJq(rule, true);
      let ret = null;
      for (const nparse of selector.split(' ')) {
        ret = this.parseOneRule(doc, nparse, ret);
        if (!ret) return '';
      }

      // Always return a string here, never the selection object itself.
      if (!option) return `${ret}`;
      if (option === 'Text') return this.parseText(ret.text());
      if (option === 'Html') return ret.html();

      let value = '';
      for (const opt of option.split('||')) {
        value = ret.attr(opt) || '';
        // For style-like attributes, pull the target out of `url(...)`.
        if (this.contains(opt.toLowerCase(), 'style') && this.contains(value, 'url(')) {
          const urlMatch = value.match(/url\((.*?)\)/);
          if (urlMatch) value = urlMatch[1].replace(/^['"]|['"]$/g, '');
        }
        if (value && baseUrl) {
          const needAdd = this.test(URLJOIN_ATTR, opt) && !this.test(SPECIAL_URL, value);
          if (needAdd) {
            // A value that embeds an absolute link is cut down to that link.
            value = value.includes('http') ? value.slice(value.indexOf('http')) : urljoin(baseUrl, value);
          }
        }
        if (value) break;
      }
      return value;
    }

    /**
     * Same as `pdfh`, but falls back to `this.MY_URL` when `baseUrl` is empty.
     * @param {string} html
     * @param {string} parse
     * @param {string} [baseUrl='']
     * @returns {string}
     */
    pd(html, parse, baseUrl = '') {
      return this.pdfh(html, parse, baseUrl || this.MY_URL);
    }

    /**
     * @param {string} html - HTML source.
     * @returns {Function} The document produced by `cheerio.load`.
     */
    pq(html) {
      return cheerio.load(html);
    }

    /**
     * Extracts the first truthy value found by one of several JSONPath
     * alternatives.
     * @param {string|object} html - JSON text or already-parsed data.
     * @param {string} parse - One or more paths joined with `||`; each one
     *   gets a `$.` prefix when it lacks it.
     * @param {boolean} [addUrl=false] - When true the value is joined onto
     *   `this.MY_URL`.
     * @returns {*} The first truthy value, or `''` when none is found or the
     *   input is not valid JSON.
     */
    pjfh(html, parse, addUrl = false) {
      if (!html || !parse) return '';
      let json;
      try {
        json = typeof html === 'string' ? JSON.parse(html) : html;
      } catch {
        console.log('字符串转 JSON 失败');
        return '';
      }
      let ret = '';
      for (const alternative of parse.split('||')) {
        // Prefix every alternative, not just the first one.
        const path = alternative.startsWith('$.') ? alternative : `$.${alternative}`;
        const queryResult = jsonpath.query(json, path);
        ret = Array.isArray(queryResult) ? queryResult[0] || '' : queryResult || '';
        if (addUrl && ret) ret = urljoin(this.MY_URL, ret);
        if (ret) break;
      }
      return ret;
    }

    /**
     * Same as `pjfh` with URL joining enabled.
     * @param {string|object} html
     * @param {string} parse
     * @returns {*}
     */
    pj(html, parse) {
      return this.pjfh(html, parse, true);
    }

    /**
     * Runs one JSONPath query and returns its matches as a list.
     * @param {string|object} html - JSON text or already-parsed data.
     * @param {string} parse - Path; gets a `$.` prefix when it lacks it.
     * @returns {Array} The matches. A result holding exactly one array is
     *   unwrapped to that array. `[]` for empty or invalid input.
     */
    pjfa(html, parse) {
      if (!html || !parse) return [];
      let json;
      try {
        json = typeof html === 'string' ? JSON.parse(html) : html;
      } catch {
        return [];
      }
      const path = parse.startsWith('$.') ? parse : `$.${parse}`;
      const result = jsonpath.query(json, path);
      if (Array.isArray(result) && result.length === 1 && Array.isArray(result[0])) {
        return result[0];
      }
      return result || [];
    }
  }
  /**
   * Public entry point of this module: the `Jsoup` class, exported under the
   * lower-case name `jsoup`.
   */
  export const jsoup = Jsoup;
  // The default-export variant is kept disabled:
  // export default Jsoup;