123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300 |
- import * as cheerio from 'cheerio';
- // import jsonpath from 'jsonpath';
- import {urljoin} from "../utils/utils.js";
- import '../libs_drpy/jsonpathplus.min.js'
/**
 * Thin adapter exposing a `query(json, path)` call on top of the global
 * `JSONPath` object.
 * NOTE(review): `JSONPath` is not imported by name here; it is presumably
 * registered globally by the side-effect import of the jsonpathplus bundle.
 */
export const jsonpath = {
    /**
     * Evaluate a JSONPath expression against a JSON value.
     *
     * @param {*} jsonObject - Already-parsed JSON value to search.
     * @param {string} path - JSONPath expression.
     * @returns {*} Whatever `JSONPath.JSONPath` returns for the request.
     */
    query(jsonObject, path) {
        const request = {path, json: jsonObject};
        return JSONPath.JSONPath(request);
    },
};
/** Parse cache switch. */
const PARSE_CACHE = true;

/**
 * Regex source: when the last token of a rule segment matches this pattern,
 * no `:eq(0)` index is appended to it automatically.
 */
const NOADD_INDEX = [
    ':eq',
    ':lt',
    ':gt',
    ':first',
    ':last',
    ':not',
    ':even',
    ':odd',
    ':has',
    ':contains',
    ':matches',
    ':empty',
    '^body$',
    '^#',
].join('|');

/** Regex source: attribute names whose values should be passed through urljoin. */
const URLJOIN_ATTR = `(${['url', 'src', 'href', '-original', '-src', '-play', '-url', 'style'].join('|')})$|^(${['data-', 'url-', 'src-'].join('|')})`;

/** Regex source: special link schemes that are filtered out and never urljoin-ed. */
const SPECIAL_URL = `^(${['ftp', 'magnet', 'thunder', 'ws'].join('|')}):`;
/**
 * Rule-driven extraction helper built on cheerio (HTML) and JSONPath (JSON).
 *
 * HTML rules are strings in which `&&` chains selector segments, `--` lists
 * descendants to strip from a match, and (for pdfh) the final `&&` part names
 * what to read: `Text`, `Html`, or one or more `||`-separated attributes.
 */
class Jsoup {
    /**
     * @param {string} [MY_URL=''] - Base URL used by pd() and pj() to absolutize links.
     */
    constructor(MY_URL = '') {
        this.MY_URL = MY_URL;
        // One-entry parse caches: the last HTML given to pdfh/pdfa and its loaded document.
        this.pdfh_html = '';
        this.pdfa_html = '';
        this.pdfh_doc = null;
        this.pdfa_doc = null;
    }

    /**
     * Test `string` against the regex source `text` (flags: multiline, ignore case).
     *
     * @param {string} text - Regular-expression source.
     * @param {string} string - Text to search.
     * @returns {boolean} True when the pattern matches anywhere in `string`.
     */
    test(text, string) {
        return new RegExp(text, 'mi').test(string);
    }

    /**
     * @param {string} text - Haystack.
     * @param {string} match - Needle.
     * @returns {boolean} True when `match` occurs in `text`.
     */
    contains(text, match) {
        return text.indexOf(match) !== -1;
    }

    /**
     * Convert a `&&`-chained rule into a space-separated selector chain,
     * appending `:eq(0)` to segments whose last token is not covered by NOADD_INDEX.
     *
     * @param {string} parse - Rule string.
     * @param {boolean} [first=false] - When true the final segment is indexed as
     *     well (single-element lookup); when false it is left unindexed so that
     *     every match is kept.
     * @returns {string} The converted selector chain.
     */
    parseHikerToJq(parse, first = false) {
        const needsIndex = (segment) => {
            const tokens = segment.split(' ');
            return !this.test(NOADD_INDEX, tokens[tokens.length - 1]);
        };
        if (!this.contains(parse, '&&')) {
            return first && needsIndex(parse) ? `${parse}:eq(0)` : parse;
        }
        const segments = parse.split('&&');
        const lastPos = segments.length - 1;
        return segments
            .map((segment, pos) => {
                const keepAllMatches = !first && pos >= lastPos;
                return needsIndex(segment) && !keepAllMatches ? `${segment}:eq(0)` : segment;
            })
            .join(' ');
    }

    /**
     * Split one selector segment into its selector, its `:eq(n)` index and its
     * `--` exclusions.
     *
     * @param {string} nparse - A single segment, e.g. `li:eq(2)--span`.
     * @returns {{nparse_rule: string, nparse_index: number, excludes: string[]}}
     *     `nparse_index` is 0 when the segment has no usable `:eq(n)`.
     */
    getParseInfo(nparse) {
        let excludes = [];
        let nparse_index = 0;
        let nparse_rule = nparse;
        if (this.contains(nparse, ':eq')) {
            const eqParts = nparse.split(':eq');
            nparse_rule = eqParts[0];
            let nparse_pos = eqParts[1];
            // Exclusions may be written before or after the `:eq(n)` part.
            if (this.contains(nparse_rule, '--')) {
                const ruleParts = nparse_rule.split('--');
                nparse_rule = ruleParts[0];
                excludes = ruleParts.slice(1);
            } else if (this.contains(nparse_pos, '--')) {
                const posParts = nparse_pos.split('--');
                nparse_pos = posParts[0];
                excludes = posParts.slice(1);
            }
            // A missing or non-numeric index keeps the default 0 instead of becoming NaN.
            const parsedIndex = Number.parseInt(nparse_pos.split('(')[1]?.split(')')[0], 10);
            if (!Number.isNaN(parsedIndex)) nparse_index = parsedIndex;
        } else if (this.contains(nparse, '--')) {
            const ruleParts = nparse.split('--');
            nparse_rule = ruleParts[0];
            excludes = ruleParts.slice(1);
        }
        return {nparse_rule, nparse_index, excludes};
    }

    /**
     * Rewrite every adjacent `:gt(a):lt(b)` pair as `:lt(b):gt(a)`.
     *
     * @param {string} selector - Selector text.
     * @returns {string} The selector with all such pairs swapped.
     */
    reorderAdjacentLtAndGt(selector) {
        const adjacentPattern = /:gt\((\d+)\):lt\((\d+)\)/;
        let reordered = selector;
        // A swap can expose a new adjacent pair, so repeat until none is left.
        while (adjacentPattern.test(reordered)) {
            reordered = reordered.replace(adjacentPattern, ':lt($2):gt($1)');
        }
        return reordered;
    }

    /**
     * Apply one selector segment to the document or to a previous selection.
     *
     * @param {Function} doc - Loaded cheerio document.
     * @param {string} nparse - Segment as produced by parseHikerToJq.
     * @param {?Object} ret - Previous selection to search within, or a falsy
     *     value to search from the document root.
     * @returns {Object} The resulting selection; it is a clone with the excluded
     *     descendants removed when the segment lists `--` exclusions.
     */
    parseOneRule(doc, nparse, ret) {
        const {nparse_rule, nparse_index, excludes} = this.getParseInfo(nparse);
        const selector = this.reorderAdjacentLtAndGt(nparse_rule);
        let matched = ret ? ret.find(selector) : doc(selector);
        if (this.contains(nparse, ':eq')) matched = matched.eq(nparse_index);
        if (excludes.length > 0 && matched) {
            // Work on a clone so the (possibly cached) document is never mutated.
            matched = matched.clone();
            for (const exclude of excludes) {
                matched.find(exclude).remove();
            }
        }
        return matched;
    }

    /**
     * Collapse every whitespace run to a single space and drop leading whitespace.
     * Trailing whitespace is not stripped: it is left as one trailing space.
     *
     * @param {string} text - Raw text.
     * @returns {string} Normalized text.
     */
    parseText(text) {
        return text.replace(/\s+/g, ' ').trimStart();
    }

    /**
     * Load `html` with cheerio, reusing the document cached in the given slot
     * when the same HTML is seen again and PARSE_CACHE is enabled.
     *
     * @param {string} html - Markup to load.
     * @param {string} slot - Cache slot name: 'pdfa' or 'pdfh'.
     * @returns {Function} Loaded cheerio document.
     */
    _loadDoc(html, slot) {
        if (typeof PARSE_CACHE === 'undefined' || !PARSE_CACHE) return cheerio.load(html);
        const htmlKey = `${slot}_html`;
        const docKey = `${slot}_doc`;
        if (this[htmlKey] !== html || !this[docKey]) {
            // Load first so a throwing load never leaves a stale key/document pair.
            this[docKey] = cheerio.load(html);
            this[htmlKey] = html;
        }
        return this[docKey];
    }

    /**
     * Run a space-separated selector chain, each segment searching inside the
     * previous segment's result.
     *
     * @param {Function} doc - Loaded cheerio document.
     * @param {string} jqRule - Output of parseHikerToJq.
     * @returns {?Object} The final selection, or a falsy value if a step produced none.
     */
    _select(doc, jqRule) {
        let ret = null;
        for (const nparse of jqRule.split(' ')) {
            ret = this.parseOneRule(doc, nparse, ret);
            if (!ret) break;
        }
        return ret;
    }

    /**
     * Read the first non-empty attribute among `||`-separated candidates.
     *
     * @param {Object} node - Selection to read from.
     * @param {string} option - Attribute name(s), e.g. `data-src||src`.
     * @param {string} baseUrl - Base for resolving relative links; '' disables resolution.
     * @returns {string} The attribute value, or '' when every candidate is empty.
     */
    _readAttr(node, option, baseUrl) {
        for (const opt of option.split('||')) {
            let value = node.attr(opt) || '';
            if (this.contains(opt.toLowerCase(), 'style') && this.contains(value, 'url(')) {
                // Pull the target out of a CSS url(...) and strip surrounding quotes.
                const urlMatch = value.match(/url\((.*?)\)/);
                if (urlMatch) value = urlMatch[1].replace(/^['"]|['"]$/g, '');
            }
            if (value && baseUrl && this.test(URLJOIN_ATTR, opt) && !this.test(SPECIAL_URL, value)) {
                // A value that embeds an http link is cut down to that link; anything
                // else is resolved against baseUrl.
                value = value.includes('http') ? value.slice(value.indexOf('http')) : urljoin(baseUrl, value);
            }
            if (value) return value;
        }
        return '';
    }

    /**
     * Select every element matching a rule and return each one's markup.
     *
     * @param {string} html - Markup to search.
     * @param {string} parse - Rule string (`&&`-chained selectors).
     * @returns {string[]} Serialized markup of each match; [] on empty input.
     */
    pdfa(html, parse) {
        if (!html || !parse) return [];
        const doc = this._loadDoc(html, 'pdfa');
        const ret = this._select(doc, this.parseHikerToJq(parse));
        if (!ret) return [];
        return ret.toArray().map((item) => `${doc(item)}`);
    }

    /**
     * Same selection and return value as pdfa().
     *
     * @param {string} html - Markup to search.
     * @param {string} parse - Rule string.
     * @param {*} [list_text] - Unused; kept for signature compatibility.
     * @param {*} [list_url] - Unused; kept for signature compatibility.
     * @param {*} [url_key] - Unused; kept for signature compatibility.
     * @returns {string[]} Serialized markup of each match.
     */
    pdfl(html, parse, list_text, list_url, url_key) {
        return this.pdfa(html, parse);
    }

    /**
     * Extract a single string from HTML.
     *
     * `Text` / `body&&Text` return the document's normalized text and `Html` /
     * `body&&Html` the whole document markup. Otherwise the part after the last
     * `&&` selects what to read from the first match: `Text`, `Html`, or
     * `||`-separated attribute names. A rule without `&&` returns the match's
     * serialized markup.
     *
     * @param {string} html - Markup to search.
     * @param {string} parse - Rule string.
     * @param {string} [baseUrl=''] - When set, link-like attribute values are
     *     made absolute against it.
     * @returns {string} The extracted value, or '' when nothing is found.
     */
    pdfh(html, parse, baseUrl = '') {
        if (!html || !parse) return '';
        const doc = this._loadDoc(html, 'pdfh');
        if (parse === 'body&&Text' || parse === 'Text') {
            return this.parseText(doc.text());
        }
        if (parse === 'body&&Html' || parse === 'Html') {
            return doc.html();
        }
        let option;
        let rule = parse;
        if (this.contains(rule, '&&')) {
            const parts = rule.split('&&');
            option = parts.pop();
            rule = parts.join('&&');
        }
        const ret = this._select(doc, this.parseHikerToJq(rule, true));
        if (!ret) return '';
        // Always hand back a string, never the selection object itself.
        if (!option) return `${ret}`;
        if (option === 'Text') return this.parseText(ret.text());
        // html() yields null for an empty selection; normalize that to ''.
        if (option === 'Html') return ret.html() ?? '';
        return this._readAttr(ret, option, baseUrl);
    }

    /**
     * pdfh() with link resolution; falls back to this.MY_URL when no base is given.
     *
     * @param {string} html - Markup to search.
     * @param {string} parse - Rule string.
     * @param {string} [baseUrl=''] - Base URL override.
     * @returns {string} The extracted value.
     */
    pd(html, parse, baseUrl = '') {
        if (!baseUrl) baseUrl = this.MY_URL;
        return this.pdfh(html, parse, baseUrl);
    }

    /**
     * @param {string} html - Markup to load.
     * @returns {Function} A freshly loaded (uncached) cheerio document.
     */
    pq(html) {
        return cheerio.load(html);
    }

    /**
     * Read the first truthy value found by one of several JSONPath alternatives.
     *
     * @param {string|Object} html - JSON text or an already-parsed value.
     * @param {string} parse - JSONPath, optionally several separated by `||`;
     *     each alternative gets a `$.` prefix when it lacks one.
     * @param {boolean} [addUrl=false] - Resolve the result against this.MY_URL.
     * @returns {*} The first truthy result, or '' when none is found or the JSON
     *     text cannot be parsed.
     */
    pjfh(html, parse, addUrl = false) {
        if (!html || !parse) return '';
        let json;
        try {
            json = typeof html === 'string' ? JSON.parse(html) : html;
        } catch {
            console.log('字符串转 JSON 失败');
            return '';
        }
        for (const alternative of parse.split('||')) {
            const path = alternative.startsWith('$.') ? alternative : `$.${alternative}`;
            const queryResult = jsonpath.query(json, path);
            let ret = Array.isArray(queryResult) ? queryResult[0] || '' : queryResult || '';
            if (addUrl && ret) ret = urljoin(this.MY_URL, ret);
            if (ret) return ret;
        }
        return '';
    }

    /**
     * pjfh() with the result resolved against this.MY_URL.
     *
     * @param {string|Object} html - JSON text or an already-parsed value.
     * @param {string} parse - JSONPath alternatives.
     * @returns {*} The resolved value, or ''.
     */
    pj(html, parse) {
        return this.pjfh(html, parse, true);
    }

    /**
     * Query JSON for a list of values.
     *
     * @param {string|Object} html - JSON text or an already-parsed value.
     * @param {string} parse - JSONPath (a `$.` prefix is added when missing).
     * @returns {Array} The query result; when it is a single array wrapped in a
     *     one-element result, that inner array. [] on empty input or bad JSON.
     */
    pjfa(html, parse) {
        if (!html || !parse) return [];
        let json;
        try {
            json = typeof html === 'string' ? JSON.parse(html) : html;
        } catch {
            return [];
        }
        const path = parse.startsWith('$.') ? parse : `$.${parse}`;
        const result = jsonpath.query(json, path);
        if (Array.isArray(result) && result.length === 1 && Array.isArray(result[0])) {
            return result[0];
        }
        return result || [];
    }
}
/**
 * Named export: `jsoup` is an alias of the Jsoup class itself (not an instance).
 * This module declares no default export.
 */
export const jsoup = Jsoup;
|