123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158 |
- import 'assets://js/lib/uri.min.js'
- import cheerio from 'assets://js/lib/cheerio.min.js';
- import 'assets://js/lib/crypto-js.js'
/**
 * Alphabet used by the random-string helpers.
 *
 * Layout matters: the ten digits must remain the LAST ten characters, because
 * `randIndex` treats every index at or beyond `charStr.length - 10` as a digit
 * and refuses to use one for the first character of a generated string.
 *
 * NOTE(review): the literal contains no 'i' / 'I' and lists 'a' twice
 * ("aba..."). The duplicate looks unintentional, but it is kept as-is here;
 * confirm before changing the alphabet.
 */
const charStr = 'abacdefghjklmnopqrstuvwxyzABCDEFGHJKLMNOPQRSTUVWXYZ0123456789';
/**
 * Pick a random index into `charStr` within the inclusive range [min, max].
 *
 * For the first character of a string (`i` equal to 0) the index is never
 * taken from the digit region, i.e. the last ten characters of `charStr`.
 *
 * The previous implementation re-rolled by calling itself until a non-digit
 * index came up. When the requested range contained no non-digit index that
 * recursion never terminated and ended in a stack overflow. The range is now
 * clamped up front and sampled once, which yields the same uniform
 * distribution over the allowed indices for integer bounds.
 *
 * @param {number} min - Lowest index that may be returned (expected <= max).
 * @param {number} max - Highest index that may be returned.
 * @param {number} i - Position of the character being generated.
 * @returns {number} A random integer index.
 * @throws {RangeError} If `i` is 0 and [min, max] lies entirely inside the
 *   digit region, so no valid first-character index exists.
 */
export function randIndex(min, max, i) {
  const numStart = charStr.length - 10;
  let upper = max;

  // Loose equality is kept on purpose so that callers passing the position as
  // a string ('0') keep getting first-character treatment.
  // eslint-disable-next-line eqeqeq
  if (i == 0 && max >= numStart) {
    if (min >= numStart) {
      throw new RangeError(
        `randIndex: range [${min}, ${max}] contains no non-digit index for the first character`
      );
    }
    upper = numStart - 1;
  }

  return Math.floor(Math.random() * (upper - min + 1) + min);
}
/**
 * Build a random string out of the characters in `charStr`.
 *
 * Each character index comes from `randIndex`, which is told the position of
 * the character so that it can keep a digit out of the first slot.
 *
 * @param {number} [len] - Desired length; any falsy value (including 0)
 *   falls back to 15.
 * @returns {string} The generated string.
 */
export function randomStr(len) {
  const size = len || 15;
  const lowest = 0;
  const highest = charStr.length - 1;

  let text = '';
  let pos = 0;
  while (pos < size) {
    const picked = randIndex(lowest, highest, pos);
    text += charStr[picked];
    pos += 1;
  }
  return text;
}
/**
 * Turn `url` into an absolute URL using `base` for whatever is missing.
 *
 * Both arguments are trimmed and have trailing '/' removed via `rstrip`
 * (NOTE(review): `rstrip` is not a standard String method; it is presumably
 * installed on String.prototype by the host runtime — confirm).
 *
 * Resolution rules, in order:
 *  - `http://` / `https://` URLs are used as they are;
 *  - `://...` gets the base protocol in front;
 *  - `//...` gets the base protocol and ':' in front;
 *  - anything else is appended after protocol, host and optional port of the
 *    base, separated by '/'.
 * Afterwards an empty path or query on the result is filled in from the base.
 *
 * @param {string} base - URL supplying protocol/host/port and fallbacks.
 * @param {string} url - Absolute, protocol-relative or relative URL.
 * @returns {string} The joined URL as produced by `Uri#toString`.
 */
export function urljoin(base, url) {
  const baseUri = new Uri((base || '').trim().rstrip('/'));
  const target = (url || '').trim().rstrip('/');

  let absolute;
  if (target.startsWith('http://') || target.startsWith('https://')) {
    absolute = target;
  } else if (target.startsWith('://')) {
    absolute = `${baseUri.protocol()}${target}`;
  } else if (target.startsWith('//')) {
    absolute = `${baseUri.protocol()}:${target}`;
  } else {
    const scheme = baseUri.protocol();
    const host = baseUri.host();
    const port = baseUri.port();
    const portPart = port ? `:${port}` : '';
    absolute = `${scheme}://${host}${portPart}/${target}`;
  }

  const joined = new Uri(absolute);

  // Inherit the base path only when the result has none (or only whitespace).
  const ownPath = joined.path();
  const pathMissing = !ownPath || ownPath.trim().length === 0;
  if (pathMissing && baseUri.path()) {
    joined.path(baseUri.path());
  }

  // Likewise inherit the base query string when the result has none.
  if (!joined.query() && baseUri.query()) {
    joined.query(baseUri.query());
  }

  return joined.toString();
}
// Matches option/attribute names that end in a URL-carrying attribute
// (url, src, href, data-original, data-src); values read through such an
// option are candidates for being resolved to an absolute URL.
const DOM_CHECK_ATTR = /(url|src|href|data-original|data-src)$/;

// Detect a rule segment that already carries a positional filter
// (:eq / :lt / :gt) or, for SELECT_REGEX only, an id ('#'). Segments without
// one get ':eq(0)' appended by the parsers.
//
// These patterns are only ever used with `.test()`, so they must NOT carry the
// `g` flag: a global regex remembers `lastIndex` between calls, and after one
// successful match the next `.test()` on a different string starts searching
// in the middle of it and can wrongly report "no match".
const SELECT_REGEX = /:eq|:lt|:gt|#/;
const SELECT_REGEX_A = /:eq|:lt|:gt/;
/**
 * Extract one value from HTML using a `&&`-separated rule.
 *
 * Rule shape: `selector&&selector&&...&&Option`. The last segment is the
 * option: 'Text' reads the text, 'Html' the inner HTML, anything else is read
 * as an attribute name. Every selector segment that has no positional filter
 * (:eq / :lt / :gt) and no '#' gets ':eq(0)' appended; the segments are then
 * joined with spaces into one selector. A rule without '&&' has no option and
 * the matched node is serialised with `toString()`.
 *
 * `html` is either an HTML string (loaded with cheerio) or an element handle
 * `{ rr, ele }` as produced by `pdfa`, where `rr` is the cheerio instance and
 * `ele` the node. For element handles a leading 'body&&' is dropped, and a
 * rule of just 'body&&Option' reads the option from the element itself.
 *
 * When `base_url` is given and the option names a URL attribute
 * (DOM_CHECK_ATTR), the value is cut to start at its first 'http' if it
 * contains one, otherwise it is resolved with `urljoin(base_url, value)`.
 *
 * @param {string|{rr: Function, ele: *}} html - HTML source or element handle.
 * @param {string} parse - The extraction rule.
 * @param {string} [base_url] - Base used to absolutise URL attributes.
 * @returns {*} The extracted value; '' for an empty rule. Whatever cheerio
 *   returns for a missing attribute / empty selection is passed through.
 */
export function pdfh(html, parse, base_url) {
  if (!parse || !parse.trim()) {
    return '';
  }

  const eleFind = typeof html === 'object';
  let selector = parse;
  let option;

  if (eleFind && selector.startsWith('body&&')) {
    selector = selector.slice(6);
    if (!selector.includes('&&')) {
      // Only an option is left: read it from the element itself.
      option = selector.trim();
      selector = '*=*';
    }
  }

  if (selector.includes('&&')) {
    const steps = selector.split('&&');
    option = steps.pop();
    // `search()` always scans from the start and leaves `lastIndex` alone, so
    // the check stays correct even if SELECT_REGEX carries the `g` flag.
    // Calling `.test()` on a global regex here made a segment that follows an
    // already-filtered one (e.g. 'a:eq(1)&&b:eq(2)') get a bogus ':eq(0)'.
    selector = steps
      .map((step) => (step.search(SELECT_REGEX) === -1 ? step + ':eq(0)' : step))
      .join(' ');
  }

  const $ = eleFind ? html.rr : cheerio.load(html);

  let ret;
  if (!eleFind) {
    ret = $(selector);
  } else if (selector === '*=*' || $(html.ele).is(selector)) {
    ret = html.ele;
  } else {
    ret = $(html.ele).find(selector);
  }

  if (!option) {
    return $(ret).toString();
  }

  let result;
  if (option === 'Text') {
    result = $(ret).text();
  } else if (option === 'Html') {
    result = $(ret).html();
  } else {
    result = $(ret).attr(option);
  }

  if (result && base_url && DOM_CHECK_ATTR.test(option)) {
    result = /http/.test(result)
      ? result.slice(result.indexOf('http'))
      : urljoin(base_url, result);
  }
  return result;
}
/**
 * Select a list of elements from HTML using a `&&`-separated rule.
 *
 * Every segment except the last that has no positional filter
 * (:eq / :lt / :gt) gets ':eq(0)' appended; the segments are then joined with
 * spaces into one selector. The last segment is left untouched so that it can
 * match many nodes.
 *
 * `html` is either an HTML string (loaded with cheerio) or an element handle
 * `{ rr, ele }` from an earlier `pdfa` call. For a handle, the element itself
 * is the result when it matches the selector; otherwise its descendants are
 * searched.
 *
 * @param {string|{rr: Function, ele: *}} html - HTML source or element handle.
 * @param {string} parse - The selection rule.
 * @returns {Array<{rr: Function, ele: *}>} One handle per matched node
 *   (`rr` is the cheerio instance, `ele` the node); [] for an empty rule.
 */
export function pdfa(html, parse) {
  if (!parse || !parse.trim()) {
    return [];
  }

  const eleFind = typeof html === 'object';
  let selector = parse;

  if (selector.includes('&&')) {
    const steps = selector.split('&&');
    const lastStep = steps.length - 1;
    // `search()` is used instead of `.test()` because it ignores `lastIndex`:
    // with a global regex, `.test()` carried state from one segment to the
    // next and could append ':eq(0)' to a segment that already had a filter.
    selector = steps
      .map((step, idx) =>
        idx < lastStep && step.search(SELECT_REGEX_A) === -1 ? step + ':eq(0)' : step
      )
      .join(' ');
  }

  const $ = eleFind ? html.rr : cheerio.load(html);

  let matched;
  if (eleFind) {
    // Keep the cheerio wrapper when the element itself matches: `html.ele` is
    // the node handed to the `.each` callback and has no `.each` of its own,
    // so iterating it directly would throw.
    const root = $(html.ele);
    matched = root.is(selector) ? root : root.find(selector);
  } else {
    matched = $(selector);
  }

  const result = [];
  if (matched) {
    matched.each((idx, ele) => {
      result.push({ rr: $, ele });
    });
  }
  return result;
}
/**
 * Default parser bundle: `pdfh` (single value), `pdfa` (element list) and
 * `pd` (single value, with URL attributes made absolute).
 */
const defaultParser = {
  pdfh,
  pdfa,

  /**
   * Like `pdfh`, but when the rule ends in a URL attribute (DOM_CHECK_ATTR)
   * the value is made absolute: if it contains 'http' it is cut to start
   * there, otherwise it is joined onto a base URL with `urljoin`.
   *
   * @param {*} html - HTML source or element handle, passed through to pdfh.
   * @param {string} parse - The extraction rule.
   * @param {string} [uri] - Base URL for relative values. When omitted or
   *   falsy the global `MY_URL` is used (or '' if that global is not defined).
   *   Previously this argument was accepted but ignored.
   * @returns {*} The extracted, possibly absolutised, value.
   */
  pd(html, parse, uri) {
    // `pd` is also handed out as a bare function, in which case `this` is
    // undefined in module code; fall back to the default parser then.
    const parser = this && typeof this.pdfh === 'function' ? this : defaultParser;
    let ret = parser.pdfh(html, parse);

    if (DOM_CHECK_ATTR.test(parse)) {
      if (/http/.test(ret)) {
        ret = ret.slice(ret.indexOf('http'));
      } else {
        // `typeof` guard: reading an undeclared global would throw.
        const fallbackBase = typeof MY_URL !== 'undefined' ? MY_URL : '';
        ret = urljoin(uri || fallbackBase, ret);
      }
    }
    return ret;
  },
};
// Publish the helpers on the global object so that code evaluated outside
// this module can call them by bare name.
globalThis.randIndex = randIndex;
globalThis.randomStr = randomStr;
globalThis.urljoin = urljoin;
globalThis.joinUrl = urljoin; // alias of urljoin
globalThis.defaultParser = defaultParser;
globalThis.pdfa = defaultParser.pdfa;
globalThis.pdfh = defaultParser.pdfh;
// `pd` looks up `pdfh` through `this`. A bare `pd(...)` call passes no
// receiver, and a function defined in module code does not fall back to the
// global object, so the method is bound to its owner before being published.
globalThis.pd = defaultParser.pd.bind(defaultParser);
|