// HtmlParser.cs
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Data;
  4. using System.Linq;
  5. using System.Net;
  6. using System.Net.Http.Headers;
  7. using System.Net.Security;
  8. using System.Reflection;
  9. using System.Reflection.Metadata;
  10. using System.Reflection.PortableExecutable;
  11. using System.Security.Cryptography.X509Certificates;
  12. using System.Security.Policy;
  13. using System.Text;
  14. using System.Text.RegularExpressions;
  15. using System.Threading.Tasks;
  16. using System.Xml.Linq;
  17. using Esprima.Ast;
  18. using Jint;
  19. using Jint.Native;
  20. using Jint.Runtime;
  21. using Newtonsoft.Json.Linq;
  22. using NSoup.Nodes;
  23. using NSoup;
  24. using NSoup.Select;
  25. using Document = NSoup.Nodes.Document;
  26. using RestSharp;
  27. using System.Web;
  28. using System.Net.Mime;
  29. using Newtonsoft.Json;
  30. using NSoup.Helper;
  31. using System.Text.Encodings.Web;
  32. using System.Buffers.Text;
  33. using System.Text.Json.Nodes;
  34. namespace Peach.DataAccess
  35. {
  36. //html解析器
  37. public class HtmlParser
  38. {
// Shared RestSharp client, created once here and reused by the parse helpers.
RestClient client;
// Configures the shared client: certificate validation is bypassed and a desktop
// Chrome User-Agent is set as the default.
public HtmlParser()
{
// NOTE(review): this accepts ANY server certificate, both process-wide and on the
// client options below — TLS validation is effectively disabled; confirm intended.
ServicePointManager.ServerCertificateValidationCallback = (sender, cert, chain, sslPolicyErrors) => true;
var options = new RestClientOptions()
{
RemoteCertificateValidationCallback = (a, c, d, v) => true,
MaxTimeout = 100000,
ThrowOnAnyError = true, // without this, transport errors are not raised as exceptions
UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
};
client = new RestClient(options);
//client.AddDefaultHeader("Content-Type", "application/json");
//client.AddDefaultHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
}
  54. /// <summary>
  55. /// okhttp封装的html请求,给js调用http请求的
  56. /// </summary>
  57. /// <param name="url"></param>
  58. /// <param name="opt"></param>
  59. /// <returns></returns>
  60. public object Request(string url, JsValue arguments)
  61. {
  62. Uri uri = new Uri(url);
  63. string Host = uri.Host;
  64. var method = arguments.AsObject()["method"]?.ToString();
  65. var _headers = arguments.AsObject()["headers"].AsObject();
  66. var Referer = _headers["Referer"]?.ToString();
  67. var UserAgent = _headers["User-Agent"]?.ToString();
  68. var Cookie = _headers["Cookie"]?.ToString();
  69. var ContentType = _headers["Content-Type"]?.ToString();
  70. var Data = arguments.AsObject()["data"]?.ToString();
  71. var Body = arguments.AsObject()["body"]?.ToString();
  72. var Buffer = arguments.AsObject()["buffer"]?.ToString();
  73. String charset = "utf-8";
  74. if (ContentType != null && ContentType.Split("charset=").Length > 1)
  75. {
  76. charset = ContentType.Split("charset=")[1];
  77. }
  78. var request = new RestRequest(url);
  79. if (!string.IsNullOrEmpty(Data) && !Data.Equals("undefined"))
  80. {
  81. // 序列化JSON数据
  82. string post_data = JsonConvert.SerializeObject(Data);
  83. // 将JSON参数添加至请求中
  84. request.AddParameter("application/json", post_data, ParameterType.RequestBody);
  85. }
  86. if (!string.IsNullOrEmpty(Body) && !Body.Equals("undefined"))
  87. {
  88. String[] queryS = Body.Split("&");
  89. foreach (String query in queryS)
  90. {
  91. //String query = queryS[i];
  92. int tmp = query.IndexOf("=");
  93. String key;
  94. String value;
  95. if (tmp != -1)
  96. {
  97. key = query.Substring(0, tmp);
  98. value = query[(tmp + 1)..];
  99. }
  100. else
  101. {
  102. key = query;
  103. value = "";
  104. }
  105. request.AddParameter(key, value);
  106. }
  107. }
  108. if (string.IsNullOrEmpty(UserAgent))
  109. UserAgent = "Mozilla/5.0 (Linux; Android 11; M2007J3SC Build/RKQ1.200826.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/77.0.3865.120 MQQBrowser/6.2 TBS/045714 Mobile Safari/537.36";
  110. request.AddHeader("User-Agent", UserAgent);
  111. if (!string.IsNullOrEmpty(Referer))
  112. request.AddHeader("Referer", Referer);
  113. if (!string.IsNullOrEmpty(Cookie) && !Cookie.Equals("undefined"))
  114. {
  115. client.AddDefaultHeader("Cookie", Cookie);
  116. }
  117. string rContent = "";
  118. JsObject header = new (_headers.Engine);
  119. try
  120. {
  121. var client = new RestClient(url);
  122. RestResponse? response;
  123. if (method?.ToLower() == "post")
  124. response = client.Post(request);
  125. else
  126. response = client.Get(request);
  127. //rContent = response.Content;
  128. Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
  129. rContent = HttpUtility.UrlDecode(response.RawBytes == null ? Array.Empty<byte>() : response.RawBytes,
  130. Encoding.GetEncoding(charset));
  131. if (response.Headers != null)
  132. {
  133. foreach (var item in response.Headers)
  134. {
  135. header.Set(item.Name, item.Value == null ? "" : item.Value.ToString());
  136. }
  137. }
  138. if (Buffer == "1")
  139. {
  140. return new { headers = header, content = response.RawBytes };
  141. }
  142. else if (Buffer == "2")
  143. {
  144. return new { headers = header, content = Convert.ToBase64String(Encoding.UTF8.GetBytes(rContent)) };
  145. }
  146. else
  147. {
  148. return new { headers = header, content = rContent };
  149. }
  150. }
  151. catch (Exception)
  152. { }
  153. return new { headers = header, content = "" };
  154. }
// Matches url(...) inside CSS style attribute values; group 1 is the raw URL.
private static readonly Regex p = new ("url\\((.*?)\\)", RegexOptions.Multiline | RegexOptions.Singleline);
// Rule segments that must NOT get an automatic :eq(0) index appended.
private static readonly Regex NOAdd_INDEX = new (":eq|:lt|:gt|:first|:last|^body$|^#");
// Attribute names whose values should be auto-joined against the base URL.
private static readonly Regex URLJOIN_ATTR = new ("(url|src|href|-original|-src|-play|-url|style)$", RegexOptions.Multiline | RegexOptions.IgnoreCase);
// Schemes that are already "absolute" special protocols and must never be re-joined.
private static readonly Regex SPECIAL_URL = new ("^(ftp|magnet|thunder|ws):", RegexOptions.Multiline | RegexOptions.IgnoreCase);
// One-entry parse caches: the document is re-parsed only when the html string changes.
// NOTE(review): static mutable state — not thread-safe; confirm single-threaded use.
private static String pdfh_html = "";
private static String pdfa_html = "";
private static Document? pdfh_doc = null;
private static Document? pdfa_doc = null;
  163. public static string JoinUrl(string parent, string child)
  164. {
  165. if (string.IsNullOrWhiteSpace(parent))
  166. {
  167. return child;
  168. }
  169. Uri url;
  170. string q = parent;
  171. try
  172. {
  173. url = new Uri(new Uri(parent), child);
  174. q = url.ToString();
  175. }
  176. catch (Exception)
  177. {
  178. //e.printStackTrace();
  179. }
  180. // if (q.Contains("#")) {
  181. // q = q.ReplaceAll("^(.+?)#.*?$", "$1");
  182. // }
  183. return q;
  184. }
// Result of decomposing one selector rule: the cleaned selector, the requested
// :eq() index, and any "--"-separated exclusion selectors.
public class Painfo
{
// Selector with the :eq() part and exclusions stripped.
public string? nparse_rule;
// Index extracted from :eq(n); 0 when absent or unparseable.
public int nparse_index;
// Selectors whose matches should be removed from the result; null when none.
public List<string>? excludes;
}
  191. private static Painfo GetParseInfo(string nparse)
  192. {
  193. /*
  194. 根据传入的单规则获取 parse规则,索引位置,排除列表 -- 可以用于剔除元素,支持多个,按标签剔除,按id剔除等操作
  195. :param nparse:
  196. :return:*/
  197. Painfo painfo = new Painfo();
  198. //List<string> excludes = new ArrayList<>(); //定义排除列表默认值为空
  199. //int nparse_index; //定义位置索引默认值为0
  200. painfo.nparse_rule = nparse; //定义规则默认值为本身
  201. if (nparse.Contains(":eq"))
  202. {
  203. painfo.nparse_rule = nparse.Split(":")[0];
  204. string nparse_pos = nparse.Split(":")[1];
  205. if (painfo.nparse_rule.Contains("--"))
  206. {
  207. string[] rules = painfo.nparse_rule.Split("--");
  208. painfo.excludes = rules.ToList();// new(Arrays.asList(rules));
  209. painfo.excludes.RemoveAt(0);
  210. painfo.nparse_rule = rules[0];
  211. }
  212. else if (nparse_pos.Contains("--"))
  213. {
  214. string[] rules = nparse_pos.Split("--");
  215. painfo.excludes = rules.ToList();// new ArrayList<>(Arrays.asList(rules));
  216. painfo.excludes.RemoveAt(0);
  217. nparse_pos = rules[0];
  218. }
  219. try
  220. {
  221. painfo.nparse_index = int.Parse(nparse_pos.Replace("eq(", "").Replace(")", ""));
  222. }
  223. catch (Exception)
  224. {
  225. painfo.nparse_index = 0;
  226. }
  227. }
  228. else
  229. {
  230. if (nparse.Contains("--"))
  231. {
  232. string[] rules = painfo.nparse_rule.Split("--");
  233. painfo.excludes = rules.ToList();// new ArrayList<>(Arrays.asList(rules));
  234. painfo.excludes.RemoveAt(0);
  235. painfo.nparse_rule = rules[0];
  236. }
  237. }
  238. return painfo;
  239. }
  240. //pdfh
  241. public string ParseDomForUrl(string html, string rule)
  242. {
  243. return ParseDom(html, rule, "");
  244. }
  245. //pd
  246. public string ParseDom(string html, string rule, string Add_url)
  247. {
  248. if (string.IsNullOrWhiteSpace(html)) return "";
  249. if (!pdfh_html.Equals(html))
  250. {
  251. pdfh_html = html;
  252. pdfh_doc = NSoupClient.Parse(html);
  253. }
  254. Document? doc = pdfh_doc;
  255. //Document doc = NSoupClient.Parse(html);
  256. if (rule.Equals("body&&Text") || rule.Equals("Text"))
  257. return doc.Text();
  258. else if (rule.Equals("body&&Html") || rule.Equals("Html"))
  259. return doc.Html();
  260. string option = "";
  261. if (rule.Contains("&&"))
  262. {
  263. string[] rs = rule.Split("&&");
  264. option = rs[rs.Length - 1];
  265. List<string> excludes = rs.ToList();// new ArrayList<>(Arrays.asList(rs));
  266. excludes.RemoveAt(rs.Length - 1);
  267. rule = string.Join("&&", excludes);// TextUtils.join("&&", excludes);
  268. }
  269. rule = parseHikerToJq(rule, true);
  270. string[]? parses = rule.Split(" ");
  271. Elements ret = new ();
  272. foreach (string nparse in parses)
  273. {
  274. ret = parseOneRule(doc, nparse, ret);
  275. if (ret.IsEmpty || ret.Count <= 0) return "";
  276. }
  277. if (string.IsNullOrWhiteSpace(option))
  278. return ret.OuterHtml();
  279. if (option.Equals("Text"))
  280. return ret.First.Text();
  281. else if (option.Equals("Html"))
  282. return ret.Html();
  283. else //(JSUtils.isNotEmpty(option))
  284. {
  285. string? result = ret.Attr(option);
  286. if (option.ToLower().Contains("style") && result.Contains("url("))
  287. {
  288. Match m = p.Match(result);
  289. if (m.Success)
  290. result = m.Groups[1]?.Value;
  291. result = Regex.Replace(result, "^['|\"](.*)['|\"]$", "$1");
  292. }
  293. if (!string.IsNullOrWhiteSpace(result) && !string.IsNullOrWhiteSpace(Add_url))// (JSUtils.isNotEmpty(result) && JSUtils.isNotEmpty(Add_url))
  294. {
  295. // 需要自动urljoin的属性
  296. Match m = URLJOIN_ATTR.Match(option);
  297. Match n = SPECIAL_URL.Match(result);
  298. //if (isUrl(option)) {
  299. if (m.Success && !n.Success)
  300. {
  301. if (result.Contains("http"))
  302. result = result[result.IndexOf("http")..];
  303. else
  304. result = JoinUrl(Add_url, result);
  305. }
  306. }
  307. return result;
  308. }
  309. }
  310. //pdfa
  311. public String[] ParseDomForArray(string html, string rule)
  312. {
  313. if (!pdfa_html.Equals(html))
  314. {
  315. pdfa_html = html;
  316. pdfa_doc = NSoupClient.Parse(html);
  317. }
  318. Document? doc = pdfa_doc;
  319. List<string>? eleHtml = new();
  320. //Document doc = NSoupClient.Parse(html);
  321. rule = parseHikerToJq(rule, false);
  322. string[]? parses = rule.Split(" ");
  323. Elements ret = new ();
  324. foreach (var pars in parses)
  325. {
  326. ret = parseOneRule(doc, pars, ret);
  327. if (ret.IsEmpty) return eleHtml.ToArray();
  328. }
  329. foreach (Element it in ret)
  330. {
  331. eleHtml.Add(it.OuterHtml());
  332. }
  333. return eleHtml.ToArray();
  334. }
  335. //pdfl
  336. public String[] ParseDomForList(string html, string rule, string list_text, string list_url, string urlKey)
  337. {
  338. if (!pdfa_html.Equals(html))
  339. {
  340. pdfa_html = html;
  341. pdfa_doc = NSoupClient.Parse(html);
  342. }
  343. Document? doc = pdfa_doc;
  344. //Document doc = NSoupClient.Parse(html);
  345. List<string>? new_vod_list = new();
  346. rule = parseHikerToJq(rule, false);
  347. string[]? parses = rule.Split(" ");
  348. Elements ret = new ();
  349. foreach (string pars in parses)
  350. {
  351. ret = parseOneRule(doc, pars, ret);
  352. if (ret.IsEmpty) return new_vod_list.ToArray();
  353. }
  354. foreach (Element it in ret)
  355. {
  356. new_vod_list.Add(ParseDom(it.OuterHtml(), list_text, "").Trim() + '$' + ParseDom(it.OuterHtml(), list_url, urlKey));
  357. }
  358. return new_vod_list.ToArray();
  359. }
  360. private string parseHikerToJq(string parse, bool first)
  361. {
  362. /*
  363. 海阔解析表达式转原生表达式,自动补eq,如果传了first就最后一个也取eq(0)
  364. :param parse:
  365. :param first:
  366. :return:
  367. */
  368. // 不自动加eq下标索引
  369. if (parse.Contains("&&"))
  370. {
  371. string[]? parses = parse.Split("&&"); //带&&的重新拼接
  372. List<string>? new_parses = new(); //构造新的解析表达式列表
  373. for (int i = 0; i < parses.Length; i++)
  374. {
  375. string[]? pss = parses[i].Split(" ");
  376. string? ps = pss[pss.Length - 1]; //如果分割&&后带空格就取最后一个元素
  377. Match? m = NOAdd_INDEX.Match(ps); // Matcher m = NOAdd_INDEX.matcher(ps);
  378. //if (!isIndex(ps)) {
  379. if (!m.Success)
  380. {
  381. if (!first && i >= parses.Length - 1)
  382. { //不传first且遇到最后一个,不用补eq(0)
  383. new_parses.Add(parses[i]);
  384. }
  385. else
  386. {
  387. new_parses.Add(parses[i] + ":eq(0)");
  388. }
  389. }
  390. else
  391. {
  392. new_parses.Add(parses[i]);
  393. }
  394. }
  395. parse = string.Join(" ", new_parses);// TextUtils.join(" ", new_parses);
  396. }
  397. else
  398. {
  399. string[]? pss = parse.Split(" ");
  400. string? ps = pss[pss.Length - 1]; //如果分割&&后带空格就取最后一个元素
  401. //Matcher m = NOAdd_INDEX.matcher(ps);
  402. Match? m = NOAdd_INDEX.Match(ps);
  403. //if (!isIndex(ps) && first) {
  404. if (!m.Success && first)
  405. {
  406. parse += ":eq(0)";
  407. }
  408. }
  409. return parse;
  410. }
  411. private Elements parseOneRule(Document doc, string parse, Elements ret)
  412. {
  413. Painfo? info = GetParseInfo(parse);
  414. if (ret.IsEmpty)
  415. {
  416. ret = doc.Select(info.nparse_rule);
  417. }
  418. else
  419. {
  420. ret = ret.Select(info.nparse_rule);
  421. }
  422. if (parse.Contains(":eq"))
  423. {
  424. if (info.nparse_index < 0)
  425. {
  426. ret = ret.Eq(ret.Count + info.nparse_index);
  427. }
  428. else
  429. {
  430. ret = ret.Eq(info.nparse_index);
  431. }
  432. }
  433. if (info.excludes != null && !ret.IsEmpty)
  434. {
  435. foreach (var exclude in info.excludes)
  436. {
  437. ret.Select(exclude).Remove();
  438. }
  439. }
  440. return ret;
  441. }
  442. }
  443. }