#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File  : htmlParser.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date  : 2022/8/25
# Update: 2022/11/17 support `--` element exclusion, multiple excludes allowed
import ujson
from pyquery import PyQuery as pq
from urllib.parse import urljoin
import re
from jsonpath import jsonpath

PARSE_CACHE = True  # enable caching of parsed documents
NOADD_INDEX = ':eq|:lt|:gt|:first|:last|^body$|^#'  # selectors that must not get an automatic :eq index
URLJOIN_ATTR = '(url|src|href|-original|-src|-play|-url)$'  # attributes whose values should be urljoin-ed automatically
class jsoup:
    def __init__(self, MY_URL=''):
        self.MY_URL = MY_URL
        self.pdfh_html = ''
        self.pdfa_html = ''
        self.pdfh_doc = None
        self.pdfa_doc = None

    def test(self, text: str, string: str):
        """
        Regex check whether `string` matches the pattern `text`, mimicking JavaScript's //.test().
        :param text: regex pattern
        :param string: string to test
        :return: True if the pattern matches
        """
        searchObj = re.search(rf'{text}', string, re.M | re.I)
        return bool(searchObj)
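    # Illustrative, hand-traced examples (not part of the original file):
    #   jsoup().test(':eq|^body$', 'div:eq(0)') -> True   (':eq' matches)
    #   jsoup().test('^#', 'div.item')          -> False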
    def contains(self, text: str, match: str):
        # equivalent to: return match in text
        return text.find(match) > -1
    def parseHikerToJq(self, parse, first=False):
        """
        Convert a Hiker (海阔) parse expression into a native pyquery expression,
        auto-appending :eq; if `first` is passed, the last segment also gets :eq(0).
        :param parse: Hiker-style expression, segments joined by &&
        :param first: take only the first match of the final segment
        :return: native space-joined expression
        """
        if self.contains(parse, '&&'):
            parse = parse.split('&&')  # split on && and re-assemble
            new_parses = []  # build the new list of parse segments
            for i in range(len(parse)):
                ps = parse[i].split(' ')[-1]  # if the && segment contains spaces, take the last token
                if not self.test(NOADD_INDEX, ps):
                    if not first and i >= len(parse) - 1:  # no `first` and this is the last segment: no :eq(0) appended
                        new_parses.append(parse[i])
                    else:
                        new_parses.append(f'{parse[i]}:eq(0)')
                else:
                    new_parses.append(parse[i])
            parse = ' '.join(new_parses)
        else:
            ps = parse.split(' ')[-1]  # if the expression contains spaces, take the last token
            if not self.test(NOADD_INDEX, ps) and first:
                parse = f'{parse}:eq(0)'
        return parse
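    # Illustration of the rewrite rules above (hand-traced, not from the original file):
    #   parseHikerToJq('body&&.list&&a')        -> 'body .list:eq(0) a'       (last segment untouched)
    #   parseHikerToJq('body&&.list&&a', True)  -> 'body .list:eq(0) a:eq(0)'
    #   'body' is skipped in both cases because it matches ^body$ in NOADD_INDEX.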
    def getParseInfo(self, nparse):
        """
        From a single input rule, extract the parse rule, the index position, and the
        exclude list -- excludes can strip elements, support multiple entries, and
        work by tag, by id, and so on.
        :param nparse: single rule, e.g. 'rule:eq(n)--exclude1--exclude2'
        :return: (nparse_rule, nparse_index, excludes)
        """
        excludes = []  # exclude list, empty by default
        nparse_index = 0  # index position, 0 by default
        nparse_rule = nparse  # parse rule, defaults to the input itself
        if self.contains(nparse, ':eq'):
            nparse_rule = nparse.split(':eq')[0]
            nparse_pos = nparse.split(':eq')[1]
            # print(nparse_rule)
            if self.contains(nparse_rule, '--'):
                excludes = nparse_rule.split('--')[1:]
                nparse_rule = nparse_rule.split('--')[0]
            elif self.contains(nparse_pos, '--'):
                excludes = nparse_pos.split('--')[1:]
                nparse_pos = nparse_pos.split('--')[0]
            try:
                nparse_index = int(nparse_pos.split('(')[1].split(')')[0])
            except Exception:
                pass
        elif self.contains(nparse, '--'):
            nparse_rule = nparse.split('--')[0]
            excludes = nparse.split('--')[1:]
        # if nparse_index > 0:
        #     print(f'nparse_rule:{nparse_rule},nparse_index:{nparse_index},excludes:{excludes}')
        return nparse_rule, nparse_index, excludes
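    # Hand-traced examples (illustrative, not from the original file):
    #   getParseInfo('.item:eq(2)--a--span') -> ('.item', 2, ['a', 'span'])
    #   getParseInfo('.box--script')         -> ('.box', 0, ['script'])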
    def parseOneRule(self, doc, nparse, ret=None):
        """
        Parse one record of the space-split native expression, handle the :eq index
        correctly, and return the updated ret.
        :param doc: PyQuery object produced by pq(html)
        :param nparse: the current single parse expression
        :param ret: PyQuery result object
        :return:
        """
        nparse_rule, nparse_index, excludes = self.getParseInfo(nparse)
        if not ret:
            ret = doc(nparse_rule)
        else:
            ret = ret(nparse_rule)
        # print(f'nparse_rule:{nparse_rule},nparse_index:{nparse_index},excludes:{excludes},ret:{ret}')
        if self.contains(nparse, ':eq'):
            ret = ret.eq(nparse_index)
        # if nparse_index > 4:
        #     print('nparse_index', ret, not ret)
        if excludes and ret:
            # print(excludes)
            ret = ret.clone()  # work on a clone so remove() does not mutate doc's cache
            for exclude in excludes:
                # ret.remove(exclude)
                ret(exclude).remove()
        return ret
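    # Exclusion sketch (illustrative): parseOneRule(pq(html), '.box--script') selects
    # .box, clones it, and strips every <script> inside the clone, leaving doc untouched.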
    def pdfa(self, html, parse: str):
        # Reading the official docs is what solved this problem!!!
        # https://pyquery.readthedocs.io/en/latest/api.html
        if not all([html, parse]):
            return []
        parse = self.parseHikerToJq(parse)
        print(f'pdfa:{parse}')
        if PARSE_CACHE:
            if self.pdfa_html != html:
                self.pdfa_html = html
                self.pdfa_doc = pq(html)
            doc = self.pdfa_doc
        else:
            doc = pq(html)
        parses = parse.split(' ')
        # print(parses)
        ret = None
        for nparse in parses:
            ret = self.parseOneRule(doc, nparse, ret)
            if not ret:  # an :eq step may have exhausted the selection; pdfa returns an empty list right away
                return []
        res = [item.outerHtml() for item in ret.items()]
        return res
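    # Usage sketch (hypothetical HTML, hand-traced):
    #   jsoup().pdfa('<ul class="list"><li>a</li><li>b</li></ul>', '.list&&li')
    #   -> ['<li>a</li>', '<li>b</li>']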
    def pdfh(self, html, parse: str, base_url: str = ''):
        if not all([html, parse]):
            return ''
        if PARSE_CACHE:
            if self.pdfh_html != html:
                self.pdfh_html = html
                self.pdfh_doc = pq(html)
            doc = self.pdfh_doc
        else:
            doc = pq(html)
        if parse == 'body&&Text' or parse == 'Text':
            text = doc.text()
            return text
        elif parse == 'body&&Html' or parse == 'Html':
            return doc.html()
        option = None
        if self.contains(parse, '&&'):
            option = parse.split('&&')[-1]
            parse = '&&'.join(parse.split('&&')[:-1])
        parse = self.parseHikerToJq(parse, True)
        # print(f'pdfh:{parse},option:{option}')
        parses = parse.split(' ')
        # print(parses)
        ret = None
        for nparse in parses:
            ret = self.parseOneRule(doc, nparse, ret)
            # print(nparse, ret)
            if not ret:  # an :eq step may have exhausted the selection; pdfh returns an empty string right away
                return ''
        if option:
            if option == 'Text':
                ret = ret.text()
            elif option == 'Html':
                ret = ret.html()
            else:
                ret = ret.attr(option) or ''
                if self.contains(option.lower(), 'style') and self.contains(ret, 'url('):
                    try:
                        ret = re.search(r'url\((.*?)\)', ret, re.M | re.S).groups()[0]
                    except Exception:
                        pass
                if ret and base_url:
                    # need_add = re.search(URLJOIN_ATTR, option, re.M | re.I)
                    need_add = self.test(URLJOIN_ATTR, option)
                    if need_add:
                        if 'http' in ret:
                            ret = ret[ret.find('http'):]
                        else:
                            ret = urljoin(base_url, ret)
        else:
            ret = ret.outerHtml()
        return ret
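    # Usage sketch (hypothetical HTML and URL, hand-traced):
    #   j = jsoup('https://example.com/')
    #   j.pdfh('<a href="/play/1">First</a>', 'a&&Text')                          -> 'First'
    #   j.pdfh('<a href="/play/1">First</a>', 'a&&href', 'https://example.com/')  -> 'https://example.com/play/1'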
    def pd(self, html, parse: str, base_url: str = ''):
        if not base_url:
            base_url = self.MY_URL
        return self.pdfh(html, parse, base_url)

    def pq(self, html: str):
        return pq(html)
    def pjfh(self, html, parse: str, add_url=False):
        if not all([html, parse]):
            return ''
        if isinstance(html, str):
            # print(html)
            try:
                html = ujson.loads(html)
                # html = eval(html)
            except Exception:
                print('failed to convert the string to JSON')
                return ''
        if not parse.startswith('$.'):
            parse = f'$.{parse}'
        ret = ''
        for ps in parse.split('||'):
            ret = jsonpath(html, ps)
            if isinstance(ret, list):
                ret = str(ret[0]) if ret[0] else ''
            else:
                ret = str(ret) if ret else ''
            if add_url and ret:
                ret = urljoin(self.MY_URL, ret)
            if ret:
                break
        # print(ret)
        return ret
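    # Fallback sketch ('||' tries each alternative in order; hypothetical JSON):
    #   jsoup().pjfh('{"data":{"url":"/v/1.mp4"}}', 'data.url') -> '/v/1.mp4'
    #   With add_url=True the result is urljoin-ed onto MY_URL.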
    def pj(self, html, parse: str):
        return self.pjfh(html, parse, True)
    def pjfa(self, html, parse: str):
        if not all([html, parse]):
            return []
        if isinstance(html, str):
            try:
                html = ujson.loads(html)
            except Exception:
                return []
        if not parse.startswith('$.'):
            parse = f'$.{parse}'
        # print(html)
        # print(parse)
        ret = jsonpath(html, parse)
        # print(ret)
        # print(type(ret))
        # print(len(ret))
        if isinstance(ret, list) and len(ret) == 1 and isinstance(ret[0], list):
            # print('auto unwrap')
            ret = ret[0]  # auto-unwrap a single nested list
        return ret or []
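    # Auto-unwrap sketch (hypothetical JSON, hand-traced):
    #   jsoup().pjfa('{"list":[{"id":1},{"id":2}]}', 'list') -> [{'id': 1}, {'id': 2}]
    #   jsonpath returns [[{...}, {...}]]; the single outer list is unwrapped.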
if __name__ == '__main__':
    pass
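    # Minimal smoke-test sketch; the sample HTML, JSON, and URL below are invented for illustration.
    sample_html = '<div class="list"><a href="/play/1">First</a><a href="/play/2">Second</a></div>'
    j = jsoup('https://example.com/')
    print(j.pdfa(sample_html, '.list&&a'))      # every <a> as an outerHtml string
    print(j.pd(sample_html, '.list&&a&&href'))  # 'https://example.com/play/1'
    print(j.pjfh('{"data":{"url":"/v/1.mp4"}}', 'data.url'))  # '/v/1.mp4'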