########################################################################
# Searx-Qt - Lightweight desktop application for Searx.
# Copyright (C) 2020-2022 CYBERDEViL
#
# This file is part of Searx-Qt.
#
# Searx-Qt is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Searx-Qt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
########################################################################
import time
import urllib.parse

from bs4 import BeautifulSoup

from searxqt.core.requests import JsonResult, ErrorType, Schemas
from searxqt.core.handler import HandlerProto, NetworkTypes
from searxqt.utils.string import parseFilesize
from searxqt.translations import _

## API result (format=json)
class SearchResult(JsonResult):
    Schema = Schemas['searxng_query']

    def __init__(self, url, response, err="", errType=ErrorType.Success):
        JsonResult.__init__(self, url, response, err=err, errType=errType)

    def verifyFurther(self):
        JsonResult.verifyFurther(self)

        # One of the following keys must be non-empty, else we count it as
        # no (usable) result.
        validKeys = [
            'results',
            'answers',
            'corrections',
            'infoboxes',
            'suggestions'
        ]

        if self._errType == ErrorType.Success:
            data = self.json()
            valid = False
            for key in validKeys:
                if len(data.get(key, [])):
                    valid = True
                    break

            if not valid:
                self._errType = ErrorType.NoResults
                self._err = f"NoResults: got: `{self.json()}`"

def fixUrlScheme(url):
    """! Adds 'https://' when the scheme is missing."""
    parsedUrl = urllib.parse.urlparse(url)
    if not parsedUrl.scheme:
        return f"https://{url}"
    return url

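# Illustrative examples:
#   fixUrlScheme("example.org/a.png")         -> "https://example.org/a.png"
#   fixUrlScheme("http://example.org/a.png")  -> returned unchanged
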
## HTML result that will be parsed into JSON
class SearchResult2(SearchResult):
    Schema = Schemas['searxng_query']

    def __init__(self, url, response, err="", errType=ErrorType.Success):
        self.__json = {}
        SearchResult.__init__(self, url, response, err=err, errType=errType)

    def verifyFurther(self):
        self.__json = self.parse()
        SearchResult.verifyFurther(self)

    def json(self):
        return self.__json

    def makeUrlAbsolute(self, url):
        """! Returns an absolute URL. It prepends the SearXNG instance's
        scheme and location when they are missing."""
        parsedUrl = urllib.parse.urlparse(url)
        instanceUrl = urllib.parse.urlparse(self.url())

        if not parsedUrl.netloc:
            url = f"{instanceUrl.netloc}{url}"

        if not parsedUrl.scheme:
            url = f"{instanceUrl.scheme}://{url}"

        return url
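    # Illustrative example, assuming the instance URL is
    # "https://searx.example/search":
    #   makeUrlAbsolute("/image_proxy?url=...")
    #       -> "https://searx.example/image_proxy?url=..."
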
    def parseImagesResult(self, result):
        """! Parse image results from HTML."""
        """Example HTML:
        <article class="result result-images category-images">
          <a href="https://wallup.net/wp-content/uploads/2019/09/441567-landscapes-nature-wallpaper.jpg" rel="noreferrer">
            <img alt="landscapes, Nature, Wallpaper Wallpapers HD / Desktop and Mobile ..." class="image_thumbnail" height="200" loading="lazy" rel="noreferrer" src="/image_proxy?url=https%3A%2F%2Fs2.qwant.com%2Fthumbr%2F474x315%2Ff%2F1%2F5fe20d297b0af77d40641a1c2d1a0a430b235e0f98e1584d580cf7931b28f9%2Fth.jpg%3Fu%3Dhttps%253A%252F%252Ftse.mm.bing.net%252Fth%253Fid%253DOIP.bLDwvUIZXCd5HCilSOxKCAHaE7%2526pid%253DApi%26q%3D0%26b%3D1%26p%3D0%26a%3D0&amp;h=HASH ..." width="200"/>
            <span class="title">
              landscapes, Nature, Wallpaper Wallpapers HD / Desktop and Mobile ...
            </span>
            <span class="source">wallup.net</span>
          </a>
          <div class="detail">
            <a class="result-detail-close" href="#">
              <svg SVG_STUFF ...></svg>
            </a>
            <a class="result-detail-previous" href="#">
              <svg SVG_STUFF ...></svg>
            </a>
            <a class="result-images-source" href="https://wallup.net/wp-content/uploads/2019/09/441567-landscapes-nature-wallpaper.jpg" rel="noreferrer">
              <img alt="landscapes, Nature, Wallpaper Wallpapers HD / Desktop and Mobile ..." data-src="/image_proxy?url=https%3A%2F%2Fwallup.net%2Fwp-content%2Fuploads%2F2019%2F09%2F441567-landscapes-nature-wallpaper.jpg&amp;h=HASH ..." src=""/>
            </a>
            <div class="result-images-labels">
              <h4>landscapes, Nature, Wallpaper Wallpapers HD / Desktop and Mobile ...</h4>
              <p class="result-content"> </p>
              <hr/>
              <p class="result-author"> </p>
              <p class="result-format"> </p>
              <p class="result-source"> </p>
              <p class="result-engine">
                <span>Engine:</span>qwant images
              </p>
              <p class="result-url">
                <span>View source:</span>
                <a href="https://wallup.net/landscapes-nature-wallpaper-69/" rel="noreferrer">https://wallup.net/landscapes-nature-wallpaper-69/</a>
              </p>
            </div>
          </div>
        </article>
        """
        title = ''          # image title
        url = ''            # url to the website of the image
        content = ''        # probably same as the title
        engines = []        # see img_src
        #publishedDate = ''
        img_format = ''     # size/format of the image in string format
        img_src = ''        # source of the image (engine)
        thumbnail_src = ''  # url to thumbnail
        source = ''         # where does the image come from?
        category = 'images'

        # !! GET Title
        try:
            title = result.a.img.get('alt')
        except AttributeError:
            log.debug("Failed to get img title", self)

        # !! GET thumbnail_src
        try:
            thumbnail_src = result.a.img.get('src')
        except AttributeError:
            log.debug("Failed to get img thumbnail url", self)

        # Make sure the thumbnail url is absolute
        thumbnail_src = self.makeUrlAbsolute(thumbnail_src)

        # !! GET url
        felem = result.find("p", {"class": "result-url"})
        if felem:
            try:
                url = felem.a.get('href')
            except AttributeError:
                log.debug("Failed to get img url (1)", self)
        else:
            log.debug("Failed to get img url (2)", self)

        # !! GET img_src
        felem = result.find("a", {"class": "result-images-source"})
        if felem:
            img_src = felem.get('href')
            img_src = fixUrlScheme(img_src)  # Make sure it has a scheme
        else:
            log.debug("Failed to get img_src", self)

        ## !! GET content
        # p class=result-content
        felem = result.find("p", {"class": "result-content"})
        if felem:
            content = felem.get_text()
        else:
            log.debug("Failed to get img content", self)

        # !! GET img_format
        # p class=result-format
        felem = result.find("p", {"class": "result-format"})
        if felem:
            img_format = felem.get_text()
        else:
            log.debug("Failed to get img format", self)

        # !! GET source
        felem = result.find("span", {"class": "source"})
        if felem:
            source = felem.get_text()
        else:
            log.debug("Failed to get img source", self)

        # !! GET engines
        # <p class="result-engine">
        felem = result.find("p", {"class": "result-engine"})
        if felem:
            for engine in felem.find_all("span"):
                engines.append(engine.nextSibling.get_text().replace(' ', '-'))
        else:
            log.debug("Failed to get img engines", self)

        return {
            'title': title,
            'url': url,
            'content': content,
            'engines': [engine for engine in engines],
            'img_format': img_format,
            'img_src': img_src,
            'thumbnail_src': thumbnail_src,
            'source': source,
            'category': category
        }
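    # parseImagesResult() returns a dict with the same field names as an
    # image result from the `format=json` API, so HTML-parsed results can be
    # validated against the same 'searxng_query' schema.
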
    def parse(self):
        if self.errorType() != ErrorType.Success:
            return {}

        jsonResult = {
            'results': [],
            'answers': [],
            'corrections': [],
            'infoboxes': [],
            'suggestions': [],
            'unresponsive_engines': []
        }

        soup = BeautifulSoup(self.content(), "html.parser")

        ######################################################################
        ## 'results' key
        ######################################################################
        def _getResults():
            # The element may be an 'article' or a 'div', depending on the
            # category of the result.
            for result in soup.find_all("article", {"class": "result"}):
                yield result

            for result in soup.find_all("div", {"class": "result"}):
                yield result

        for result in _getResults():
            # Image results
            if "result-images" in result.get("class"):
                jsonResult['results'].append(
                    self.parseImagesResult(result)
                )
                continue

            # Normal search results
            """
            <article class="result result-default category-general qwant duckduckgo google">
              <a href="https://linuxize.com/post/curl-post-request/" class="url_wrapper" rel="noreferrer">
                <span class="url_o1">
                  <span class="url_i1">https://linuxize.com</span>
                </span>
                <span class="url_o2">
                  <span class="url_i2"> › post › curl-post-request</span>
                </span>
              </a>
              <h3>
                <a href="https://linuxize.com/post/curl-post-request/" rel="noreferrer">
                  How to make a <span class="highlight">POST</span>
                  <span class="highlight">request</span>
                  with <span class="highlight">curl</span>
                </a>
              </h3>
              <p class="content">
                Learn how to use <span class="highlight">curl</span>, a command-line utility for transferring data from or to a remote server, to make <span class="highlight">POST</span> requests. See examples of sending data, files, and JSON data with <span class="highlight">curl</span> options and options.
              </p>
              <div class="engines">
                <span>qwant</span>
                <span>duckduckgo</span>
                <span>google</span>
                <a href="https://web.archive.org/web/https://linuxize.com/post/curl-post-request/" class="cache_link" rel="noreferrer">
                  <svg SVG_STUFF .../></svg>
                  cached
                </a>
                &lrm;
              </div>
              <div class="break"></div>
            </article>
            """
            """
            <article class="result result-torrent category-files solidtorrents">
              <a href="https://solidtorrents.to/torrents/STUFF .../" class="url_wrapper" rel="noreferrer">
                <span class="url_o1">
                  <span class="url_i1">https://solidtorrents.to</span>
                </span>
                <span class="url_o2">
                  <span class="url_i2"> › torrents › SOME_NAME › SOME_HASH</span>
                </span>
              </a>
              <h3>
                <a href="https://solidtorrents.to/torrents/SOME_NAME/SOME_HASH/" rel="noreferrer">
                  <span class="highlight">SOME</span>-<span class="highlight">NAME</span>
                </a>
              </h3>
              <time class="published_date" datetime="2018-10-20 00:00:00" >Oct 20, 2018</time>
              <div class="highlight">Other/Archive</div>
              <p class="altlink">
                &bull;
                <a href="magnet:MAGNET_LINK ..." class="magnetlink" rel="noreferrer"><svg SVG_STUFF .../></svg>magnet link</a>
              </p>
              <p class="altlink">
                &bull;
                <a href="https://itorrents.org/torrent/TORRENT_LINK ..." class="torrentfile" rel="noreferrer">torrent file</a>
              </p>
              <p class="stat">
                &bull; Seeder
                <span class="badge">407</span>
                &bull; Leecher
                <span class="badge">748</span>
              </p>
              <p class="stat"> Filesize
                <span class="badge">2.88 GiB</span>
              </p>
              <div class="engines">
                <span>solidtorrents</span>
                <a href="https://web.archive.org/web/https://solidtorrents.to/torrents/TORRENT_STUFF ..." class="cache_link" rel="noreferrer"><svg SVG_STUFF .../></svg>cached</a>
                &lrm;
              </div>
              <div class="break"></div>
            </article>
            """
            title = ''
            url = ''
            content = ''
            engines = []
            publishedDate = ''
            magnetlink = ''
            torrentfile = ''
            filesize = 0
            files = 0  # TODO unused for now
            seed = None
            leech = None

            # !! GET Title
            try:
                title = result.h3.a.get_text().lstrip().rstrip()
            except AttributeError:
                print("Failed to get title")

            # !! GET URL
            try:
                url = result.h3.a.get("href")
            except AttributeError:
                print("Failed to get url")

            # !! GET Content
            felem = result.find("p", {"class": "content"})
            if felem:
                content = felem.get_text().lstrip().rstrip()

            # !! GET Engines
            felem = result.find("div", {"class": "engines"})
            if felem:
                for engine in felem.find_all("span"):
                    engines.append(engine.get_text().rstrip().lstrip())

            ## !! Get publishedDate
            felem = result.find("time", {"class": "published_date"})
            if felem:
                publishedDate = felem.get("datetime", "")

            ## !! Get magnetlink
            felem = result.find("a", {"class": "magnetlink"})
            if felem:
                magnetlink = felem.get('href')

            ## !! Get torrentfile
            felem = result.find("a", {"class": "torrentfile"})
            if felem:
                torrentfile = felem.get('href')

            ## !! Get filesize
            for felem in result.find_all("span", {"class": "badge"}):
                if felem.previousSibling:
                    precedingText = felem.previousSibling
                    if "Filesize" in precedingText:
                        filesize = parseFilesize(
                            felem.get_text().rstrip().lstrip()
                        )
                    elif "Seeder" in precedingText:
                        seed = felem.get_text()
                    elif "Leecher" in precedingText:
                        leech = felem.get_text()

            # !! Add result
            resultData = {
                'title': title,
                'url': url,
                'content': content,
                'engines': [engine for engine in engines],

                # Optional
                'publishedDate': publishedDate,

                # File attributes
                'magnetlink': magnetlink,
                'torrentfile': torrentfile,
                'filesize': filesize,
                'files': files,
                'img_format': ''  # TODO
            }

            if seed is not None:
                resultData.update({'seed': seed})

            if leech is not None:
                resultData.update({'leech': leech})

            jsonResult['results'].append(resultData)
        ######################################################################
        ## 'suggestions' key
        ######################################################################
        """
        <div id="sidebar">
          <div id="suggestions" role="complementary" aria-labelledby="suggestions-title">
            <details class="sidebar-collapsable">
              <summary class="title" id="suggestions-title">Suggestions</summary>
              <div class="wrapper">
                <form method="POST" action="/search">
                  <input type="hidden" name="q" value="curl post request json">
                  <input type="hidden" name="category_general" value="1">
                  <input type="hidden" name="language" value="auto">
                  <input type="hidden" name="time_range" value="">
                  <input type="hidden" name="safesearch" value="0">
                  <input type="hidden" name="theme" value="simple">
                  <input type="submit" class="suggestion" role="link" value="&bull; curl post request json">
        """
        felem = soup.find("div", {"id": "suggestions"})
        if felem:
            for suggestion in felem.find_all("input", {"name": "q"}):
                jsonResult['suggestions'].append(suggestion.get("value"))

        ######################################################################
        ## 'answers' key
        ######################################################################
        """
        <h4 class="title" id="answers-title">Answers : </h4>
        <div class="answer">
          <span>LONG TEXT ...</span>
          <a href="some url ..." class="answer-url">url text ...</a>
        </div>
        """
        for answer in soup.find_all("div", {"class": "answer"}):
            felem = answer.find("span")
            if felem:
                jsonResult['answers'].append(felem.get_text())

        ######################################################################
        ## 'corrections' key
        ######################################################################
        """ TODO """

        ######################################################################
        ## 'infoboxes' key
        ######################################################################
        """
        <details open="" class="sidebar-collapsable">
          <summary class="title">Info</summary>
          <aside class="infobox" aria-label="Banana">
            <h2 class="title"><bdi>Banana</bdi></h2>
            <img src="/image_proxy?url=long_image_url" title="Banana" alt="Banana">
            <p><bdi>LONG TEXT HERE ...</bdi></p>
            <div class="urls">
              <ul>
                <li class="url"><bdi><a href="https://en.wikipedia.org/wiki/Banana" rel="noreferrer">Wikipedia</a></bdi></li>
                <li class="url"><bdi><a href="http://www.wikidata.org/entity/Q503" rel="noreferrer">Wikidata</a></bdi></li>
              </ul>
            </div>
          </aside>
        </details>
        """
        """
        <details open="" class="sidebar-collapsable">
          <summary class="title">Info</summary>
          <aside class="infobox" aria-label="Water">
            <h2 class="title"><bdi>Water</bdi></h2>
            <img src="/image_proxy?url=long url .." title="Water" alt="Water">
            <p><bdi>LONG TEXT ...</bdi></p>
            <div class="attributes">
              <dl>
                <dt><bdi>Chemical formula :</bdi></dt>
                <dd><bdi>H₂O</bdi></dd>
              </dl>
            </div>
            <div class="urls">
              <ul>
                <li class="url"><bdi><a href="https://en.wikipedia.org/wiki/Water" rel="noreferrer">Wikipedia</a></bdi></li>
                <li class="url"><bdi><a href="http://www.wikidata.org/entity/Q283" rel="noreferrer">Wikidata</a></bdi></li>
              </ul>
            </div>
          </aside>
        </details>
        """
        """
        infoboxes = []
        ibox = {
            'infobox': 'str',
            'id': 'uri',
            'content': 'str',
            'img_src': 'uri' | null,
            'urls': [
                {
                    'title': 'str',
                    'url': 'uri',
                    'entity': 'str',
                    'official': true
                }
            ],
            'attributes': [
                {
                    'label': 'str',
                    'value': 'str',
                    'entity': 'str'
                }
            ],
            'engines': ['str'],
            'engine': 'str'
        }
        """
        for infobox in soup.find_all("aside", {"class": "infobox"}):
            title = ""
            id = ""
            content = ""
            img_src = ""
            urls = []
            attributes = []
            engines = []

            # Title
            felem = infobox.find("h2", {"class": "title"})
            if felem:
                title = felem.get_text().rstrip().lstrip()

            # ID
            # TODO

            # Content
            felem = infobox.find("p")
            if felem:
                felem = felem.find("bdi")
                if felem:
                    content = felem.get_text().rstrip().lstrip()

            # Image
            felem = infobox.find("img")
            if felem:
                img_src = self.makeUrlAbsolute(felem.get("src"))

            # URLs
            for felem in infobox.find_all("li", {"class": "url"}):
                felem = felem.find("a")
                if felem:
                    urls.append({
                        'title': felem.get_text().lstrip().rstrip(),
                        'url': felem.get("href", ""),
                        'entity': '',      # TODO
                        'official': False  # TODO
                    })

            # Attributes
            """
            <div class="attributes">
              <dl>
                <dt><bdi>Chemical formula :</bdi></dt>
                <dd><bdi>H₂O</bdi></dd>
              </dl>
            </div>
            """
            felem = infobox.find("div", {"class": "attributes"})
            if felem:
                for item in felem.find_all("dl"):
                    label = ""
                    value = ""
                    entity = ""  # TODO

                    try:
                        label = item.dt.bdi.get_text().rstrip().lstrip()
                        value = item.dd.bdi.get_text().rstrip().lstrip()
                    except AttributeError:
                        continue

                    attributes.append({
                        "label": label,
                        "value": value,
                        "entity": entity
                    })

            # Engines
            for url in urls:
                engines.append(url['title'].lower())

            jsonResult['infoboxes'].append({
                "infobox": title,
                "id": id,
                "content": content,
                "img_src": img_src,
                "urls": urls,
                "attributes": attributes,
                "engines": engines
            })

        ######################################################################
        ## 'unresponsive_engines' key
        ######################################################################
        """
        <div id="engines_msg">
          <details class="sidebar-collapsable" open="">
            <summary class="title" id="engines_msg-title">Messages from the search engines</summary>
            <div class="dialog-error" role="alert">
              <svg class="ion-icon-big" etc..></svg>
              <div>
                <p>
                  <strong>Error!</strong>
                  Engines cannot retrieve results:
                </p>
                <p>
                  brave (<a href="/stats?engine=brave" title="View error logs and submit a bug report">Suspended: too many requests</a>)
                </p>
                <p>
                  qwant (<a href="/stats?engine=qwant" title="View error logs and submit a bug report">Suspended: too many requests</a>)
                </p>
              </div>
            </div>
          </details>
        </div>
        """
        felem = soup.find("div", {"id": "engines_msg"})
        if felem:
            for errDialog in felem.find_all("div", {"class": "dialog-error"}):
                for p in errDialog.find_all("p"):
                    a = p.find("a")
                    if not a:
                        continue

                    engine, msg = p.get_text().split(" ", 1)
                    jsonResult['unresponsive_engines'].append([engine, msg])

        return jsonResult

class SearxConfigResult(JsonResult):
    Schema = Schemas['searxng_config']

    def __init__(self, url, response, err="", errType=ErrorType.Success):
        JsonResult.__init__(self, url, response, err=err, errType=errType)

class Categories:
    types = {
        'general': (_('General'), 'category_general'),
        'files': (_('Files'), 'category_files'),
        'images': (_('Images'), 'category_images'),
        'videos': (_('Videos'), 'category_videos'),
        'it': (_('IT'), 'category_it'),
        'map': (_('Location'), 'category_map'),
        'music': (_('Music'), 'category_music'),
        'news': (_('News'), 'category_news'),
        'science': (_('Science'), 'category_science'),
        'social media': (_('Social'), 'category_social media'),
        'onions': (_('Onions'), 'category_onions'),
        'shopping': (_('Shopping'), 'category_shopping')
    }

    def __init__(self):
        self._options = {}
        self.__makeOptions()

    def __makeOptions(self):
        self._options.clear()
        for key, t in self.types.items():
            self._options.update({key: False})

    def reset(self):
        self.__makeOptions()

    def get(self, key):
        return self._options[key]

    def set(self, key, state):
        """
        @param key: One of the keys in Categories.types
        @type key: str

        @param state: Enabled / disabled state
        @type state: bool
        """
        self._options[key] = state

    def dict(self):
        newDict = {}
        for key, state in self._options.items():
            if state:
                newDict.update({self.types[key][1]: 'on'})

        return newDict

    def enabledKeys(self):
        """ Returns a list with the enabled category keys (keys from
        Categories.types).
        """
        return [key for key, state in self._options.items() if state]

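# Minimal usage sketch (illustrative):
#   categories = Categories()
#   categories.set('images', True)
#   categories.dict()         # -> {'category_images': 'on'}
#   categories.enabledKeys()  # -> ['images']
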
class Engines(list):
    def __init__(self):
        list.__init__(self)

    def dict(self):
        if not self:
            return {}

        return {
            'engines': ",".join(self)
        }

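# Minimal usage sketch (illustrative):
#   engines = Engines()
#   engines.extend(['qwant', 'duckduckgo'])
#   engines.dict()  # -> {'engines': 'qwant,duckduckgo'}
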
class SearX:
    Periods = {
        '': _('Anytime'),
        'day': _('Last day'),
        'week': _('Last week'),
        'month': _('Last month'),
        'year': _('Last year')
    }

    # https://github.com/asciimoo/searx/blob/master/searx/languages.py
    Languages = {
        '': _('No language'),
        'all': _('Default language'),
        'af-NA': 'Afrikaans - af-NA',
        'ca-AD': 'Català - ca-AD',
        'da-DK': 'Dansk - da-DK',
        'de': 'Deutsch - de',
        'de-AT': 'Deutsch (Österreich) - de-AT',
        'de-CH': 'Deutsch (Schweiz) - de-CH',
        'de-DE': 'Deutsch (Deutschland) - de-DE',
        'et-EE': 'Eesti - et-EE',
        'en': 'English - en',
        'en-AU': 'English (Australia) - en-AU',
        'en-CA': 'English (Canada) - en-CA',
        'en-GB': 'English (United Kingdom) - en-GB',
        'en-IE': 'English (Ireland) - en-IE',
        'en-IN': 'English (India) - en-IN',
        'en-NZ': 'English (New Zealand) - en-NZ',
        'en-PH': 'English (Philippines) - en-PH',
        'en-SG': 'English (Singapore) - en-SG',
        'en-US': 'English (United States) - en-US',
        'es': 'Español - es',
        'es-AR': 'Español (Argentina) - es-AR',
        'es-CL': 'Español (Chile) - es-CL',
        'es-ES': 'Español (España) - es-ES',
        'es-MX': 'Español (México) - es-MX',
        'fr': 'Français - fr',
        'fr-BE': 'Français (Belgique) - fr-BE',
        'fr-CA': 'Français (Canada) - fr-CA',
        'fr-CH': 'Français (Suisse) - fr-CH',
        'fr-FR': 'Français (France) - fr-FR',
        'hr-HR': 'Hrvatski - hr-HR',
        'id-ID': 'Indonesia - id-ID',
        'it-IT': 'Italiano - it-IT',
        'sw-KE': 'Kiswahili - sw-KE',
        'lv-LV': 'Latviešu - lv-LV',
        'lt-LT': 'Lietuvių - lt-LT',
        'hu-HU': 'Magyar - hu-HU',
        'ms-MY': 'Melayu - ms-MY',
        'nl': 'Nederlands - nl',
        'nl-BE': 'Nederlands (België) - nl-BE',
        'nl-NL': 'Nederlands (Nederland) - nl-NL',
        'nb-NO': 'Norsk Bokmål - nb-NO',
        'pl-PL': 'Polski - pl-PL',
        'pt': 'Português - pt',
        'pt-BR': 'Português (Brasil) - pt-BR',
        'pt-PT': 'Português (Portugal) - pt-PT',
        'ro-RO': 'Română - ro-RO',
        'sk-SK': 'Slovenčina - sk-SK',
        'sl-SI': 'Slovenščina - sl-SI',
        'sr-RS': 'Srpski - sr-RS',
        'fi-FI': 'Suomi - fi-FI',
        'sv-SE': 'Svenska - sv-SE',
        'vi-VN': 'Tiếng Việt - vi-VN',
        'tr-TR': 'Türkçe - tr-TR',
        'is-IS': 'Íslenska - is-IS',
        'cs-CZ': 'Čeština - cs-CZ',
        'el-GR': 'Ελληνικά - el-GR',
        'be-BY': 'Беларуская - be-BY',
        'bg-BG': 'Български - bg-BG',
        'ru-RU': 'Русский - ru-RU',
        'uk-UA': 'Українська - uk-UA',
        'hy-AM': 'Հայերեն - hy-AM',
        'he-IL': 'עברית - he-IL',
        'ar-SA': 'العربية - ar-SA',
        'fa-IR': 'فارسی - fa-IR',
        'th-TH': 'ไทย - th-TH',
        'zh': '中文 - zh',
        'zh-CN': '中文 (中国) - zh-CN',
        'zh-TW': '中文 (台灣) - zh-TW',
        'ja-JP': '日本語 - ja-JP',
        'ko-KR': '한국어 - ko-KR'
    }

    def __init__(self, requestHandler):
        self._requestHandler = requestHandler
        self._url = ""
        self._categories = Categories()
        self._engines = Engines()
        self._query = ""
        self._lang = ""
        self._pageno = ""        # int formatted as string
        self._timeRange = ""     # '', 'day', 'week', 'month' or 'year'
        self._safesearch = False
        self._parseHtml = True

    @property
    def categories(self): return self._categories

    @property
    def engines(self): return self._engines

    @property
    def url(self):
        """
        @return: Instance url
        @rtype: str
        """
        return self._url

    @url.setter
    def url(self, url):
        """
        @param url: Instance url
        @type url: str
        """
        self._url = url

    @property
    def query(self):
        """
        @return: Search query
        @rtype: str
        """
        return self._query

    @query.setter
    def query(self, q):
        """
        @param q: Search query
        @type q: str
        """
        self._query = q

    @property
    def lang(self):
        """
        @return: Language code
        @rtype: str
        """
        return self._lang

    @lang.setter
    def lang(self, lang):
        """
        @param lang: Language code
        @type lang: str
        """
        self._lang = lang

    @property
    def pageno(self):
        """
        @return: Page number (0 when no page number has been set yet)
        @rtype: int
        """
        # Guard against int("") raising when no page number was set yet.
        return int(self._pageno) if self._pageno else 0

    @pageno.setter
    def pageno(self, i):
        """
        @param i: Page number
        @type i: int
        """
        self._pageno = str(i)

    @property
    def timeRange(self):
        """
        @return: Search time range ('', 'day', 'week', 'month' or 'year')
        @rtype: str
        """
        return self._timeRange

    @timeRange.setter
    def timeRange(self, value):
        """
        @param value: Key from SearX.Periods
        @type value: str
        """
        self._timeRange = value

    @property
    def safeSearch(self):
        """
        @return: Whether safe search is enabled or not.
        @rtype: bool
        """
        return self._safesearch

    @safeSearch.setter
    def safeSearch(self, state):
        """
        @param state: Enable/disable safe search.
        @type state: bool
        """
        self._safesearch = state

    @property
    def parseHtml(self):
        """
        @return: Whether HTML parsing is enabled; the JSON API is not used
                 when this returns True.
        @rtype: bool
        """
        return self._parseHtml

    @parseHtml.setter
    def parseHtml(self, state):
        """
        @param state: Enable/disable parsing HTML instead of using the
                      JSON API.
        @type state: bool
        """
        self._parseHtml = state

    @property
    def requestKwargs(self):
        """ Returns the data that will be sent with the POST request used for
        the search operation: the search query, language, page number and the
        enabled categories/engines.

        @rtype: dict
        """
        data = {
            "q": self.query,
            "safesearch": "1" if self.safeSearch else "0"
        }

        # Choose what resource to use (JSON API or HTML parser).
        if self.parseHtml:
            data.update({"theme": "simple"})
        else:
            data.update({"format": "json"})

        # Testing showed that searx honors only the engines when both
        # engines and categories are set.
        if self.engines:
            data.update(self.engines.dict())
        elif self.categories:
            data.update(self.categories.dict())

        if self.lang:
            data.update({"language": self.lang})

        if self.pageno:
            data.update({"pageno": self.pageno})

        if self.timeRange:
            data.update({"time_range": self.timeRange})

        return data
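    # Illustrative example: with query "curl", safe search off, HTML parsing
    # enabled, page number 1 and only the 'general' category enabled,
    # requestKwargs evaluates to:
    #   {'q': 'curl', 'safesearch': '0', 'theme': 'simple',
    #    'category_general': 'on', 'pageno': 1}
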
    def reset(self):
        self.url = ""
        self.timeRange = ""
        self.lang = ""
        self.pageno = 1
        self.categories.reset()
        self.engines.clear()
        self.query = ""

    def search(self):
        """ Perform the search operation with the currently set values.

        @returns: The result of this search.
        @rtype: SearchResult
        """
        rtype = SearchResult
        if self.parseHtml:
            rtype = SearchResult2

        return self._requestHandler.get(
            self.url,
            data=self.requestKwargs,
            ResultType=rtype
        )

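# Minimal usage sketch (illustrative; `requestHandler` stands in for the
# application's request handler instance and "https://searx.example" for an
# instance URL):
#   searx = SearX(requestHandler)
#   searx.reset()
#   searx.url = "https://searx.example"
#   searx.query = "curl post request"
#   searx.categories.set('general', True)
#   result = searx.search()
#   if result:
#       data = result.json()
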
class SearxConfigHandler(HandlerProto):
    def __init__(self, requestsHandler):
        HandlerProto.__init__(self, requestsHandler)

    def updateInstance(self, url):
        newUrl = urllib.parse.urljoin(url, "/config")
        result = self.requestsHandler.get(newUrl, ResultType=SearxConfigResult)

        if result:
            instance = self.instances[url]
            j = result.json()

            """ Update instance version
            """
            instance.update({
                "version": j.get("version", "")
            })

            """ Update instance network_type to use our own network type
            definitions as defined in class NetworkTypes (core/handler.py)
            """
            instance.update({"network_type": NetworkTypes.netTypeFromUrl(url)})

            """ Update Engines

            What we get:
            "engines": [
                categories (list, str)
                enabled (bool)
                language_support (bool)
                name (str)
                paging (bool)
                safesearch (bool)
                shortcut (str)
                supported_languages (list, str)
                time_range_support (bool)
                timeout (float)
            ]

            What instanceModel wants:
            "engines": {
                "not evil": {
                    "error_rate": 15,
                    "errors": [
                        0
                    ]
                }
            }

            What enginesModel wants:
            "engines": {
                "1337x": {
                    "categories": [
                        "videos"
                    ],
                    "language_support": true,
                    "paging": true,
                    "safesearch": false,
                    "shortcut": "1337x",
                    "time_range_support": false
                },
            """
            newInstanceEngines = {}
            newEnginesEngines = {}

            for engine in j.get('engines', []):
                name = engine.get('name', "")
                if not name:
                    continue

                newInstanceEngines.update({
                    name: {}
                })

                if name not in self.engines:
                    newEnginesEngines.update({
                        name: {
                            "categories": list(engine.get('categories', [])),
                            "language_support": engine.get(
                                'language_support',
                                False
                            ),
                            "paging": engine.get('paging', False),
                            "safesearch": engine.get('safesearch', False),
                            "shortcut": engine.get('shortcut', ""),
                            "time_range_support": engine.get(
                                'time_range_support',
                                False
                            )
                        }
                    })

            instance.update({
                "engines": dict(newInstanceEngines)
            })
            self.engines.update(newEnginesEngines)

            """ Update instance lastUpdated
            """
            instance.update({
                "lastUpdated": time.time()
            })

            return True

        return False

    def addInstance(self, url):
        if url not in self.instances:
            self._instances[url] = {}
            return True

        return False

    def removeInstance(self, url):
        """
        @param url: url of the instance to remove.
        @type url: str
        """
        del self._instances[url]

    def removeMultiInstances(self, urls):
        """ Remove instance(s) by url without emitting changed for every
        instance that got removed.

        @param urls: list with urls of the instances to remove.
        @type urls: list
        """
        for url in urls:
            del self._instances[url]