opengraph.py

import logging

import opengraph
import requests
from bs4 import BeautifulSoup
from little_boxes import activitypub as ap
from little_boxes.errors import NotAnActivityError
from little_boxes.urlutils import check_url
from little_boxes.urlutils import is_url_valid

from .lookup import lookup

logger = logging.getLogger(__name__)

def links_from_note(note):
    """Extract external links from a note's HTML content, skipping any URL
    already referenced by a tag (mentions, hashtags)."""
    tags_href = set()
    for t in note.get("tag", []):
        h = t.get("href")
        if h:
            tags_href.add(h)

    links = set()
    if "content" in note:
        # An explicit parser avoids bs4's "no parser was explicitly specified" warning
        soup = BeautifulSoup(note["content"], "html.parser")
        for link in soup.find_all("a"):
            h = link.get("href")
            # Guard against <a> tags without an href attribute (h would be None)
            if h and h.startswith("http") and h not in tags_href and is_url_valid(h):
                links.add(h)

    # FIXME(tsileo): support summary and name fields
    return links
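
# For illustration, a hypothetical note of the shape this function expects:
# the "tag" entries are ActivityPub Mention/Hashtag objects, and their "href"
# values are excluded from the result, so only the plain link survives
# (assuming is_url_valid() accepts the host):
#
#   {
#       "content": '<p>Hi <a href="https://a.example/@bob">@bob</a>, '
#                  'read <a href="https://example.com/post">this</a></p>',
#       "tag": [{"type": "Mention", "href": "https://a.example/@bob"}],
#   }
#
#   links_from_note(note) == {"https://example.com/post"}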

def fetch_og_metadata(user_agent, links):
    """Fetch the Open Graph metadata for each link, returning a list of dicts."""
    res = []
    for l in links:
        # Raises if the URL points at a private/blocked host
        check_url(l)

        # Skip any link that resolves to an ActivityPub actor
        try:
            p = lookup(l)
            if p.has_type(ap.ACTOR_TYPES):
                continue
        except NotAnActivityError:
            pass

        r = requests.get(l, headers={"User-Agent": user_agent}, timeout=15)
        r.raise_for_status()

        # The content-type header may be missing entirely; default to ""
        if not r.headers.get("content-type", "").startswith("text/html"):
            logger.debug(f"skipping {l}")
            continue

        r.encoding = "UTF-8"
        html = r.text
        try:
            data = dict(opengraph.OpenGraph(html=html))
        except Exception:
            logger.exception(f"failed to parse {l}")
            continue

        if data.get("url"):
            res.append(data)

    return res
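
# A minimal usage sketch (the note payload and the "my-app/1.0" user agent
# are hypothetical, and fetch_og_metadata() still needs the module's real
# lookup() dependency to be importable):
if __name__ == "__main__":
    note = {
        "content": '<p>See <a href="https://example.com/article">this article</a></p>',
        "tag": [],
    }
    links = links_from_note(note)
    print(fetch_og_metadata("my-app/1.0", links))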