sitemap.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291
  1. # -*- coding: utf-8 -*-
  2. """
  3. Sitemap
  4. -------
  5. The sitemap plugin generates plain-text or XML sitemaps.
  6. """
  7. from __future__ import unicode_literals
  8. from codecs import open
  9. import collections
  10. from datetime import datetime
  11. from logging import info, warning
  12. import os.path
  13. import re
  14. from pytz import timezone
  15. from pelican import contents, signals
  16. from pelican.utils import get_date
# Header written at the top of a plain-text sitemap.  It is empty; the
# generator still calls ``TXT_HEADER.format(siteurl)`` on it before writing.
TXT_HEADER = ""

# Opening of the XML sitemap document: XML declaration plus the <urlset>
# root element with the sitemaps.org 0.9 namespace and schema location.
XML_HEADER = """<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
"""

# Template for a single <url> entry.  Positional fields, as filled in by
# SitemapGenerator.write_url: {0} site URL, {1} page URL, {2} lastmod,
# {3} changefreq, {4} priority.
XML_URL = """
<url>
<loc>{0}/{1}</loc>
<lastmod>{2}</lastmod>
<changefreq>{3}</changefreq>
<priority>{4}</priority>
</url>
"""

# Closes the <urlset> root element opened by XML_HEADER.
XML_FOOTER = """
</urlset>
"""
  34. def format_date(date):
  35. if date.tzinfo:
  36. tz = date.strftime("%z")
  37. tz = tz[:-2] + ":" + tz[-2:]
  38. else:
  39. tz = "-00:00"
  40. return date.strftime("%Y-%m-%dT%H:%M:%S") + tz
  41. class SitemapGenerator(object):
  42. def __init__(self, context, settings, path, theme, output_path, *null):
  43. self.output_path = output_path
  44. self.context = context
  45. self.now = datetime.now()
  46. self.siteurl = settings.get("SITEURL")
  47. self.default_timezone = settings.get("TIMEZONE", "UTC")
  48. self.timezone = getattr(self, "timezone", self.default_timezone)
  49. self.timezone = timezone(self.timezone)
  50. self.format = "xml"
  51. self.changefreqs = {
  52. "articles": "monthly",
  53. "indexes": "daily",
  54. "pages": "monthly",
  55. }
  56. self.priorities = {"articles": 0.5, "indexes": 0.5, "pages": 0.5}
  57. self.sitemapExclude = []
  58. config = settings.get("SITEMAP", {})
  59. if not isinstance(config, dict):
  60. warning("sitemap plugin: the SITEMAP setting must be a dict")
  61. else:
  62. fmt = config.get("format")
  63. pris = config.get("priorities")
  64. chfreqs = config.get("changefreqs")
  65. self.sitemapExclude = config.get("exclude", [])
  66. if fmt not in ("xml", "txt"):
  67. warning("sitemap plugin: SITEMAP['format'] must be `txt' or `xml'")
  68. warning("sitemap plugin: Setting SITEMAP['format'] on `xml'")
  69. elif fmt == "txt":
  70. self.format = fmt
  71. return
  72. valid_keys = ("articles", "indexes", "pages")
  73. valid_chfreqs = (
  74. "always",
  75. "hourly",
  76. "daily",
  77. "weekly",
  78. "monthly",
  79. "yearly",
  80. "never",
  81. )
  82. if isinstance(pris, dict):
  83. # We use items for Py3k compat. .iteritems() otherwise
  84. for k, v in pris.items():
  85. if k in valid_keys and not isinstance(v, (int, float)):
  86. default = self.priorities[k]
  87. warning("sitemap plugin: priorities must be numbers")
  88. warning(
  89. "sitemap plugin: setting SITEMAP['priorities']"
  90. "['{0}'] on {1}".format(k, default)
  91. )
  92. pris[k] = default
  93. self.priorities.update(pris)
  94. elif pris is not None:
  95. warning("sitemap plugin: SITEMAP['priorities'] must be a dict")
  96. warning("sitemap plugin: using the default values")
  97. if isinstance(chfreqs, dict):
  98. # .items() for py3k compat.
  99. for k, v in chfreqs.items():
  100. if k in valid_keys and v not in valid_chfreqs:
  101. default = self.changefreqs[k]
  102. warning("sitemap plugin: invalid changefreq `{0}'".format(v))
  103. warning(
  104. "sitemap plugin: setting SITEMAP['changefreqs']"
  105. "['{0}'] on '{1}'".format(k, default)
  106. )
  107. chfreqs[k] = default
  108. self.changefreqs.update(chfreqs)
  109. elif chfreqs is not None:
  110. warning("sitemap plugin: SITEMAP['changefreqs'] must be a dict")
  111. warning("sitemap plugin: using the default values")
  112. def write_url(self, page, fd): # NOQA C901
  113. if getattr(page, "status", "published") != "published":
  114. return
  115. if getattr(page, "private", "False") == "True":
  116. return
  117. # We can disable categories/authors/etc by using False instead of ''
  118. if not page.save_as:
  119. return
  120. page_path = os.path.join(self.output_path, page.save_as)
  121. if not os.path.exists(page_path):
  122. return
  123. lastdate = getattr(page, "date", self.now)
  124. try:
  125. lastdate = self.get_date_modified(page, lastdate)
  126. except ValueError:
  127. warning(
  128. "sitemap plugin: " + page.save_as + " has invalid modification date,"
  129. )
  130. warning("sitemap plugin: using date value as lastmod.")
  131. lastmod = format_date(lastdate)
  132. if isinstance(page, contents.Article):
  133. pri = self.priorities["articles"]
  134. chfreq = self.changefreqs["articles"]
  135. elif isinstance(page, contents.Page):
  136. pri = self.priorities["pages"]
  137. chfreq = self.changefreqs["pages"]
  138. else:
  139. pri = self.priorities["indexes"]
  140. chfreq = self.changefreqs["indexes"]
  141. pageurl = "" if page.url == "index.html" else page.url
  142. # Exclude URLs from the sitemap:
  143. if self.format == "xml":
  144. flag = False
  145. for regstr in self.sitemapExclude:
  146. if re.match(regstr, pageurl):
  147. flag = True
  148. break
  149. if not flag:
  150. fd.write(XML_URL.format(self.siteurl, pageurl, lastmod, chfreq, pri))
  151. else:
  152. fd.write(self.siteurl + "/" + pageurl + "\n")
  153. def get_date_modified(self, page, default):
  154. if hasattr(page, "modified"):
  155. if isinstance(page.modified, datetime):
  156. return page.modified
  157. return get_date(page.modified)
  158. else:
  159. return default
  160. def set_url_wrappers_modification_date(self, wrappers):
  161. for (wrapper, articles) in wrappers:
  162. lastmod = datetime.min.replace(tzinfo=self.timezone)
  163. for article in articles:
  164. lastmod = max(lastmod, article.date.replace(tzinfo=self.timezone))
  165. try:
  166. modified = self.get_date_modified(article, datetime.min).replace(
  167. tzinfo=self.timezone
  168. )
  169. lastmod = max(lastmod, modified)
  170. except ValueError:
  171. # Supressed: user will be notified.
  172. pass
  173. setattr(wrapper, "modified", str(lastmod))
  174. def generate_output(self, writer):
  175. path = os.path.join(self.output_path, "sitemap.{0}".format(self.format))
  176. pages = (
  177. self.context["pages"]
  178. + self.context["articles"]
  179. + [c for (c, a) in self.context["categories"]]
  180. + [t for (t, a) in self.context["tags"]]
  181. + [a for (a, b) in self.context["authors"]]
  182. )
  183. self.set_url_wrappers_modification_date(self.context["categories"])
  184. self.set_url_wrappers_modification_date(self.context["tags"])
  185. self.set_url_wrappers_modification_date(self.context["authors"])
  186. for article in self.context["articles"]:
  187. pages += article.translations
  188. info("writing {0}".format(path))
  189. with open(path, "w", encoding="utf-8") as fd:
  190. if self.format == "xml":
  191. fd.write(XML_HEADER)
  192. else:
  193. fd.write(TXT_HEADER.format(self.siteurl))
  194. FakePage = collections.namedtuple(
  195. "FakePage", ["status", "date", "url", "save_as"]
  196. )
  197. for standard_page in self.context["DIRECT_TEMPLATES"]:
  198. standard_page_url = self.context.get(
  199. "{}_URL".format(standard_page.upper())
  200. )
  201. standard_page_save_as = self.context.get(
  202. "{}_SAVE_AS".format(standard_page.upper())
  203. )
  204. # No save _SAVE_AS field means no output file. Skip.
  205. if not standard_page_save_as:
  206. continue
  207. fake = FakePage(
  208. status="published",
  209. date=self.now,
  210. url=standard_page_url or "{}.html".format(standard_page),
  211. save_as=standard_page_save_as,
  212. )
  213. self.write_url(fake, fd)
  214. # add template pages
  215. # We use items for Py3k compat. .iteritems() otherwise
  216. for path, template_page_url in self.context["TEMPLATE_PAGES"].items():
  217. # don't add duplicate entry for index page
  218. if template_page_url == "index.html":
  219. continue
  220. fake = FakePage(
  221. status="published",
  222. date=self.now,
  223. url=template_page_url,
  224. save_as=template_page_url,
  225. )
  226. self.write_url(fake, fd)
  227. for page in pages:
  228. self.write_url(page, fd)
  229. if self.format == "xml":
  230. fd.write(XML_FOOTER)
def get_generators(generators):
    """Return the sitemap generator class.

    Connected to the ``get_generators`` signal by ``register``; the
    ``generators`` argument is not used.
    """
    return SitemapGenerator
def register():
    """Connect ``get_generators`` to pelican's ``get_generators`` signal."""
    signals.get_generators.connect(get_generators)