sitemap.py 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. # -*- coding: utf-8 -*-
  2. '''
  3. Sitemap
  4. -------
  5. The sitemap plugin generates plain-text or XML sitemaps.
  6. '''
  7. from __future__ import unicode_literals
  8. import re
  9. import collections
  10. import os.path
  11. from datetime import datetime
  12. from logging import warning, info
  13. from codecs import open
  14. from pytz import timezone
  15. from pelican import signals, contents
  16. from pelican.utils import get_date
  17. TXT_HEADER = """{0}/index.html
  18. {0}/archives.html
  19. {0}/tags.html
  20. {0}/categories.html
  21. """
  22. XML_HEADER = """<?xml version="1.0" encoding="utf-8"?>
  23. <urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  24. xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
  25. xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  26. """
  27. XML_URL = """
  28. <url>
  29. <loc>{0}/{1}</loc>
  30. <lastmod>{2}</lastmod>
  31. <changefreq>{3}</changefreq>
  32. <priority>{4}</priority>
  33. </url>
  34. """
  35. XML_FOOTER = """
  36. </urlset>
  37. """
  38. def format_date(date):
  39. if date.tzinfo:
  40. tz = date.strftime('%z')
  41. tz = tz[:-2] + ':' + tz[-2:]
  42. else:
  43. tz = "-00:00"
  44. return date.strftime("%Y-%m-%dT%H:%M:%S") + tz
  45. class SitemapGenerator(object):
  46. def __init__(self, context, settings, path, theme, output_path, *null):
  47. self.output_path = output_path
  48. self.context = context
  49. self.now = datetime.now()
  50. self.siteurl = settings.get('SITEURL')
  51. self.default_timezone = settings.get('TIMEZONE', 'UTC')
  52. self.timezone = getattr(self, 'timezone', self.default_timezone)
  53. self.timezone = timezone(self.timezone)
  54. self.format = 'xml'
  55. self.changefreqs = {
  56. 'articles': 'monthly',
  57. 'indexes': 'daily',
  58. 'pages': 'monthly'
  59. }
  60. self.priorities = {
  61. 'articles': 0.5,
  62. 'indexes': 0.5,
  63. 'pages': 0.5
  64. }
  65. self.sitemapExclude = []
  66. config = settings.get('SITEMAP', {})
  67. if not isinstance(config, dict):
  68. warning("sitemap plugin: the SITEMAP setting must be a dict")
  69. else:
  70. fmt = config.get('format')
  71. pris = config.get('priorities')
  72. chfreqs = config.get('changefreqs')
  73. self.sitemapExclude = config.get('exclude', [])
  74. if fmt not in ('xml', 'txt'):
  75. warning("sitemap plugin: SITEMAP['format'] must be `txt' or `xml'")
  76. warning("sitemap plugin: Setting SITEMAP['format'] on `xml'")
  77. elif fmt == 'txt':
  78. self.format = fmt
  79. return
  80. valid_keys = ('articles', 'indexes', 'pages')
  81. valid_chfreqs = ('always', 'hourly', 'daily', 'weekly', 'monthly',
  82. 'yearly', 'never')
  83. if isinstance(pris, dict):
  84. # We use items for Py3k compat. .iteritems() otherwise
  85. for k, v in pris.items():
  86. if k in valid_keys and not isinstance(v, (int, float)):
  87. default = self.priorities[k]
  88. warning("sitemap plugin: priorities must be numbers")
  89. warning("sitemap plugin: setting SITEMAP['priorities']"
  90. "['{0}'] on {1}".format(k, default))
  91. pris[k] = default
  92. self.priorities.update(pris)
  93. elif pris is not None:
  94. warning("sitemap plugin: SITEMAP['priorities'] must be a dict")
  95. warning("sitemap plugin: using the default values")
  96. if isinstance(chfreqs, dict):
  97. # .items() for py3k compat.
  98. for k, v in chfreqs.items():
  99. if k in valid_keys and v not in valid_chfreqs:
  100. default = self.changefreqs[k]
  101. warning("sitemap plugin: invalid changefreq `{0}'".format(v))
  102. warning("sitemap plugin: setting SITEMAP['changefreqs']"
  103. "['{0}'] on '{1}'".format(k, default))
  104. chfreqs[k] = default
  105. self.changefreqs.update(chfreqs)
  106. elif chfreqs is not None:
  107. warning("sitemap plugin: SITEMAP['changefreqs'] must be a dict")
  108. warning("sitemap plugin: using the default values")
  109. def write_url(self, page, fd):
  110. if getattr(page, 'status', 'published') != 'published':
  111. return
  112. if getattr(page, 'private', 'False') == 'True':
  113. return
  114. # We can disable categories/authors/etc by using False instead of ''
  115. if not page.save_as:
  116. return
  117. page_path = os.path.join(self.output_path, page.save_as)
  118. if not os.path.exists(page_path):
  119. return
  120. lastdate = getattr(page, 'date', self.now)
  121. try:
  122. lastdate = self.get_date_modified(page, lastdate)
  123. except ValueError:
  124. warning("sitemap plugin: " + page.save_as + " has invalid modification date,")
  125. warning("sitemap plugin: using date value as lastmod.")
  126. lastmod = format_date(lastdate)
  127. if isinstance(page, contents.Article):
  128. pri = self.priorities['articles']
  129. chfreq = self.changefreqs['articles']
  130. elif isinstance(page, contents.Page):
  131. pri = self.priorities['pages']
  132. chfreq = self.changefreqs['pages']
  133. else:
  134. pri = self.priorities['indexes']
  135. chfreq = self.changefreqs['indexes']
  136. pageurl = '' if page.url == 'index.html' else page.url
  137. # Exclude URLs from the sitemap:
  138. if self.format == 'xml':
  139. flag = False
  140. for regstr in self.sitemapExclude:
  141. if re.match(regstr, pageurl):
  142. flag = True
  143. break
  144. if not flag:
  145. fd.write(XML_URL.format(self.siteurl, pageurl, lastmod, chfreq, pri))
  146. else:
  147. fd.write(self.siteurl + '/' + pageurl + '\n')
  148. def get_date_modified(self, page, default):
  149. if hasattr(page, 'modified'):
  150. if isinstance(page.modified, datetime):
  151. return page.modified
  152. return get_date(page.modified)
  153. else:
  154. return default
  155. def set_url_wrappers_modification_date(self, wrappers):
  156. for (wrapper, articles) in wrappers:
  157. lastmod = datetime.min.replace(tzinfo=self.timezone)
  158. for article in articles:
  159. lastmod = max(lastmod, article.date.replace(tzinfo=self.timezone))
  160. try:
  161. modified = self.get_date_modified(article, datetime.min).replace(tzinfo=self.timezone)
  162. lastmod = max(lastmod, modified)
  163. except ValueError:
  164. # Supressed: user will be notified.
  165. pass
  166. setattr(wrapper, 'modified', str(lastmod))
  167. def generate_output(self, writer):
  168. path = os.path.join(self.output_path, 'sitemap.{0}'.format(self.format))
  169. pages = self.context['pages'] + self.context['articles'] \
  170. + [c for (c, a) in self.context['categories']] \
  171. + [t for (t, a) in self.context['tags']] \
  172. + [a for (a, b) in self.context['authors']]
  173. self.set_url_wrappers_modification_date(self.context['categories'])
  174. self.set_url_wrappers_modification_date(self.context['tags'])
  175. self.set_url_wrappers_modification_date(self.context['authors'])
  176. for article in self.context['articles']:
  177. pages += article.translations
  178. info('writing {0}'.format(path))
  179. with open(path, 'w', encoding='utf-8') as fd:
  180. if self.format == 'xml':
  181. fd.write(XML_HEADER)
  182. else:
  183. fd.write(TXT_HEADER.format(self.siteurl))
  184. FakePage = collections.namedtuple('FakePage',
  185. ['status',
  186. 'date',
  187. 'url',
  188. 'save_as'])
  189. for standard_page_url in ['index.html',
  190. 'archives.html',
  191. 'tags.html',
  192. 'categories.html']:
  193. fake = FakePage(status='published',
  194. date=self.now,
  195. url=standard_page_url,
  196. save_as=standard_page_url)
  197. self.write_url(fake, fd)
  198. # add template pages
  199. # We use items for Py3k compat. .iteritems() otherwise
  200. for path, template_page_url in self.context['TEMPLATE_PAGES'].items():
  201. # don't add duplicate entry for index page
  202. if template_page_url == 'index.html':
  203. continue
  204. fake = FakePage(status='published',
  205. date=self.now,
  206. url=template_page_url,
  207. save_as=template_page_url)
  208. self.write_url(fake, fd)
  209. for page in pages:
  210. self.write_url(page, fd)
  211. if self.format == 'xml':
  212. fd.write(XML_FOOTER)
  213. def get_generators(generators):
  214. return SitemapGenerator
  215. def register():
  216. signals.get_generators.connect(get_generators)