sitemap.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. # -*- coding: utf-8 -*-
  2. #
  3. # cms.py - simple WSGI/Python based CMS script
  4. #
  5. # Copyright (C) 2021 Michael Buesch <m@bues.ch>
  6. #
  7. # This program is free software: you can redistribute it and/or modify
  8. # it under the terms of the GNU General Public License as published by
  9. # the Free Software Foundation, either version 2 of the License, or
  10. # (at your option) any later version.
  11. #
  12. # This program is distributed in the hope that it will be useful,
  13. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. # GNU General Public License for more details.
  16. #
  17. # You should have received a copy of the GNU General Public License
  18. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  19. #from cms.cython_support cimport * #@cy
  20. from cms.db import *
  21. from cms.exception import *
  22. from cms.pageident import *
  23. #from cms.util import * #+cimport
  24. from xml.sax import saxutils
  25. __all__ = [
  26. "CMSSiteMap",
  27. ]
  28. class CMSSiteMap(object):
  29. """Site map generator.
  30. Specification: https://www.sitemaps.org/protocol.html
  31. """
  32. BASE_INDENT = 1
  33. INDENT = " "
  34. MORE_ESCAPES = {
  35. "'" : "&apos;",
  36. '"' : "&quot;",
  37. }
  38. def __init__(self, db, domain, urlBase):
  39. self.__db = db
  40. self.__domain = domain
  41. self.__urlBase = urlBase
  42. @classmethod
  43. def __xmlQuote(cls, string):
  44. return saxutils.escape(string, cls.MORE_ESCAPES)
  45. @classmethod
  46. def __oneElem(cls, ind, url, lastmod=None, changefreq=None, prio=None):
  47. ret = [ f'{ind}<url>' ]
  48. url = cls.__xmlQuote(url)
  49. ret.append(f'{ind}{cls.INDENT}<loc>{url}</loc>')
  50. if lastmod:
  51. lastmod = cls.__xmlQuote(lastmod)
  52. ret.append(f'{ind}{cls.INDENT}<lastmod>{lastmod}</lastmod>')
  53. if changefreq:
  54. changefreq = cls.__xmlQuote(changefreq)
  55. ret.append(f'{ind}{cls.INDENT}<changefreq>{changefreq}</changefreq>')
  56. if prio:
  57. prio = cls.__xmlQuote(prio)
  58. ret.append(f'{ind}{cls.INDENT}<priority>{prio}</priority>')
  59. ret.append(f'{ind}</url>')
  60. return ret
  61. def __getUrlElems(self, pageIdent, protocol, indent=BASE_INDENT):
  62. if self.__db.getNavStop(pageIdent):
  63. return
  64. ind = self.INDENT * indent
  65. if indent <= self.BASE_INDENT + 1:
  66. pageSuffix = "/" # Groups.
  67. else:
  68. pageSuffix = ".html" # Pages and sub groups.
  69. url = pageIdent.getUrl(protocol=protocol,
  70. domain=self.__domain,
  71. urlBase=self.__urlBase,
  72. pageSuffix=pageSuffix)
  73. if indent == self.BASE_INDENT + 1:
  74. # Main groups.
  75. lastmod = None
  76. changefreq = "monthly"
  77. prio = "0.3"
  78. else:
  79. # Pages, main page and sub groups.
  80. lastmod = self.__db.getPageStamp(pageIdent).strftime("%Y-%m-%dT%H:%M:%SZ")
  81. changefreq = None
  82. prio = "0.7"
  83. yield self.__oneElem(ind=ind,
  84. url=url,
  85. lastmod=lastmod,
  86. changefreq=changefreq,
  87. prio=prio)
  88. subPages = self.__db.getSubPages(pageIdent)
  89. if subPages:
  90. for pagename, pagelabel, pageprio in subPages:
  91. subPageIdent = CMSPageIdent(pageIdent + [pagename])
  92. yield from self.__getUrlElems(subPageIdent,
  93. protocol,
  94. indent + 1)
  95. def __getUserUrlElems(self, protocol):
  96. userSiteMap = self.__db.getString("site-map")
  97. if not userSiteMap:
  98. return
  99. for line in userSiteMap.splitlines():
  100. line = line.strip()
  101. if not line or line.startswith("#"):
  102. continue
  103. lineItems = line.split()
  104. if len(lineItems) == 1:
  105. url, prio, changefreq = lineItems[0], "0.7", "always"
  106. elif len(lineItems) == 2:
  107. url, prio, changefreq = lineItems[0], lineItems[1], "always"
  108. elif len(lineItems) == 3:
  109. url, prio, changefreq = lineItems[0], lineItems[1], lineItems[2]
  110. else:
  111. raise CMSException(500, "site-map: Invalid line format.")
  112. try:
  113. float(prio)
  114. except Exception:
  115. raise CMSException(500, "site-map: Invalid priority value.")
  116. if changefreq not in ("always", "hourly", "daily", "weekly",
  117. "monthly", "yearly", "never",):
  118. raise CMSException(500, "site-map: Invalid changefreq value.")
  119. url = f'{protocol}://{self.__domain}/{url}'
  120. yield self.__oneElem(ind=self.INDENT,
  121. url=url,
  122. changefreq=changefreq,
  123. prio=prio)
  124. def getSiteMap(self, rootPageIdent, protocol):
  125. ret = [ '<?xml version="1.0" encoding="UTF-8"?>' ]
  126. ret.append('<urlset xmlns="https://www.sitemaps.org/schemas/sitemap/0.9" '
  127. 'xmlns:xsi="https://www.w3.org/2001/XMLSchema-instance" '
  128. 'xsi:schemaLocation="https://www.sitemaps.org/schemas/sitemap/0.9 '
  129. 'https://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">')
  130. for urlElemLines in self.__getUrlElems(rootPageIdent, protocol):
  131. ret.extend(urlElemLines)
  132. for urlElemLines in self.__getUserUrlElems(protocol):
  133. ret.extend(urlElemLines)
  134. ret.append('</urlset>')
  135. return "\n".join(ret)