sitemap.py 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. # -*- coding: utf-8 -*-
  2. '''
  3. Sitemap
  4. -------
  5. The sitemap plugin generates plain-text or XML sitemaps.
  6. '''
  7. from __future__ import unicode_literals
  8. import collections
  9. import os.path
  10. from datetime import datetime
  11. from logging import warning, info
  12. from codecs import open
  13. from pytz import timezone
  14. from pelican import signals, contents
  15. from pelican.utils import get_date
  16. TXT_HEADER = """{0}/index.html
  17. {0}/archives.html
  18. {0}/tags.html
  19. {0}/categories.html
  20. """
  21. XML_HEADER = """<?xml version="1.0" encoding="utf-8"?>
  22. <urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  23. xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
  24. xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  25. """
  26. XML_URL = """
  27. <url>
  28. <loc>{0}/{1}</loc>
  29. <lastmod>{2}</lastmod>
  30. <changefreq>{3}</changefreq>
  31. <priority>{4}</priority>
  32. </url>
  33. """
  34. XML_FOOTER = """
  35. </urlset>
  36. """
  37. def format_date(date):
  38. if date.tzinfo:
  39. tz = date.strftime('%z')
  40. tz = tz[:-2] + ':' + tz[-2:]
  41. else:
  42. tz = "-00:00"
  43. return date.strftime("%Y-%m-%dT%H:%M:%S") + tz
  44. class SitemapGenerator(object):
  45. def __init__(self, context, settings, path, theme, output_path, *null):
  46. self.output_path = output_path
  47. self.context = context
  48. self.now = datetime.now()
  49. self.siteurl = settings.get('SITEURL')
  50. self.default_timezone = settings.get('TIMEZONE', 'UTC')
  51. self.timezone = getattr(self, 'timezone', self.default_timezone)
  52. self.timezone = timezone(self.timezone)
  53. self.format = 'xml'
  54. self.changefreqs = {
  55. 'articles': 'monthly',
  56. 'indexes': 'daily',
  57. 'pages': 'monthly'
  58. }
  59. self.priorities = {
  60. 'articles': 0.5,
  61. 'indexes': 0.5,
  62. 'pages': 0.5
  63. }
  64. config = settings.get('SITEMAP', {})
  65. if not isinstance(config, dict):
  66. warning("sitemap plugin: the SITEMAP setting must be a dict")
  67. else:
  68. fmt = config.get('format')
  69. pris = config.get('priorities')
  70. chfreqs = config.get('changefreqs')
  71. if fmt not in ('xml', 'txt'):
  72. warning("sitemap plugin: SITEMAP['format'] must be `txt' or `xml'")
  73. warning("sitemap plugin: Setting SITEMAP['format'] on `xml'")
  74. elif fmt == 'txt':
  75. self.format = fmt
  76. return
  77. valid_keys = ('articles', 'indexes', 'pages')
  78. valid_chfreqs = ('always', 'hourly', 'daily', 'weekly', 'monthly',
  79. 'yearly', 'never')
  80. if isinstance(pris, dict):
  81. # We use items for Py3k compat. .iteritems() otherwise
  82. for k, v in pris.items():
  83. if k in valid_keys and not isinstance(v, (int, float)):
  84. default = self.priorities[k]
  85. warning("sitemap plugin: priorities must be numbers")
  86. warning("sitemap plugin: setting SITEMAP['priorities']"
  87. "['{0}'] on {1}".format(k, default))
  88. pris[k] = default
  89. self.priorities.update(pris)
  90. elif pris is not None:
  91. warning("sitemap plugin: SITEMAP['priorities'] must be a dict")
  92. warning("sitemap plugin: using the default values")
  93. if isinstance(chfreqs, dict):
  94. # .items() for py3k compat.
  95. for k, v in chfreqs.items():
  96. if k in valid_keys and v not in valid_chfreqs:
  97. default = self.changefreqs[k]
  98. warning("sitemap plugin: invalid changefreq `{0}'".format(v))
  99. warning("sitemap plugin: setting SITEMAP['changefreqs']"
  100. "['{0}'] on '{1}'".format(k, default))
  101. chfreqs[k] = default
  102. self.changefreqs.update(chfreqs)
  103. elif chfreqs is not None:
  104. warning("sitemap plugin: SITEMAP['changefreqs'] must be a dict")
  105. warning("sitemap plugin: using the default values")
  106. def write_url(self, page, fd):
  107. if getattr(page, 'status', 'published') != 'published':
  108. return
  109. # We can disable categories/authors/etc by using False instead of ''
  110. if not page.save_as:
  111. return
  112. page_path = os.path.join(self.output_path, page.save_as)
  113. if not os.path.exists(page_path):
  114. return
  115. lastdate = getattr(page, 'date', self.now)
  116. try:
  117. lastdate = self.get_date_modified(page, lastdate)
  118. except ValueError:
  119. warning("sitemap plugin: " + page.save_as + " has invalid modification date,")
  120. warning("sitemap plugin: using date value as lastmod.")
  121. lastmod = format_date(lastdate)
  122. if isinstance(page, contents.Article):
  123. pri = self.priorities['articles']
  124. chfreq = self.changefreqs['articles']
  125. elif isinstance(page, contents.Page):
  126. pri = self.priorities['pages']
  127. chfreq = self.changefreqs['pages']
  128. else:
  129. pri = self.priorities['indexes']
  130. chfreq = self.changefreqs['indexes']
  131. pageurl = '' if page.url == 'index.html' else page.url
  132. #Exclude URLs from the sitemap:
  133. sitemapExclude = []
  134. if self.format == 'xml':
  135. if pageurl not in sitemapExclude:
  136. fd.write(XML_URL.format(self.siteurl, pageurl, lastmod, chfreq, pri))
  137. else:
  138. fd.write(self.siteurl + '/' + pageurl + '\n')
  139. def get_date_modified(self, page, default):
  140. if hasattr(page, 'modified'):
  141. if isinstance(page.modified, datetime):
  142. return page.modified
  143. return get_date(page.modified)
  144. else:
  145. return default
  146. def set_url_wrappers_modification_date(self, wrappers):
  147. for (wrapper, articles) in wrappers:
  148. lastmod = datetime.min.replace(tzinfo=self.timezone)
  149. for article in articles:
  150. lastmod = max(lastmod, article.date.replace(tzinfo=self.timezone))
  151. try:
  152. modified = self.get_date_modified(article, datetime.min).replace(tzinfo=self.timezone)
  153. lastmod = max(lastmod, modified)
  154. except ValueError:
  155. # Supressed: user will be notified.
  156. pass
  157. setattr(wrapper, 'modified', str(lastmod))
  158. def generate_output(self, writer):
  159. path = os.path.join(self.output_path, 'sitemap.{0}'.format(self.format))
  160. pages = self.context['pages'] + self.context['articles'] \
  161. + [ c for (c, a) in self.context['categories']] \
  162. + [ t for (t, a) in self.context['tags']] \
  163. + [ a for (a, b) in self.context['authors']]
  164. self.set_url_wrappers_modification_date(self.context['categories'])
  165. self.set_url_wrappers_modification_date(self.context['tags'])
  166. self.set_url_wrappers_modification_date(self.context['authors'])
  167. for article in self.context['articles']:
  168. pages += article.translations
  169. info('writing {0}'.format(path))
  170. with open(path, 'w', encoding='utf-8') as fd:
  171. if self.format == 'xml':
  172. fd.write(XML_HEADER)
  173. else:
  174. fd.write(TXT_HEADER.format(self.siteurl))
  175. FakePage = collections.namedtuple('FakePage',
  176. ['status',
  177. 'date',
  178. 'url',
  179. 'save_as'])
  180. for standard_page_url in ['index.html',
  181. 'archives.html',
  182. 'tags.html',
  183. 'categories.html']:
  184. fake = FakePage(status='published',
  185. date=self.now,
  186. url=standard_page_url,
  187. save_as=standard_page_url)
  188. self.write_url(fake, fd)
  189. for page in pages:
  190. self.write_url(page, fd)
  191. if self.format == 'xml':
  192. fd.write(XML_FOOTER)
  193. def get_generators(generators):
  194. return SitemapGenerator
  195. def register():
  196. signals.get_generators.connect(get_generators)