sitemap.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231
  1. # -*- coding: utf-8 -*-
  2. '''
  3. Sitemap
  4. -------
  5. The sitemap plugin generates plain-text or XML sitemaps.
  6. '''
  7. from __future__ import unicode_literals
  8. import collections
  9. import os.path
  10. from datetime import datetime
  11. from logging import warning, info
  12. from codecs import open
  13. from pelican import signals, contents
  14. from pelican.utils import get_date
# Preamble of the plain-text sitemap: the site's standard index pages, one
# URL per line. ``{0}`` is filled with the site URL.
TXT_HEADER = """{0}/index.html
{0}/archives.html
{0}/tags.html
{0}/categories.html
"""

# Opening of the XML sitemap: XML declaration plus the <urlset> root element
# with the sitemaps.org 0.9 namespace and schema location.
XML_HEADER = """<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
"""

# One <url> entry. Positional fields, as filled in by
# SitemapGenerator.write_url: {0} site URL, {1} page URL, {2} lastmod,
# {3} changefreq, {4} priority.
XML_URL = """
<url>
<loc>{0}/{1}</loc>
<lastmod>{2}</lastmod>
<changefreq>{3}</changefreq>
<priority>{4}</priority>
</url>
"""

# Closes the <urlset> root element opened by XML_HEADER.
XML_FOOTER = """
</urlset>
"""
  36. def format_date(date):
  37. if date.tzinfo:
  38. tz = date.strftime('%s')
  39. tz = tz[:-2] + ':' + tz[-2:]
  40. else:
  41. tz = "-00:00"
  42. return date.strftime("%Y-%m-%dT%H:%M:%S") + tz
  43. class SitemapGenerator(object):
  44. def __init__(self, context, settings, path, theme, output_path, *null):
  45. self.output_path = output_path
  46. self.context = context
  47. self.now = datetime.now()
  48. self.siteurl = settings.get('SITEURL')
  49. self.format = 'xml'
  50. self.changefreqs = {
  51. 'articles': 'monthly',
  52. 'indexes': 'daily',
  53. 'pages': 'monthly'
  54. }
  55. self.priorities = {
  56. 'articles': 0.5,
  57. 'indexes': 0.5,
  58. 'pages': 0.5
  59. }
  60. config = settings.get('SITEMAP', {})
  61. if not isinstance(config, dict):
  62. warning("sitemap plugin: the SITEMAP setting must be a dict")
  63. else:
  64. fmt = config.get('format')
  65. pris = config.get('priorities')
  66. chfreqs = config.get('changefreqs')
  67. if fmt not in ('xml', 'txt'):
  68. warning("sitemap plugin: SITEMAP['format'] must be `txt' or `xml'")
  69. warning("sitemap plugin: Setting SITEMAP['format'] on `xml'")
  70. elif fmt == 'txt':
  71. self.format = fmt
  72. return
  73. valid_keys = ('articles', 'indexes', 'pages')
  74. valid_chfreqs = ('always', 'hourly', 'daily', 'weekly', 'monthly',
  75. 'yearly', 'never')
  76. if isinstance(pris, dict):
  77. # We use items for Py3k compat. .iteritems() otherwise
  78. for k, v in pris.items():
  79. if k in valid_keys and not isinstance(v, (int, float)):
  80. default = self.priorities[k]
  81. warning("sitemap plugin: priorities must be numbers")
  82. warning("sitemap plugin: setting SITEMAP['priorities']"
  83. "['{0}'] on {1}".format(k, default))
  84. pris[k] = default
  85. self.priorities.update(pris)
  86. elif pris is not None:
  87. warning("sitemap plugin: SITEMAP['priorities'] must be a dict")
  88. warning("sitemap plugin: using the default values")
  89. if isinstance(chfreqs, dict):
  90. # .items() for py3k compat.
  91. for k, v in chfreqs.items():
  92. if k in valid_keys and v not in valid_chfreqs:
  93. default = self.changefreqs[k]
  94. warning("sitemap plugin: invalid changefreq `{0}'".format(v))
  95. warning("sitemap plugin: setting SITEMAP['changefreqs']"
  96. "['{0}'] on '{1}'".format(k, default))
  97. chfreqs[k] = default
  98. self.changefreqs.update(chfreqs)
  99. elif chfreqs is not None:
  100. warning("sitemap plugin: SITEMAP['changefreqs'] must be a dict")
  101. warning("sitemap plugin: using the default values")
  102. def write_url(self, page, fd):
  103. if getattr(page, 'status', 'published') != 'published':
  104. return
  105. page_path = os.path.join(self.output_path, page.url)
  106. if not os.path.exists(page_path):
  107. return
  108. lastdate = getattr(page, 'date', self.now)
  109. try:
  110. lastdate = self.get_date_modified(page, lastdate)
  111. except ValueError:
  112. warning("sitemap plugin: " + page.url + " has invalid modification date,")
  113. warning("sitemap plugin: using date value as lastmod.")
  114. lastmod = format_date(lastdate)
  115. if isinstance(page, contents.Article):
  116. pri = self.priorities['articles']
  117. chfreq = self.changefreqs['articles']
  118. elif isinstance(page, contents.Page):
  119. pri = self.priorities['pages']
  120. chfreq = self.changefreqs['pages']
  121. else:
  122. pri = self.priorities['indexes']
  123. chfreq = self.changefreqs['indexes']
  124. if self.format == 'xml':
  125. fd.write(XML_URL.format(self.siteurl, page.url, lastmod, chfreq, pri))
  126. else:
  127. fd.write(self.siteurl + '/' + loc + '\n')
  128. def get_date_modified(self, page, default):
  129. if hasattr(page, 'modified'):
  130. if isinstance(page.modified, datetime):
  131. return page.modified
  132. return get_date(page.modified)
  133. else:
  134. return default
  135. def set_url_wrappers_modification_date(self, wrappers):
  136. for (wrapper, articles) in wrappers:
  137. lastmod = datetime.min
  138. for article in articles:
  139. lastmod = max(lastmod, article.date)
  140. try:
  141. modified = self.get_date_modified(article, datetime.min);
  142. lastmod = max(lastmod, modified)
  143. except ValueError:
  144. # Supressed: user will be notified.
  145. pass
  146. setattr(wrapper, 'modified', str(lastmod))
  147. def generate_output(self, writer):
  148. path = os.path.join(self.output_path, 'sitemap.{0}'.format(self.format))
  149. pages = self.context['pages'] + self.context['articles'] \
  150. + [ c for (c, a) in self.context['categories']] \
  151. + [ t for (t, a) in self.context['tags']] \
  152. + [ a for (a, b) in self.context['authors']]
  153. self.set_url_wrappers_modification_date(self.context['categories'])
  154. self.set_url_wrappers_modification_date(self.context['tags'])
  155. self.set_url_wrappers_modification_date(self.context['authors'])
  156. for article in self.context['articles']:
  157. pages += article.translations
  158. info('writing {0}'.format(path))
  159. with open(path, 'w', encoding='utf-8') as fd:
  160. if self.format == 'xml':
  161. fd.write(XML_HEADER)
  162. else:
  163. fd.write(TXT_HEADER.format(self.siteurl))
  164. FakePage = collections.namedtuple('FakePage',
  165. ['status',
  166. 'date',
  167. 'url'])
  168. for standard_page_url in ['index.html',
  169. 'archives.html',
  170. 'tags.html',
  171. 'categories.html']:
  172. fake = FakePage(status='published',
  173. date=self.now,
  174. url=standard_page_url)
  175. self.write_url(fake, fd)
  176. for page in pages:
  177. self.write_url(page, fd)
  178. if self.format == 'xml':
  179. fd.write(XML_FOOTER)
  180. def get_generators(generators):
  181. return SitemapGenerator
  182. def register():
  183. signals.get_generators.connect(get_generators)