sitemap.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. # -*- coding: utf-8 -*-
  2. from __future__ import unicode_literals
  3. import collections
  4. import os.path
  5. from datetime import datetime
  6. from logging import warning, info
  7. from codecs import open
  8. from pelican import signals, contents
  9. TXT_HEADER = """{0}/index.html
  10. {0}/archives.html
  11. {0}/tags.html
  12. {0}/categories.html
  13. """
  14. XML_HEADER = """<?xml version="1.0" encoding="utf-8"?>
  15. <urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  16. xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
  17. xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  18. """
  19. XML_URL = """
  20. <url>
  21. <loc>{0}/{1}</loc>
  22. <lastmod>{2}</lastmod>
  23. <changefreq>{3}</changefreq>
  24. <priority>{4}</priority>
  25. </url>
  26. """
  27. XML_FOOTER = """
  28. </urlset>
  29. """
  30. def format_date(date):
  31. if date.tzinfo:
  32. tz = date.strftime('%s')
  33. tz = tz[:-2] + ':' + tz[-2:]
  34. else:
  35. tz = "-00:00"
  36. return date.strftime("%Y-%m-%dT%H:%M:%S") + tz
  37. class SitemapGenerator(object):
  38. def __init__(self, context, settings, path, theme, output_path, *null):
  39. self.output_path = output_path
  40. self.context = context
  41. self.now = datetime.now()
  42. self.siteurl = settings.get('SITEURL')
  43. self.format = 'xml'
  44. self.changefreqs = {
  45. 'articles': 'monthly',
  46. 'indexes': 'daily',
  47. 'pages': 'monthly'
  48. }
  49. self.priorities = {
  50. 'articles': 0.5,
  51. 'indexes': 0.5,
  52. 'pages': 0.5
  53. }
  54. config = settings.get('SITEMAP', {})
  55. if not isinstance(config, dict):
  56. warning("sitemap plugin: the SITEMAP setting must be a dict")
  57. else:
  58. fmt = config.get('format')
  59. pris = config.get('priorities')
  60. chfreqs = config.get('changefreqs')
  61. if fmt not in ('xml', 'txt'):
  62. warning("sitemap plugin: SITEMAP['format'] must be `txt' or `xml'")
  63. warning("sitemap plugin: Setting SITEMAP['format'] on `xml'")
  64. elif fmt == 'txt':
  65. self.format = fmt
  66. return
  67. valid_keys = ('articles', 'indexes', 'pages')
  68. valid_chfreqs = ('always', 'hourly', 'daily', 'weekly', 'monthly',
  69. 'yearly', 'never')
  70. if isinstance(pris, dict):
  71. # We use items for Py3k compat. .iteritems() otherwise
  72. for k, v in pris.items():
  73. if k in valid_keys and not isinstance(v, (int, float)):
  74. default = self.priorities[k]
  75. warning("sitemap plugin: priorities must be numbers")
  76. warning("sitemap plugin: setting SITEMAP['priorities']"
  77. "['{0}'] on {1}".format(k, default))
  78. pris[k] = default
  79. self.priorities.update(pris)
  80. elif pris is not None:
  81. warning("sitemap plugin: SITEMAP['priorities'] must be a dict")
  82. warning("sitemap plugin: using the default values")
  83. if isinstance(chfreqs, dict):
  84. # .items() for py3k compat.
  85. for k, v in chfreqs.items():
  86. if k in valid_keys and v not in valid_chfreqs:
  87. default = self.changefreqs[k]
  88. warning("sitemap plugin: invalid changefreq `{0}'".format(v))
  89. warning("sitemap plugin: setting SITEMAP['changefreqs']"
  90. "['{0}'] on '{1}'".format(k, default))
  91. chfreqs[k] = default
  92. self.changefreqs.update(chfreqs)
  93. elif chfreqs is not None:
  94. warning("sitemap plugin: SITEMAP['changefreqs'] must be a dict")
  95. warning("sitemap plugin: using the default values")
  96. def write_url(self, page, fd):
  97. if getattr(page, 'status', 'published') != 'published':
  98. return
  99. page_path = os.path.join(self.output_path, page.url)
  100. if not os.path.exists(page_path):
  101. return
  102. lastmod = format_date(getattr(page, 'date', self.now))
  103. if isinstance(page, contents.Article):
  104. pri = self.priorities['articles']
  105. chfreq = self.changefreqs['articles']
  106. elif isinstance(page, contents.Page):
  107. pri = self.priorities['pages']
  108. chfreq = self.changefreqs['pages']
  109. else:
  110. pri = self.priorities['indexes']
  111. chfreq = self.changefreqs['indexes']
  112. if self.format == 'xml':
  113. fd.write(XML_URL.format(self.siteurl, page.url, lastmod, chfreq, pri))
  114. else:
  115. fd.write(self.siteurl + '/' + loc + '\n')
  116. def generate_output(self, writer):
  117. path = os.path.join(self.output_path, 'sitemap.{0}'.format(self.format))
  118. pages = self.context['pages'] + self.context['articles'] \
  119. + [ c for (c, a) in self.context['categories']] \
  120. + [ t for (t, a) in self.context['tags']] \
  121. + [ a for (a, b) in self.context['authors']]
  122. for article in self.context['articles']:
  123. pages += article.translations
  124. info('writing {0}'.format(path))
  125. with open(path, 'w', encoding='utf-8') as fd:
  126. if self.format == 'xml':
  127. fd.write(XML_HEADER)
  128. else:
  129. fd.write(TXT_HEADER.format(self.siteurl))
  130. FakePage = collections.namedtuple('FakePage',
  131. ['status',
  132. 'date',
  133. 'url'])
  134. for standard_page_url in ['index.html',
  135. 'archives.html',
  136. 'tags.html',
  137. 'categories.html']:
  138. fake = FakePage(status='published',
  139. date=self.now,
  140. url=standard_page_url)
  141. self.write_url(fake, fd)
  142. for page in pages:
  143. self.write_url(page, fd)
  144. if self.format == 'xml':
  145. fd.write(XML_FOOTER)
  146. def get_generators(generators):
  147. return SitemapGenerator
  148. def register():
  149. signals.get_generators.connect(get_generators)