sitemap.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. # -*- coding: utf-8 -*-
  2. '''
  3. Sitemap
  4. -------
  5. The sitemap plugin generates plain-text or XML sitemaps.
  6. '''
  7. from __future__ import unicode_literals
  8. import collections
  9. import os.path
  10. from datetime import datetime
  11. from logging import warning, info
  12. from codecs import open
  13. from pelican import signals, contents
# Preamble of the plain-text sitemap: the four standard index pages, one URL
# per line.  ``{0}`` is filled with the site URL.
TXT_HEADER = """{0}/index.html
{0}/archives.html
{0}/tags.html
{0}/categories.html
"""
# XML declaration and opening ``<urlset>`` tag of the XML sitemap.
XML_HEADER = """<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
"""
# One ``<url>`` entry.  Positional fields: {0} site URL, {1} page URL,
# {2} last-modification timestamp, {3} change frequency, {4} priority.
XML_URL = """
<url>
<loc>{0}/{1}</loc>
<lastmod>{2}</lastmod>
<changefreq>{3}</changefreq>
<priority>{4}</priority>
</url>
"""
# Closing ``</urlset>`` tag of the XML sitemap.
XML_FOOTER = """
</urlset>
"""
  35. def format_date(date):
  36. if date.tzinfo:
  37. tz = date.strftime('%s')
  38. tz = tz[:-2] + ':' + tz[-2:]
  39. else:
  40. tz = "-00:00"
  41. return date.strftime("%Y-%m-%dT%H:%M:%S") + tz
  42. class SitemapGenerator(object):
  43. def __init__(self, context, settings, path, theme, output_path, *null):
  44. self.output_path = output_path
  45. self.context = context
  46. self.now = datetime.now()
  47. self.siteurl = settings.get('SITEURL')
  48. self.format = 'xml'
  49. self.changefreqs = {
  50. 'articles': 'monthly',
  51. 'indexes': 'daily',
  52. 'pages': 'monthly'
  53. }
  54. self.priorities = {
  55. 'articles': 0.5,
  56. 'indexes': 0.5,
  57. 'pages': 0.5
  58. }
  59. config = settings.get('SITEMAP', {})
  60. if not isinstance(config, dict):
  61. warning("sitemap plugin: the SITEMAP setting must be a dict")
  62. else:
  63. fmt = config.get('format')
  64. pris = config.get('priorities')
  65. chfreqs = config.get('changefreqs')
  66. if fmt not in ('xml', 'txt'):
  67. warning("sitemap plugin: SITEMAP['format'] must be `txt' or `xml'")
  68. warning("sitemap plugin: Setting SITEMAP['format'] on `xml'")
  69. elif fmt == 'txt':
  70. self.format = fmt
  71. return
  72. valid_keys = ('articles', 'indexes', 'pages')
  73. valid_chfreqs = ('always', 'hourly', 'daily', 'weekly', 'monthly',
  74. 'yearly', 'never')
  75. if isinstance(pris, dict):
  76. # We use items for Py3k compat. .iteritems() otherwise
  77. for k, v in pris.items():
  78. if k in valid_keys and not isinstance(v, (int, float)):
  79. default = self.priorities[k]
  80. warning("sitemap plugin: priorities must be numbers")
  81. warning("sitemap plugin: setting SITEMAP['priorities']"
  82. "['{0}'] on {1}".format(k, default))
  83. pris[k] = default
  84. self.priorities.update(pris)
  85. elif pris is not None:
  86. warning("sitemap plugin: SITEMAP['priorities'] must be a dict")
  87. warning("sitemap plugin: using the default values")
  88. if isinstance(chfreqs, dict):
  89. # .items() for py3k compat.
  90. for k, v in chfreqs.items():
  91. if k in valid_keys and v not in valid_chfreqs:
  92. default = self.changefreqs[k]
  93. warning("sitemap plugin: invalid changefreq `{0}'".format(v))
  94. warning("sitemap plugin: setting SITEMAP['changefreqs']"
  95. "['{0}'] on '{1}'".format(k, default))
  96. chfreqs[k] = default
  97. self.changefreqs.update(chfreqs)
  98. elif chfreqs is not None:
  99. warning("sitemap plugin: SITEMAP['changefreqs'] must be a dict")
  100. warning("sitemap plugin: using the default values")
  101. def write_url(self, page, fd):
  102. if getattr(page, 'status', 'published') != 'published':
  103. return
  104. page_path = os.path.join(self.output_path, page.url)
  105. if not os.path.exists(page_path):
  106. return
  107. lastmod = format_date(getattr(page, 'date', self.now))
  108. if isinstance(page, contents.Article):
  109. pri = self.priorities['articles']
  110. chfreq = self.changefreqs['articles']
  111. elif isinstance(page, contents.Page):
  112. pri = self.priorities['pages']
  113. chfreq = self.changefreqs['pages']
  114. else:
  115. pri = self.priorities['indexes']
  116. chfreq = self.changefreqs['indexes']
  117. if self.format == 'xml':
  118. fd.write(XML_URL.format(self.siteurl, page.url, lastmod, chfreq, pri))
  119. else:
  120. fd.write(self.siteurl + '/' + loc + '\n')
  121. def generate_output(self, writer):
  122. path = os.path.join(self.output_path, 'sitemap.{0}'.format(self.format))
  123. pages = self.context['pages'] + self.context['articles'] \
  124. + [ c for (c, a) in self.context['categories']] \
  125. + [ t for (t, a) in self.context['tags']] \
  126. + [ a for (a, b) in self.context['authors']]
  127. for article in self.context['articles']:
  128. pages += article.translations
  129. info('writing {0}'.format(path))
  130. with open(path, 'w', encoding='utf-8') as fd:
  131. if self.format == 'xml':
  132. fd.write(XML_HEADER)
  133. else:
  134. fd.write(TXT_HEADER.format(self.siteurl))
  135. FakePage = collections.namedtuple('FakePage',
  136. ['status',
  137. 'date',
  138. 'url'])
  139. for standard_page_url in ['index.html',
  140. 'archives.html',
  141. 'tags.html',
  142. 'categories.html']:
  143. fake = FakePage(status='published',
  144. date=self.now,
  145. url=standard_page_url)
  146. self.write_url(fake, fd)
  147. for page in pages:
  148. self.write_url(page, fd)
  149. if self.format == 'xml':
  150. fd.write(XML_FOOTER)
  151. def get_generators(generators):
  152. return SitemapGenerator
  153. def register():
  154. signals.get_generators.connect(get_generators)