sitemap.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
  1. # -*- coding: utf-8 -*-
  2. '''
  3. Sitemap
  4. -------
  5. The sitemap plugin generates plain-text or XML sitemaps.
  6. '''
  7. from __future__ import unicode_literals
  8. import re
  9. import collections
  10. import os.path
  11. from datetime import datetime
  12. from logging import warning, info
  13. from codecs import open
  14. from pytz import timezone
  15. from pelican import signals, contents
  16. from pelican.utils import get_date
# Opening lines of a plain-text sitemap: the four standard index pages.
# {0} is the site URL.
TXT_HEADER = """{0}/index.html
{0}/archives.html
{0}/tags.html
{0}/categories.html
"""

# XML prolog and opening <urlset> element of an XML sitemap.
XML_HEADER = """<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
"""

# One <url> entry of an XML sitemap.
# {0} site URL, {1} page URL relative to the site, {2} last modification
# date, {3} change frequency, {4} priority.
XML_URL = """
<url>
<loc>{0}/{1}</loc>
<lastmod>{2}</lastmod>
<changefreq>{3}</changefreq>
<priority>{4}</priority>
</url>
"""

# Closing tag of an XML sitemap.
XML_FOOTER = """
</urlset>
"""
  38. def format_date(date):
  39. if date.tzinfo:
  40. tz = date.strftime('%z')
  41. tz = tz[:-2] + ':' + tz[-2:]
  42. else:
  43. tz = "-00:00"
  44. return date.strftime("%Y-%m-%dT%H:%M:%S") + tz
  45. class SitemapGenerator(object):
  46. def __init__(self, context, settings, path, theme, output_path, *null):
  47. self.output_path = output_path
  48. self.context = context
  49. self.now = datetime.now()
  50. self.siteurl = settings.get('SITEURL')
  51. self.default_timezone = settings.get('TIMEZONE', 'UTC')
  52. self.timezone = getattr(self, 'timezone', self.default_timezone)
  53. self.timezone = timezone(self.timezone)
  54. self.format = 'xml'
  55. self.changefreqs = {
  56. 'articles': 'monthly',
  57. 'indexes': 'daily',
  58. 'pages': 'monthly'
  59. }
  60. self.priorities = {
  61. 'articles': 0.5,
  62. 'indexes': 0.5,
  63. 'pages': 0.5
  64. }
  65. self.sitemapExclude = []
  66. config = settings.get('SITEMAP', {})
  67. if not isinstance(config, dict):
  68. warning("sitemap plugin: the SITEMAP setting must be a dict")
  69. else:
  70. fmt = config.get('format')
  71. pris = config.get('priorities')
  72. chfreqs = config.get('changefreqs')
  73. self.sitemapExclude = config.get('exclude', [])
  74. if fmt not in ('xml', 'txt'):
  75. warning("sitemap plugin: SITEMAP['format'] must be `txt' or `xml'")
  76. warning("sitemap plugin: Setting SITEMAP['format'] on `xml'")
  77. elif fmt == 'txt':
  78. self.format = fmt
  79. return
  80. valid_keys = ('articles', 'indexes', 'pages')
  81. valid_chfreqs = ('always', 'hourly', 'daily', 'weekly', 'monthly',
  82. 'yearly', 'never')
  83. if isinstance(pris, dict):
  84. # We use items for Py3k compat. .iteritems() otherwise
  85. for k, v in pris.items():
  86. if k in valid_keys and not isinstance(v, (int, float)):
  87. default = self.priorities[k]
  88. warning("sitemap plugin: priorities must be numbers")
  89. warning("sitemap plugin: setting SITEMAP['priorities']"
  90. "['{0}'] on {1}".format(k, default))
  91. pris[k] = default
  92. self.priorities.update(pris)
  93. elif pris is not None:
  94. warning("sitemap plugin: SITEMAP['priorities'] must be a dict")
  95. warning("sitemap plugin: using the default values")
  96. if isinstance(chfreqs, dict):
  97. # .items() for py3k compat.
  98. for k, v in chfreqs.items():
  99. if k in valid_keys and v not in valid_chfreqs:
  100. default = self.changefreqs[k]
  101. warning("sitemap plugin: invalid changefreq `{0}'".format(v))
  102. warning("sitemap plugin: setting SITEMAP['changefreqs']"
  103. "['{0}'] on '{1}'".format(k, default))
  104. chfreqs[k] = default
  105. self.changefreqs.update(chfreqs)
  106. elif chfreqs is not None:
  107. warning("sitemap plugin: SITEMAP['changefreqs'] must be a dict")
  108. warning("sitemap plugin: using the default values")
  109. def write_url(self, page, fd):
  110. if getattr(page, 'status', 'published') != 'published':
  111. return
  112. # We can disable categories/authors/etc by using False instead of ''
  113. if not page.save_as:
  114. return
  115. page_path = os.path.join(self.output_path, page.save_as)
  116. if not os.path.exists(page_path):
  117. return
  118. lastdate = getattr(page, 'date', self.now)
  119. try:
  120. lastdate = self.get_date_modified(page, lastdate)
  121. except ValueError:
  122. warning("sitemap plugin: " + page.save_as + " has invalid modification date,")
  123. warning("sitemap plugin: using date value as lastmod.")
  124. lastmod = format_date(lastdate)
  125. if isinstance(page, contents.Article):
  126. pri = self.priorities['articles']
  127. chfreq = self.changefreqs['articles']
  128. elif isinstance(page, contents.Page):
  129. pri = self.priorities['pages']
  130. chfreq = self.changefreqs['pages']
  131. else:
  132. pri = self.priorities['indexes']
  133. chfreq = self.changefreqs['indexes']
  134. pageurl = '' if page.url == 'index.html' else page.url
  135. #Exclude URLs from the sitemap:
  136. if self.format == 'xml':
  137. flag = False
  138. for regstr in self.sitemapExclude:
  139. if re.match(regstr, pageurl):
  140. flag = True
  141. break
  142. if not flag:
  143. fd.write(XML_URL.format(self.siteurl, pageurl, lastmod, chfreq, pri))
  144. else:
  145. fd.write(self.siteurl + '/' + pageurl + '\n')
  146. def get_date_modified(self, page, default):
  147. if hasattr(page, 'modified'):
  148. if isinstance(page.modified, datetime):
  149. return page.modified
  150. return get_date(page.modified)
  151. else:
  152. return default
  153. def set_url_wrappers_modification_date(self, wrappers):
  154. for (wrapper, articles) in wrappers:
  155. lastmod = datetime.min.replace(tzinfo=self.timezone)
  156. for article in articles:
  157. lastmod = max(lastmod, article.date.replace(tzinfo=self.timezone))
  158. try:
  159. modified = self.get_date_modified(article, datetime.min).replace(tzinfo=self.timezone)
  160. lastmod = max(lastmod, modified)
  161. except ValueError:
  162. # Supressed: user will be notified.
  163. pass
  164. setattr(wrapper, 'modified', str(lastmod))
  165. def generate_output(self, writer):
  166. path = os.path.join(self.output_path, 'sitemap.{0}'.format(self.format))
  167. pages = self.context['pages'] + self.context['articles'] \
  168. + [ c for (c, a) in self.context['categories']] \
  169. + [ t for (t, a) in self.context['tags']] \
  170. + [ a for (a, b) in self.context['authors']]
  171. self.set_url_wrappers_modification_date(self.context['categories'])
  172. self.set_url_wrappers_modification_date(self.context['tags'])
  173. self.set_url_wrappers_modification_date(self.context['authors'])
  174. for article in self.context['articles']:
  175. pages += article.translations
  176. info('writing {0}'.format(path))
  177. with open(path, 'w', encoding='utf-8') as fd:
  178. if self.format == 'xml':
  179. fd.write(XML_HEADER)
  180. else:
  181. fd.write(TXT_HEADER.format(self.siteurl))
  182. FakePage = collections.namedtuple('FakePage',
  183. ['status',
  184. 'date',
  185. 'url',
  186. 'save_as'])
  187. for standard_page_url in ['index.html',
  188. 'archives.html',
  189. 'tags.html',
  190. 'categories.html']:
  191. fake = FakePage(status='published',
  192. date=self.now,
  193. url=standard_page_url,
  194. save_as=standard_page_url)
  195. self.write_url(fake, fd)
  196. # add template pages
  197. # We use items for Py3k compat. .iteritems() otherwise
  198. for path, template_page_url in self.context['TEMPLATE_PAGES'].items():
  199. # don't add duplicate entry for index page
  200. if template_page_url == 'index.html':
  201. continue
  202. fake = FakePage(status='published',
  203. date=self.now,
  204. url=template_page_url,
  205. save_as=template_page_url)
  206. self.write_url(fake, fd)
  207. for page in pages:
  208. self.write_url(page, fd)
  209. if self.format == 'xml':
  210. fd.write(XML_FOOTER)
def get_generators(generators):
    # Handler for the `get_generators` signal: hands back the generator class
    # this plugin contributes. The `generators` argument is not used.
    return SitemapGenerator
def register():
    # Plugin entry point: subscribe get_generators to the `get_generators`
    # signal so SitemapGenerator is offered when that signal fires.
    signals.get_generators.connect(get_generators)