# -*- coding: utf-8 -*- ''' Sitemap ------- The sitemap plugin generates plain-text or XML sitemaps. ''' from __future__ import unicode_literals import collections import os.path from datetime import datetime from logging import warning, info from codecs import open from pelican import signals, contents from pelican.utils import get_date TXT_HEADER = """{0}/index.html {0}/archives.html {0}/tags.html {0}/categories.html """ XML_HEADER = """ """ XML_URL = """ {0}/{1} {2} {3} {4} """ XML_FOOTER = """ """ def format_date(date): if date.tzinfo: tz = date.strftime('%s') tz = tz[:-2] + ':' + tz[-2:] else: tz = "-00:00" return date.strftime("%Y-%m-%dT%H:%M:%S") + tz class SitemapGenerator(object): def __init__(self, context, settings, path, theme, output_path, *null): self.output_path = output_path self.context = context self.now = datetime.now() self.siteurl = settings.get('SITEURL') self.format = 'xml' self.changefreqs = { 'articles': 'monthly', 'indexes': 'daily', 'pages': 'monthly' } self.priorities = { 'articles': 0.5, 'indexes': 0.5, 'pages': 0.5 } config = settings.get('SITEMAP', {}) if not isinstance(config, dict): warning("sitemap plugin: the SITEMAP setting must be a dict") else: fmt = config.get('format') pris = config.get('priorities') chfreqs = config.get('changefreqs') if fmt not in ('xml', 'txt'): warning("sitemap plugin: SITEMAP['format'] must be `txt' or `xml'") warning("sitemap plugin: Setting SITEMAP['format'] on `xml'") elif fmt == 'txt': self.format = fmt return valid_keys = ('articles', 'indexes', 'pages') valid_chfreqs = ('always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never') if isinstance(pris, dict): # We use items for Py3k compat. .iteritems() otherwise for k, v in pris.items(): if k in valid_keys and not isinstance(v, (int, float)): default = self.priorities[k] warning("sitemap plugin: priorities must be numbers") warning("sitemap plugin: setting SITEMAP['priorities']" "['{0}'] on {1}".format(k, default)) pris[k] = default self.priorities.update(pris) elif pris is not None: warning("sitemap plugin: SITEMAP['priorities'] must be a dict") warning("sitemap plugin: using the default values") if isinstance(chfreqs, dict): # .items() for py3k compat. for k, v in chfreqs.items(): if k in valid_keys and v not in valid_chfreqs: default = self.changefreqs[k] warning("sitemap plugin: invalid changefreq `{0}'".format(v)) warning("sitemap plugin: setting SITEMAP['changefreqs']" "['{0}'] on '{1}'".format(k, default)) chfreqs[k] = default self.changefreqs.update(chfreqs) elif chfreqs is not None: warning("sitemap plugin: SITEMAP['changefreqs'] must be a dict") warning("sitemap plugin: using the default values") def write_url(self, page, fd): if getattr(page, 'status', 'published') != 'published': return page_path = os.path.join(self.output_path, page.url) if not os.path.exists(page_path): return lastdate = getattr(page, 'date', self.now) try: lastdate = self.get_date_modified(page, lastdate) except ValueError: warning("sitemap plugin: " + page.url + " has invalid modification date,") warning("sitemap plugin: using date value as lastmod.") lastmod = format_date(lastdate) if isinstance(page, contents.Article): pri = self.priorities['articles'] chfreq = self.changefreqs['articles'] elif isinstance(page, contents.Page): pri = self.priorities['pages'] chfreq = self.changefreqs['pages'] else: pri = self.priorities['indexes'] chfreq = self.changefreqs['indexes'] if self.format == 'xml': fd.write(XML_URL.format(self.siteurl, page.url, lastmod, chfreq, pri)) else: fd.write(self.siteurl + '/' + loc + '\n') def get_date_modified(self, page, defalut): if hasattr(page, 'modified'): return get_date(getattr(page, 'modified')) else: return defalut def set_url_wrappers_modification_date(self, wrappers): for (wrapper, articles) in wrappers: lastmod = datetime.min for article in articles: lastmod = max(lastmod, article.date) try: modified = self.get_date_modified(article, datetime.min); lastmod = max(lastmod, modified) except ValueError: # Supressed: user will be notified. pass setattr(wrapper, 'modified', str(lastmod)) def generate_output(self, writer): path = os.path.join(self.output_path, 'sitemap.{0}'.format(self.format)) pages = self.context['pages'] + self.context['articles'] \ + [ c for (c, a) in self.context['categories']] \ + [ t for (t, a) in self.context['tags']] \ + [ a for (a, b) in self.context['authors']] self.set_url_wrappers_modification_date(self.context['categories']) self.set_url_wrappers_modification_date(self.context['tags']) self.set_url_wrappers_modification_date(self.context['authors']) for article in self.context['articles']: pages += article.translations info('writing {0}'.format(path)) with open(path, 'w', encoding='utf-8') as fd: if self.format == 'xml': fd.write(XML_HEADER) else: fd.write(TXT_HEADER.format(self.siteurl)) FakePage = collections.namedtuple('FakePage', ['status', 'date', 'url']) for standard_page_url in ['index.html', 'archives.html', 'tags.html', 'categories.html']: fake = FakePage(status='published', date=self.now, url=standard_page_url) self.write_url(fake, fd) for page in pages: self.write_url(page, fd) if self.format == 'xml': fd.write(XML_FOOTER) def get_generators(generators): return SitemapGenerator def register(): signals.get_generators.connect(get_generators)