extract_toc.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960
  1. # -*- coding: utf-8 -*-
  2. """
  3. Extract Table of Content
  4. ========================
  5. A Pelican plugin to extract table of contents (ToC) from `article.content` and
  6. place it in its own `article.toc` variable for use in templates.
  7. """
  8. from os import path
  9. from bs4 import BeautifulSoup
  10. from pelican import signals, readers, contents
  11. try:
  12. from pandoc_reader import PandocReader
  13. except ImportError:
  14. PandocReader = False
  15. def extract_toc(content):
  16. if isinstance(content, contents.Static):
  17. return
  18. soup = BeautifulSoup(content._content,'html.parser')
  19. filename = content.source_path
  20. extension = path.splitext(filename)[1][1:]
  21. toc = None
  22. # default Markdown reader
  23. if not toc and readers.MarkdownReader.enabled and extension in readers.MarkdownReader.file_extensions:
  24. toc = soup.find('div', class_='toc')
  25. if toc: toc.extract()
  26. # default reStructuredText reader
  27. if not toc and readers.RstReader.enabled and extension in readers.RstReader.file_extensions:
  28. toc = soup.find('div', class_='contents topic')
  29. if toc: toc.extract()
  30. if toc:
  31. tag=BeautifulSoup(str(toc), 'html.parser')
  32. tag.div['class']='toc'
  33. tag.div['id']=''
  34. p=tag.find('p', class_='topic-title first')
  35. if p:p.extract()
  36. toc=tag
  37. # Pandoc reader (markdown and other formats)
  38. if not toc and PandocReader and PandocReader.enabled and extension in PandocReader.file_extensions:
  39. toc = soup.find('nav', id='TOC')
  40. if toc:
  41. toc.extract()
  42. content._content = soup.decode()
  43. content.toc = toc.decode()
  44. if content.toc.startswith('<html>'):
  45. content.toc = content.toc[12:-14]
  46. def register():
  47. signals.content_object_init.connect(extract_toc)