extract_toc.py 1.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
  1. # -*- coding: utf-8 -*-
  2. """
  3. Extract Table of Content
  4. ========================
  5. A Pelican plugin to extract table of contents (ToC) from `article.content` and
  6. place it in its own `article.toc` variable for use in templates.
  7. """
  8. from os import path
  9. from bs4 import BeautifulSoup
  10. from pelican import signals, readers, contents
  11. def extract_toc(content):
  12. if isinstance(content, contents.Static):
  13. return
  14. soup = BeautifulSoup(content._content,'html.parser')
  15. filename = content.source_path
  16. extension = path.splitext(filename)[1][1:]
  17. toc = None
  18. # if it is a Markdown file
  19. if extension in readers.MarkdownReader.file_extensions:
  20. toc = soup.find('div', class_='toc')
  21. if toc: toc.extract()
  22. # else if it is a reST file
  23. elif extension in readers.RstReader.file_extensions:
  24. toc = soup.find('div', class_='contents topic')
  25. if toc: toc.extract()
  26. if toc:
  27. tag=BeautifulSoup(str(toc), 'html.parser')
  28. tag.div['class']='toc'
  29. tag.div['id']=''
  30. p=tag.find('p', class_='topic-title first')
  31. if p:p.extract()
  32. toc=tag
  33. elif not toc: # Pandoc reader
  34. toc = soup.find('nav', id='TOC')
  35. if toc:
  36. toc.extract()
  37. content._content = soup.decode()
  38. content.toc = toc.decode()
  39. if content.toc.startswith('<html>'):
  40. content.toc = content.toc[12:-14]
  41. def register():
  42. signals.content_object_init.connect(extract_toc)