post_stats.py 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
  1. # -*- coding: utf-8 -*-
  2. """
  3. Post Statistics
  4. ========================
  5. This plugin calculates various statistics about a post and stores them in an article.stats dictionary:
  6. wc: how many words
  7. read_mins: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
  8. word_counts: frequency count of all the words in the article; can be used for tag/word clouds.
  9. """
  10. from pelican import signals
  11. from bs4 import BeautifulSoup
  12. import re
  13. from collections import Counter
  14. def calculate_stats(instance):
  15. if instance._content is not None:
  16. stats = {}
  17. content = instance._content
  18. # How fast do average people read?
  19. WPM = 250
  20. # Pre-process the text to remove entities
  21. entities = r'\&\#?.+?;'
  22. content = content.replace(' ', ' ')
  23. content = re.sub(entities, '', content)
  24. # Pre-process the text to remove punctuation
  25. drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
  26. content = content.translate(dict((ord(c), u'') for c in drop))
  27. # Use BeautifulSoup to get readable/visible text
  28. raw_text = BeautifulSoup(content).getText()
  29. # Count the words in the text
  30. words = raw_text.lower().split()
  31. word_count = Counter(words)
  32. # Return the stats
  33. stats['word_counts'] = word_count
  34. stats['wc'] = sum(word_count.values())
  35. # Calulate how long it'll take to read, rounding up
  36. stats['read_mins'] = (stats['wc'] + WPM - 1) // WPM
  37. if stats['read_mins'] == 0:
  38. stats['read_mins'] = 1
  39. instance.stats = stats
  40. def register():
  41. signals.content_object_init.connect(calculate_stats)