post_stats.py 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. # -*- coding: utf-8 -*-
  2. """
  3. Post Statistics
  4. ========================
  5. This plugin calculates various statistics about a post and stores them in an article.stats dictionary:
  6. wc: how many words
  7. read_mins: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
  8. word_counts: frequency count of all the words in the article; can be used for tag/word clouds
  9. fi: Flesch-Kincaid Index / Reading Ease
  10. fk: Flesch-Kincaid Grade Level
  11. """
  12. from pelican import signals
  13. from bs4 import BeautifulSoup
  14. import re
  15. from collections import Counter
  16. from .readability import *
  17. def calculate_stats(instance):
  18. if instance._content is not None:
  19. stats = {}
  20. content = instance._content
  21. # How fast do average people read?
  22. WPM = 250
  23. # Use BeautifulSoup to get readable/visible text
  24. raw_text = BeautifulSoup(content, 'html.parser').getText()
  25. # Process the text to remove entities
  26. entities = r'\&\#?.+?;'
  27. raw_text = raw_text.replace(' ', ' ')
  28. raw_text = re.sub(entities, '', raw_text)
  29. # Flesch-kincaid readbility stats counts sentances,
  30. # so save before removing punctuation
  31. tmp = raw_text
  32. # Process the text to remove punctuation
  33. drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
  34. raw_text = raw_text.translate(dict((ord(c), u'') for c in drop))
  35. # Count the words in the text
  36. words = raw_text.lower().split()
  37. word_count = Counter(words)
  38. # Return the stats
  39. stats['word_counts'] = word_count
  40. stats['wc'] = sum(word_count.values())
  41. # Calulate how long it'll take to read, rounding up
  42. stats['read_mins'] = (stats['wc'] + WPM - 1) // WPM
  43. if stats['read_mins'] == 0:
  44. stats['read_mins'] = 1
  45. # Calculate Flesch-kincaid readbility stats
  46. readability_stats = stcs, words, sbls = text_stats(tmp, stats['wc'])
  47. stats['fi'] = "{:.2f}".format(flesch_index(readability_stats))
  48. stats['fk'] = "{:.2f}".format(flesch_kincaid_level(readability_stats))
  49. instance.stats = stats
  50. def register():
  51. signals.content_object_init.connect(calculate_stats)