post_stats.py 2.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. # -*- coding: utf-8 -*-
  2. """
  3. Post Statistics
  4. ========================
  5. This plugin calculates various statistics about a post and stores them in an article.stats dictionary.
  6. wc: how many words
  7. read_minutes: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
  8. word_counts: frequency count of all the words in the article; can be used for tag/word clouds.
  9. """
  10. from pelican import signals
  11. # import math
  12. # import nltk
  13. from bs4 import BeautifulSoup
  14. # import lxml.html
  15. # from lxml.html.clean import Cleaner
  16. import re
  17. from collections import Counter
  18. def calculate_stats(instance):
  19. # How fast do average people read?
  20. WPM = 250
  21. if instance._content is not None:
  22. stats = {}
  23. content = instance._content
  24. # print content
  25. entities = r'\&\#?.+?;'
  26. content = content.replace(' ', ' ')
  27. content = re.sub(entities, '', content)
  28. # print content
  29. # Pre-process the text to remove punctuation
  30. drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
  31. content = content.translate(dict((ord(c), u'') for c in drop))
  32. # nltk
  33. # raw_text = nltk.clean_html(content)
  34. # BeautifulSoup
  35. raw_text = BeautifulSoup(content).getText()
  36. # raw_text = ''.join(BeautifulSoup(content).findAll(text=True))
  37. # lxml
  38. # cleaner = Cleaner(style=True)
  39. # html = lxml.html.fromstring(content)
  40. # raw_text = cleaner.clean_html(html).text_content()
  41. # stats['wc'] = len(re.findall(r'\b', raw_text)) >> 1
  42. # print raw_text
  43. words = raw_text.lower().split()
  44. word_count = Counter(words)
  45. # print word_count
  46. stats['word_counts'] = word_count
  47. stats['wc'] = sum(word_count.values())
  48. # stats['read_minutes'] = math.ceil(float(stats['wc']) / float(WPM))
  49. stats['read_minutes'] = (stats['wc'] + WPM - 1) // WPM
  50. instance.stats = stats
  51. instance.raw_text = raw_text
  52. def register():
  53. signals.content_object_init.connect(calculate_stats)