post_stats.py 2.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. # -*- coding: utf-8 -*-
  2. """
  3. Post Statistics
  4. ========================
  5. This plugin calculates various statistics about a post and stores them in an article.stats dictionary.
  6. wc: how many words
  7. read_minutes: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
  8. word_counts: frequency count of all the words in the article; can be used for tag/word clouds
  9. """
  10. from pelican import signals, contents
  11. # import nltk
  12. from bs4 import BeautifulSoup
  13. # import lxml.html
  14. # from lxml.html.clean import Cleaner
  15. import re
  16. from collections import Counter
  17. def calculate_stats(instance):
  18. WPM = 250
  19. if instance._content is not None:
  20. stats = {}
  21. content = instance._content
  22. # print content
  23. entities = r'\&\#?.+?;'
  24. content = content.replace(' ', ' ')
  25. content = re.sub(entities, '', content)
  26. # print content
  27. # Pre-process the text to remove punctuation
  28. drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
  29. content = content.translate(dict((ord(c), u'') for c in drop))
  30. # nltk
  31. # raw_text = nltk.clean_html(content)
  32. # BeautifulSoup
  33. raw_text = BeautifulSoup(content).getText()
  34. # raw_text = ''.join(BeautifulSoup(content).findAll(text=True))
  35. # lxml
  36. # cleaner = Cleaner(style=True)
  37. # html = lxml.html.fromstring(content)
  38. # raw_text = cleaner.clean_html(html).text_content()
  39. # stats['wc'] = len(re.findall(r'\b', raw_text)) >> 1
  40. # print raw_text
  41. words = raw_text.lower().split()
  42. word_count = Counter(words)
  43. # print word_count
  44. stats['word_counts'] = word_count
  45. stats['wc'] = sum(word_count.values())
  46. stats['read_minutes'] = (stats['wc'] + WPM // 2) // WPM
  47. instance.stats = stats
  48. instance.raw_text = raw_text
  49. def register():
  50. signals.content_object_init.connect(calculate_stats)