Browse Source

Tidies up, removed commented out nltk & lxml alternative versions.

Duncan Lock 11 years ago
parent
commit
546d003682
1 changed files with 14 additions and 36 deletions
  1. 14 36
      post_stats/post_stats.py

+ 14 - 36
post_stats/post_stats.py

@@ -3,76 +3,54 @@
 Post Statistics
 Post Statistics
 ========================
 ========================
 
 
-This plugin calculates various Statistics about a post and stores them in an article.stats disctionary.
+This plugin calculates various Statistics about a post and stores them in an article.stats dictionary:
 
 
 wc: how many words
 wc: how many words
-read_minutes: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
-word_count: frquency count of all the words in the article; can be used for tag/word clouds/
+read_mins: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
+word_counts: frequency count of all the words in the article; can be used for tag/word clouds.
 
 
 """
 """
 
 
 from pelican import signals
 from pelican import signals
-# import math
-
-# import nltk
-
 from bs4 import BeautifulSoup
 from bs4 import BeautifulSoup
-
-# import lxml.html
-# from lxml.html.clean import Cleaner
-
 import re
 import re
 from collections import Counter
 from collections import Counter
 
 
 
 
 def calculate_stats(instance):
 def calculate_stats(instance):
 
 
-    # How fast do average people read?
-    WPM = 250
-
     if instance._content is not None:
     if instance._content is not None:
         stats = {}
         stats = {}
         content = instance._content
         content = instance._content
 
 
-        # print content
+        # How fast do average people read?
+        WPM = 250
+
+        # Pre-process the text to remove entities
         entities = r'\&\#?.+?;'
         entities = r'\&\#?.+?;'
         content = content.replace(' ', ' ')
         content = content.replace(' ', ' ')
         content = re.sub(entities, '', content)
         content = re.sub(entities, '', content)
-        # print content
 
 
         # Pre-process the text to remove punctuation
         # Pre-process the text to remove punctuation
         drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
         drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
         content = content.translate(dict((ord(c), u'') for c in drop))
         content = content.translate(dict((ord(c), u'') for c in drop))
 
 
-        # nltk
-        # raw_text = nltk.clean_html(content)
-
-        # BeautifulSoup
+        # Use BeautifulSoup to get readable/visible text
         raw_text = BeautifulSoup(content).getText()
         raw_text = BeautifulSoup(content).getText()
-        # raw_text = ''.join(BeautifulSoup(content).findAll(text=True))
-
-        # lxml
-        # cleaner = Cleaner(style=True)
-        # html = lxml.html.fromstring(content)
-        # raw_text = cleaner.clean_html(html).text_content()
-
-        # stats['wc'] = len(re.findall(r'\b', raw_text)) >> 1
-
-        # print raw_text
 
 
+        # Count the words in the text
         words = raw_text.lower().split()
         words = raw_text.lower().split()
         word_count = Counter(words)
         word_count = Counter(words)
-        # print word_count
 
 
+        # Return the stats
         stats['word_counts'] = word_count
         stats['word_counts'] = word_count
         stats['wc'] = sum(word_count.values())
         stats['wc'] = sum(word_count.values())
-        # stats['read_minutes'] = math.ceil(float(stats['wc']) / float(WPM))
-        stats['read_minutes'] = (stats['wc'] + WPM - 1) // WPM
-        if stats['read_minutes'] == 0:
-            stats['read_minutes'] = 1
+        # Calculate how long it'll take to read, rounding up
+        stats['read_mins'] = (stats['wc'] + WPM - 1) // WPM
+        if stats['read_mins'] == 0:
+            stats['read_mins'] = 1
 
 
         instance.stats = stats
         instance.stats = stats
-        instance.raw_text = raw_text
 
 
 
 
 def register():
 def register():