# readtime.py -- Pelican plugin that estimates the reading time of content.

  1. import re
  2. import math
  3. from pelican import signals
  4. from html.parser import HTMLParser #use html.parser for Python 3.6
# Assumed reading speed in words per minute; the word count of a content
# object is divided by this value to obtain its reading time in minutes.
# Reference: http://en.wikipedia.org/wiki/Words_per_minute
WPM = 230.0
  7. class MLStripper(HTMLParser):
  8. def __init__(self):
  9. super().__init__() # subclassing HTMLParser, also need to calling
  10. # super class's '__init__' method
  11. self.reset()
  12. self.fed = []
  13. #this method is called whenever a 'data' is encountered.
  14. def handle_data(self, d):
  15. self.fed.append(d)
  16. # join all content word into one long sentence for further processing
  17. def get_data(self):
  18. return ''.join(self.fed)
  19. def strip_tags(html):
  20. s = MLStripper()
  21. s.feed(html) # Feed the class with html content, get the fed list
  22. return s.get_data()
  23. def calculate_readtime(content_object):
  24. if content_object._content is not None:
  25. content = content_object._content # get the content html from Pelican
  26. text = strip_tags(content) #strip tags and get long sentence
  27. words = re.split(r'[^0-9A-Za-z]+', text) # split the long sentence into list of words
  28. num_words = len(words) # count the words
  29. minutes = int(math.ceil(num_words / WPM)) #calculate the minutes
  30. #set minimum read time to 1 minutes.
  31. if minutes == 0:
  32. minutes = 1
  33. content_object.readtime = {
  34. "minutes": minutes,
  35. }
  36. def register():
  37. signals.content_object_init.connect(calculate_readtime) # connect with 'content_object_init' signal.