Browse Source

Always specify parser for Beautiful Soup; fixes extraneous tags

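The bug that triggered this for me was an extra <html><body> wrapping
the output of page.content. When no parser is named, Beautiful Soup picks
the best one installed (lxml, then html5lib), and those parsers complete a
fragment into a full document; 'html.parser' leaves fragments as they are.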

Thanks to Avaris and winlu in #pelican for helping track down the root cause of
this bug.
Alex Waite 8 years ago
parent
commit
873c525b46

+ 1 - 1
better_figures_and_images/better_figures_and_images.py

@@ -28,7 +28,7 @@ def content_object_init(instance):
 
     if instance._content is not None:
         content = instance._content
-        soup = BeautifulSoup(content)
+        soup = BeautifulSoup(content, 'html.parser')
 
         if 'img' in content:
             for img in soup('img'):

+ 1 - 1
extract_toc/extract_toc.py

@@ -29,7 +29,7 @@ def extract_toc(content):
         toc = soup.find('div', class_='contents topic')
         if toc: toc.extract()
         if toc:
-            tag=BeautifulSoup(str(toc))
+            tag=BeautifulSoup(str(toc), 'html.parser')
             tag.div['class']='toc'
             tag.div['id']=''
             p=tag.find('p', class_='topic-title first')

+ 1 - 1
post_stats/post_stats.py

@@ -31,7 +31,7 @@ def calculate_stats(instance):
         WPM = 250
 
         # Use BeautifulSoup to get readable/visible text
-        raw_text = BeautifulSoup(content).getText()
+        raw_text = BeautifulSoup(content, 'html.parser').getText()
 
         # Process the text to remove entities
         entities = r'\&\#?.+?;'

+ 1 - 1
slim/slim.py

@@ -75,7 +75,7 @@ def get_writer(sender):
                 if ('SLIM_OPTIONS' in self.settings and
                         'PRETTYIFY' in self.settings['SLIM_OPTIONS'] and
                         self.settings['SLIM_OPTIONS']['PRETTYIFY']):
-                    output = bs(output).prettify() # prettify the html
+                    output = bs(output, 'html.parser').prettify() # prettify the html
                 else:
                     output = minify(output) # minify the html
                 return output

+ 2 - 2
tipue_search/tipue_search.py

@@ -40,10 +40,10 @@ class Tipue_Search_JSON_Generator(object):
         if getattr(page, 'status', 'published') != 'published':
             return
 
-        soup_title = BeautifulSoup(page.title.replace('&nbsp;', ' '))
+        soup_title = BeautifulSoup(page.title.replace('&nbsp;', ' '), 'html.parser')
         page_title = soup_title.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('^', '&#94;')
 
-        soup_text = BeautifulSoup(page.content)
+        soup_text = BeautifulSoup(page.content, 'html.parser')
         page_text = soup_text.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('¶', ' ').replace('^', '&#94;')
         page_text = ' '.join(page_text.split())