Browse Source

Merge pull request #886 from aquinzi/interlinks-fixes

Interlinks: Improve HTML tag handling. Fixes #885
Justin Mayer 7 years ago
parent
commit
e4c2f1691a
1 changed file with 17 additions and 9 deletions
  1. 17 9
      interlinks/interlinks.py

+ 17 - 9
interlinks/interlinks.py

@@ -9,6 +9,7 @@ This plugin allows you to include "interwiki" or shortcuts links into the blog,
 """
 
 from bs4 import BeautifulSoup
+from bs4 import SoupStrainer
 from pelican import signals
 import re
 
@@ -23,32 +24,39 @@ def getSettings (generator):
 		for key, value in generator.settings['INTERLINKS'].items():
 			interlinks[key] = value
 
-def content_object_init(instance):
+			
+def parse_links(instance):
 
 	if instance._content is not None:
 		content = instance._content
-		# use Python's built-in parser so no duplicated html & body tags appear, or use tag.unwrap()
-		text = BeautifulSoup(content, "html.parser")
 		
-		if 'a' in content:
-			for link in text.find_all(href=re.compile("(.+?)>")):
+		if '<a' in content:
+			text = BeautifulSoup(content, "html.parser", parse_only=SoupStrainer("a"))
+			for link in text.find_all("a",href=re.compile("(.+?)>")):
+				old_tag = str(link)
 				url = link.get('href')
 				m = re.search(r"(.+?)>", url).groups()
 				name = m[0]
 				if name in interlinks:
-					hi = url.replace(name+">",interlinks[name])
+					hi = url.replace(name + ">", interlinks[name])
 					link['href'] = hi
-		if 'img' in content:
+				
+				content = content.replace(old_tag, str(link))
+
+		if '<img' in content:
+			text = BeautifulSoup(content, "html.parser", parse_only=SoupStrainer("img"))
 			for img in text.find_all('img', src=re.compile("(.+?)>")):
+				old_tag = str(img)
 				url = img.get('src')
 				m = re.search(r"(.+?)>", url).groups()
 				name = m[0]
 				if name in interlinks:
 					hi = url.replace(name+">",interlinks[name])
 					img['src'] = hi
+				content = content.replace(old_tag, str(img))  # write back this <img>, not the stale `link` left over from the <a> loop
 
-		instance._content = text.decode()
+		instance._content = content
 
 def register():
 	signals.generator_init.connect(getSettings)
-	signals.content_object_init.connect(content_object_init)
+	signals.content_object_init.connect(parse_links)