|
@@ -0,0 +1,165 @@
|
|
|
+#! python3.6
|
|
|
+"""
|
|
|
+Export Comments from BLogger XML
|
|
|
+
|
|
|
+Takes in a Blogger export XML file and spits out each comment in a seperate
|
|
|
+file, such that can be used with the [Pelican Comment System]
|
|
|
+(https://bernhard.scheirle.de/posts/2014/March/29/static-comments-via-email/).
|
|
|
+
|
|
|
+May be simple to extend to export posts as well.
|
|
|
+
|
|
|
+For a more detailed desciption, read my blog post at
|
|
|
+ http://blog.minchin.ca/2016/12/blogger-comments-exported.html
|
|
|
+
|
|
|
+Author: Wm. Minchin -- minchinweb@gmail.com
|
|
|
+License: MIT
|
|
|
+Changes:
|
|
|
+
|
|
|
+ - 2016.12.29 -- initial release
|
|
|
+ - 2017.01.10 -- clean-up for addition in Pelican Comment System repo
|
|
|
+"""
|
|
|
+
|
|
|
+from pathlib import Path
|
|
|
+
|
|
|
+import untangle
|
|
|
+
|
|
|
+###############################################################################
|
|
|
+# Constants #
|
|
|
+###############################################################################
|
|
|
+
|
|
|
+BLOGGER_EXPORT = r'c:\tmp\blog.xml'
|
|
|
+COMMENTS_DIR = 'comments'
|
|
|
+COMMENT_EXT = '.md'
|
|
|
+AUTHORS_FILENAME = 'authors.txt'
|
|
|
+
|
|
|
+###############################################################################
|
|
|
+# Main Code Body #
|
|
|
+###############################################################################
|
|
|
+
|
|
|
+authors_and_pics = []
|
|
|
+
|
|
|
+
|
|
|
+def main():
|
|
|
+ obj = untangle.parse(BLOGGER_EXPORT)
|
|
|
+
|
|
|
+ templates = 0
|
|
|
+ posts = 0
|
|
|
+ comments = 0
|
|
|
+ settings = 0
|
|
|
+ others = 0
|
|
|
+
|
|
|
+ for entry in obj.feed.entry:
|
|
|
+ try:
|
|
|
+ full_type = entry.category['term']
|
|
|
+ except TypeError:
|
|
|
+ # if a post is under multiple categories
|
|
|
+ for my_category in entry.category:
|
|
|
+ full_type = my_category['term']
|
|
|
+ # str.find() uses a return of `-1` to denote failure
|
|
|
+ if full_type.find('#') != -1:
|
|
|
+ break
|
|
|
+ else:
|
|
|
+ others += 1
|
|
|
+
|
|
|
+ simple_type = full_type[full_type.find('#')+1:]
|
|
|
+
|
|
|
+ if 'settings' == simple_type:
|
|
|
+ settings += 1
|
|
|
+ elif 'post' == simple_type:
|
|
|
+ posts += 1
|
|
|
+ # process posts here
|
|
|
+ elif 'comment' == simple_type:
|
|
|
+ comments += 1
|
|
|
+ process_comment(entry, obj)
|
|
|
+ elif 'template' == simple_type:
|
|
|
+ templates += 1
|
|
|
+ else:
|
|
|
+ others += 1
|
|
|
+
|
|
|
+ export_authors()
|
|
|
+
|
|
|
+ print('''
|
|
|
+ {} template
|
|
|
+ {} posts (including drafts)
|
|
|
+ {} comments
|
|
|
+ {} settings
|
|
|
+ {} other entries'''.format(templates,
|
|
|
+ posts,
|
|
|
+ comments,
|
|
|
+ settings,
|
|
|
+ others))
|
|
|
+
|
|
|
+
|
|
|
+def process_comment(entry, obj):
|
|
|
+ # e.g. "tag:blogger.com,1999:blog-26967745.post-4115122471434984978"
|
|
|
+ comment_id = entry.id.cdata
|
|
|
+ # in ISO 8601 format, usable as is
|
|
|
+ comment_published = entry.published.cdata
|
|
|
+ comment_body = entry.content.cdata
|
|
|
+ comment_post_id = entry.thr_in_reply_to['ref']
|
|
|
+ comment_author = entry.author.name.cdata
|
|
|
+ comment_author_pic = entry.author.gd_image['src']
|
|
|
+ comment_author_email = entry.author.email.cdata
|
|
|
+
|
|
|
+ # add author and pic to global list
|
|
|
+ global authors_and_pics
|
|
|
+ authors_and_pics.append((comment_author, comment_author_pic))
|
|
|
+
|
|
|
+ # use this for a filename for the comment
|
|
|
+ # e.g. "4115122471434984978"
|
|
|
+ comment_short_id = comment_id[comment_id.find('post-')+5:]
|
|
|
+
|
|
|
+ comment_text = "date: {}\nauthor: {}\nemail: {}\n\n{}\n"\
|
|
|
+ .format(comment_published,
|
|
|
+ comment_author,
|
|
|
+ comment_author_email,
|
|
|
+ comment_body)
|
|
|
+
|
|
|
+ # article
|
|
|
+ for entry in obj.feed.entry:
|
|
|
+ entry_id = entry.id.cdata
|
|
|
+ if entry_id == comment_post_id:
|
|
|
+ article_entry = entry
|
|
|
+ break
|
|
|
+ else:
|
|
|
+ print("No matching article for comment", comment_id, comment_post_id)
|
|
|
+ # don't process comment further
|
|
|
+ return
|
|
|
+
|
|
|
+ # article slug
|
|
|
+ for link in article_entry.link:
|
|
|
+ if link['rel'] == 'alternate':
|
|
|
+ article_link = link['href']
|
|
|
+ break
|
|
|
+ else:
|
|
|
+ article_title = article_entry.title.cdata
|
|
|
+ print('Could not find slug for', article_title)
|
|
|
+ article_link = article_title.lower().replace(' ', '-')
|
|
|
+
|
|
|
+ article_slug = article_link[article_link.rfind('/')+1:
|
|
|
+ article_link.find('.html')]
|
|
|
+
|
|
|
+ comment_filename = Path(COMMENTS_DIR).resolve()
|
|
|
+ # folder; if it doesn't exist, create it
|
|
|
+ comment_filename = comment_filename / article_slug
|
|
|
+ comment_filename.mkdir(parents=True, exist_ok=True)
|
|
|
+ # write the comment file
|
|
|
+ comment_filename = comment_filename / (comment_short_id + COMMENT_EXT)
|
|
|
+ comment_filename.write_text(comment_text)
|
|
|
+
|
|
|
+
|
|
|
+def export_authors():
|
|
|
+ to_export = set(authors_and_pics)
|
|
|
+ to_export = list(to_export)
|
|
|
+ to_export.sort()
|
|
|
+
|
|
|
+ str_export = ''
|
|
|
+ for i in to_export:
|
|
|
+ str_export += (i[0] + '\t\t' + i[1] + '\n')
|
|
|
+
|
|
|
+ authors_filename = Path(COMMENTS_DIR).resolve() / AUTHORS_FILENAME
|
|
|
+ authors_filename.write_text(str_export)
|
|
|
+
|
|
|
+
|
|
|
+if __name__ == "__main__":
|
|
|
+ main()
|