#! python3.6
"""
Export Comments from Blogger XML

Takes in a Blogger export XML file and spits out each comment in a separate
file, such that they can be used with the [Pelican Comment System]
(https://bernhard.scheirle.de/posts/2014/March/29/static-comments-via-email/).

May be simple to extend to export posts as well.

For a more detailed description, read my blog post at
    http://blog.minchin.ca/2016/12/blogger-comments-exported.html

Author: Wm. Minchin -- minchinweb@gmail.com
License: MIT
Changes:

 - 2016.12.29 -- initial release
 - 2017.01.10 -- clean-up for addition in Pelican Comment System repo
"""
from pathlib import Path

import untangle
###############################################################################
# Constants                                                                   #
###############################################################################

# Path of the Blogger export XML file that main() parses.
BLOGGER_EXPORT = r'c:\tmp\blog.xml'
# Output folder; it is resolved with Path.resolve() and receives one
# sub-folder per article slug plus the authors file.
COMMENTS_DIR = 'comments'
# Extension appended to each exported comment's filename.
COMMENT_EXT = '.md'
# Name of the author / picture listing written inside COMMENTS_DIR.
AUTHORS_FILENAME = 'authors.txt'

###############################################################################
# Main Code Body                                                              #
###############################################################################

# (author name, author picture) pairs appended by process_comment() and
# de-duplicated and written out by export_authors().
authors_and_pics = []
def main():
    """Export every comment in the Blogger XML file and print a tally.

    Parses ``BLOGGER_EXPORT`` with untangle and classifies each feed entry by
    the text after the ``#`` in its category term. Entries of kind
    ``comment`` are passed to ``process_comment()``; ``settings``, ``post``
    and ``template`` entries are only counted, and everything else is counted
    as "other". Afterwards ``export_authors()`` is called and the counts are
    printed to stdout.
    """
    obj = untangle.parse(BLOGGER_EXPORT)

    counts = {'template': 0, 'post': 0, 'comment': 0, 'settings': 0}
    others = 0

    for entry in obj.feed.entry:
        simple_type = _entry_kind(entry)
        if simple_type in counts:
            counts[simple_type] += 1
            if simple_type == 'comment':
                process_comment(entry, obj)
            # posts are only counted for now; process them here if needed
        else:
            # unknown kinds, and entries without a usable category term, are
            # counted exactly once
            others += 1

    export_authors()

    print('\n'.join([
        '',
        '{} template'.format(counts['template']),
        '{} posts (including drafts)'.format(counts['post']),
        '{} comments'.format(counts['comment']),
        '{} settings'.format(counts['settings']),
        '{} other entries'.format(others),
    ]))


def _entry_kind(entry):
    """Return the text after '#' in *entry*'s category term, or None.

    None is returned when the entry carries several categories and none of
    their terms contains a '#'.
    """
    try:
        full_type = entry.category['term']
    except TypeError:
        # entry.category could not be indexed by a string; per the original
        # author's note this happens when a post is under multiple
        # categories, so look through each of them instead
        for category in entry.category:
            term = category['term']
            # the first term containing '#' names the entry's kind
            if '#' in term:
                return term[term.find('#') + 1:]
        return None
    # str.find() returns -1 when there is no '#', so the slice then keeps
    # the whole term
    return full_type[full_type.find('#') + 1:]
def process_comment(entry, obj):
    """Write one Blogger comment *entry* to its own file under COMMENTS_DIR.

    The file is ``COMMENTS_DIR/<article slug>/<comment id><COMMENT_EXT>`` and
    contains ``date``, ``author`` and ``email`` header lines, a blank line,
    and then the comment body. The author's name and picture are also
    appended to the module-level ``authors_and_pics`` list.

    Args:
        entry: the comment's entry element from the parsed export.
        obj: the whole parsed export; its entries are searched for the
            article the comment replies to.

    Returns:
        None. When no entry in *obj* has the id the comment replies to, a
        message is printed and no file is written (the author has already
        been recorded by then).
    """
    # e.g. "tag:blogger.com,1999:blog-26967745.post-4115122471434984978"
    comment_id = entry.id.cdata
    # in ISO 8601 format, usable as is
    comment_published = entry.published.cdata
    comment_body = entry.content.cdata
    comment_post_id = entry.thr_in_reply_to['ref']
    comment_author = entry.author.name.cdata
    comment_author_pic = entry.author.gd_image['src']
    comment_author_email = entry.author.email.cdata

    # append() mutates the shared list in place, so no `global` is required
    authors_and_pics.append((comment_author, comment_author_pic))

    # the part after "post-" becomes the filename,
    # e.g. "4115122471434984978"; fall back to the whole id when the marker
    # is missing rather than slicing at an arbitrary offset
    _, marker, after_marker = comment_id.partition('post-')
    comment_short_id = after_marker if marker else comment_id

    comment_text = "date: {}\nauthor: {}\nemail: {}\n\n{}\n".format(
        comment_published,
        comment_author,
        comment_author_email,
        comment_body)

    # article: the entry whose id is the one this comment replies to
    article_entry = next(
        (candidate for candidate in obj.feed.entry
         if candidate.id.cdata == comment_post_id),
        None)
    if article_entry is None:
        print("No matching article for comment", comment_id, comment_post_id)
        # don't process comment further
        return

    # article slug: taken from the 'alternate' link, else built from the title
    for link in article_entry.link:
        if link['rel'] == 'alternate':
            article_link = link['href']
            break
    else:
        article_title = article_entry.title.cdata
        print('Could not find slug for', article_title)
        article_link = article_title.lower().replace(' ', '-')

    # keep what follows the last '/', then cut at '.html' only if it is
    # present (str.find() returns -1 otherwise, which as a slice end would
    # drop the final character)
    article_slug = article_link[article_link.rfind('/') + 1:]
    html_at = article_slug.find('.html')
    if html_at != -1:
        article_slug = article_slug[:html_at]

    # folder; if it doesn't exist, create it
    comment_dir = Path(COMMENTS_DIR).resolve() / article_slug
    comment_dir.mkdir(parents=True, exist_ok=True)

    # write the comment file; an explicit encoding keeps non-ASCII comments
    # from failing under a narrower locale default
    comment_file = comment_dir / (comment_short_id + COMMENT_EXT)
    comment_file.write_text(comment_text, encoding='utf-8')
def export_authors():
    """Write the unique author / picture pairs to the authors file.

    Reads the module-level ``authors_and_pics`` list, drops duplicate pairs,
    sorts them, and writes one ``author<TAB><TAB>picture`` line per pair to
    ``COMMENTS_DIR/AUTHORS_FILENAME``.
    """
    authors_dir = Path(COMMENTS_DIR).resolve()
    # the folder is otherwise only created when a comment file is written,
    # so it may not exist yet
    authors_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): a picture of None is treated as an empty string so that
    # sorting and formatting cannot fail -- confirm whether the parser can
    # actually yield None for a missing `src` attribute
    unique_pairs = sorted(
        set(authors_and_pics),
        key=lambda pair: (pair[0], pair[1] or ''))

    str_export = ''.join(
        '{}\t\t{}\n'.format(author, pic or '')
        for author, pic in unique_pairs)

    authors_filename = authors_dir / AUTHORS_FILENAME
    authors_filename.write_text(str_export, encoding='utf-8')
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|