blogger_comment_export.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. #! python3.6
  2. """
  3. Export Comments from Blogger XML
  4. Takes in a Blogger export XML file and spits out each comment in a separate
  5. file, so that they can be used with the [Pelican Comment System]
  6. (https://bernhard.scheirle.de/posts/2014/March/29/static-comments-via-email/).
  7. May be simple to extend to export posts as well.
  8. For a more detailed description, read my blog post at
  9. http://blog.minchin.ca/2016/12/blogger-comments-exported.html
  10. Author: Wm. Minchin -- minchinweb@gmail.com
  11. License: MIT
  12. Changes:
  13. - 2016.12.29 -- initial release
  14. - 2017.01.10 -- clean-up for addition in Pelican Comment System repo
  15. """
  16. from pathlib import Path
  17. import untangle
  # ---------------------------------------------------------------------------
  # Configuration
  # ---------------------------------------------------------------------------

  # Blogger export (XML) that is read by main()
  BLOGGER_EXPORT = r'c:\tmp\blog.xml'

  # directory the comment files and the authors file are written to
  COMMENTS_DIR = 'comments'

  # extension appended to every exported comment file
  COMMENT_EXT = '.md'

  # name of the author/picture listing written by export_authors()
  AUTHORS_FILENAME = 'authors.txt'

  # ---------------------------------------------------------------------------
  # Module state
  # ---------------------------------------------------------------------------

  # (author name, picture URL) tuples, appended to by process_comment()
  authors_and_pics = list()
  def main():
      """Classify every entry of the Blogger export and export its comments.

      The file named by ``BLOGGER_EXPORT`` is parsed with ``untangle``. Each
      feed entry is classified by the part of its category ``term`` that
      follows the ``#``: ``settings``, ``post``, ``comment`` or ``template``;
      anything else is counted as an "other" entry. Comment entries are passed
      to ``process_comment()``. After all entries have been visited,
      ``export_authors()`` is called and a summary of the counts is printed.
      """
      obj = untangle.parse(BLOGGER_EXPORT)

      # one counter per recognised entry kind; everything else is an "other"
      counts = {'template': 0, 'post': 0, 'comment': 0, 'settings': 0}
      others = 0

      for entry in obj.feed.entry:
          categories = entry.category
          try:
              full_type = categories['term']
          except TypeError:
              # if a post is under multiple categories, `categories` cannot be
              # indexed by name; use the first term that contains a '#'
              for my_category in categories:
                  full_type = my_category['term']
                  # str.find() uses a return of `-1` to denote failure
                  if full_type.find('#') != -1:
                      break
              else:
                  # No category names an entry kind. Count the entry exactly
                  # once and skip the classification below, which would
                  # otherwise count the same entry a second time.
                  others += 1
                  continue

          simple_type = full_type[full_type.find('#') + 1:]
          if simple_type not in counts:
              others += 1
              continue

          counts[simple_type] += 1
          if simple_type == 'comment':
              process_comment(entry, obj)
          # posts are only counted; they could be processed here as well

      export_authors()

      summary = ('\n'
                 '{} template\n'
                 '{} posts (including drafts)\n'
                 '{} comments\n'
                 '{} settings\n'
                 '{} other entries')
      print(summary.format(counts['template'],
                           counts['post'],
                           counts['comment'],
                           counts['settings'],
                           others))
  def process_comment(entry, obj):
      """Write a single Blogger comment to its own file.

      The comment is written to
      ``COMMENTS_DIR/<article slug>/<comment id><COMMENT_EXT>`` with a small
      ``date`` / ``author`` / ``email`` header followed by the comment body.

      Args:
          entry: the parsed feed entry of the comment.
          obj: the parsed export; its ``feed.entry`` items are searched for
              the article whose id equals the comment's in-reply-to ref.

      Side effects:
          The comment's (author, picture) pair is appended to the
          module-level ``authors_and_pics`` list, even when no matching
          article is found. In that case a message is printed and no file is
          written.
      """
      # e.g. "tag:blogger.com,1999:blog-26967745.post-4115122471434984978"
      comment_id = entry.id.cdata
      # in ISO 8601 format, usable as is
      comment_published = entry.published.cdata
      comment_body = entry.content.cdata
      comment_post_id = entry.thr_in_reply_to['ref']
      comment_author = entry.author.name.cdata
      comment_author_pic = entry.author.gd_image['src']
      comment_author_email = entry.author.email.cdata

      # Remember author and picture for export_authors(). Appending mutates
      # the module-level list in place, so no `global` statement is needed.
      authors_and_pics.append((comment_author, comment_author_pic))

      # use this for a filename for the comment
      # e.g. "4115122471434984978"
      comment_short_id = comment_id[comment_id.find('post-') + 5:]

      comment_text = "date: {}\nauthor: {}\nemail: {}\n\n{}\n".format(
          comment_published,
          comment_author,
          comment_author_email,
          comment_body)

      # article: the entry whose id matches the comment's in-reply-to ref
      for candidate in obj.feed.entry:
          if candidate.id.cdata == comment_post_id:
              article_entry = candidate
              break
      else:
          print("No matching article for comment", comment_id, comment_post_id)
          # don't process comment further
          return

      # article slug: taken from the "alternate" link, or derived from the
      # title when the article has no such link
      for link in article_entry.link:
          if link['rel'] == 'alternate':
              article_link = link['href']
              break
      else:
          article_title = article_entry.title.cdata
          print('Could not find slug for', article_title)
          article_link = article_title.lower().replace(' ', '-')

      slug_start = article_link.rfind('/') + 1
      slug_end = article_link.find('.html', slug_start)
      if slug_end == -1:
          # No ".html" after the last slash (always the case for the
          # title-based fallback): keep everything up to the end instead of
          # letting -1 cut off the last character.
          slug_end = len(article_link)
      article_slug = article_link[slug_start:slug_end]

      # folder; if it doesn't exist, create it
      article_dir = Path(COMMENTS_DIR).resolve() / article_slug
      article_dir.mkdir(parents=True, exist_ok=True)

      # Write the comment file. The encoding is given explicitly because the
      # locale default can fail on characters outside the local code page.
      comment_filename = article_dir / (comment_short_id + COMMENT_EXT)
      comment_filename.write_text(comment_text, encoding='utf-8')
  def export_authors():
      """Write the distinct comment authors and their pictures to a file.

      The (author, picture) pairs collected in the module-level
      ``authors_and_pics`` list are de-duplicated, sorted, and written one
      pair per line, separated by two tabs, to
      ``COMMENTS_DIR/AUTHORS_FILENAME``.
      """
      lines = [author + '\t\t' + pic + '\n'
               for author, pic in sorted(set(authors_and_pics))]

      comments_dir = Path(COMMENTS_DIR).resolve()
      # The directory is otherwise only created when a comment file is
      # written, so make sure it exists before writing into it.
      comments_dir.mkdir(parents=True, exist_ok=True)

      authors_filename = comments_dir / AUTHORS_FILENAME
      # explicit encoding: the locale default can fail on non-ASCII names
      authors_filename.write_text(''.join(lines), encoding='utf-8')
  # Run the export only when this file is executed as a script.
  if "__main__" == __name__:
      main()