Преглед изворни кода

Add support for exporting Markdown articles as PDF. Refs #573

The method used in this commit might not be the best way to implement this
feature. Nevertheless, it actually builds upon a documented feature of rst2pdf
and simplifies the code, as only a single interface is used.

The main logic behind this commit is:

 1- Load the Markdown source files with Pelican's Markdown reader;
 2- Convert the markdown input into an HTML representation + metadata;
 3- Merge the metadata and text into a reStructuredText string;
 4- Feed this new string into rst2pdf.

For this feature to work, the user must have xhtml2pdf installed. A bug in
rst2pdf (https://github.com/rst2pdf/rst2pdf/issues/521) prevents it from
finding the PyPDF version used by the latest xhtml2pdf versions. Hence, we have
to monkeypatch xhtml2pdf to expose the `pyPdf` name that rst2pdf expects
*prior* to importing rst2pdf.
Renato Cunha пре 8 година
родитељ
комит
360f5452d5
2 измењених фајлова са 71 додато и 16 уклоњено
  1. 59 11
      pdf/pdf.py
  2. 12 5
      pdf/test_pdf.py

+ 59 - 11
pdf/pdf.py

@@ -8,19 +8,34 @@ The pdf plugin generates PDF files from RST sources.
 
 from __future__ import unicode_literals, print_function
 
+from io import open
 from pelican import signals
 from pelican.generators import Generator
-from rst2pdf.createpdf import RstToPdf
+from pelican.readers import MarkdownReader
 
 import os
 import logging
 
 logger = logging.getLogger(__name__)
 
+import xhtml2pdf.util
+if 'pyPdf' not in dir(xhtml2pdf.util):
+    try:
+        from xhtml2pdf.util import PyPDF2
+        xhtml2pdf.util.pyPdf = PyPDF2
+    except ImportError:
+        logger.error('Failed to monkeypatch xhtml2pdf. ' +
+                     'You have missing dependencies')
+        raise
+
+from rst2pdf.createpdf import RstToPdf
+
 
 class PdfGenerator(Generator):
-    """Generate PDFs on the output dir, for all articles and pages coming from
-    rst"""
+    "Generate PDFs on the output dir, for all articles and pages"
+
+    supported_md_fields = ['date']
+
     def __init__(self, *args, **kwargs):
         super(PdfGenerator, self).__init__(*args, **kwargs)
 
@@ -36,16 +51,49 @@ class PdfGenerator(Generator):
 
         self.pdfcreator = RstToPdf(breakside=0,
                                    stylesheets=pdf_style,
-                                   style_path=pdf_style_path)
+                                   style_path=pdf_style_path,
+                                   raw_html=True)
 
     def _create_pdf(self, obj, output_path):
-        if obj.source_path.endswith('.rst'):
-            filename = obj.slug + ".pdf"
-            output_pdf = os.path.join(output_path, filename)
-            # print('Generating pdf for', obj.source_path, 'in', output_pdf)
-            with open(obj.source_path) as f:
-                self.pdfcreator.createPdf(text=f.read(), output=output_pdf)
-            logger.info(' [ok] writing %s' % output_pdf)
+        filename = obj.slug + '.pdf'
+        output_pdf = os.path.join(output_path, filename)
+        mdreader = MarkdownReader(self.settings)
+        _, ext = os.path.splitext(obj.source_path)
+        if ext == '.rst':
+            with open(obj.source_path, encoding='utf-8') as f:
+                text = f.read()
+            header = ''
+        elif ext[1:] in mdreader.file_extensions and mdreader.enabled:
+            text, meta = mdreader.read(obj.source_path)
+            header = ''
+
+            if 'title' in meta:
+                title = meta['title']
+                header = title + '\n' + '#' * len(title) + '\n\n'
+                del meta['title']
+
+            for k in meta.keys():
+                # We can't support all fields, so we strip the ones that won't
+                # look good
+                if k not in self.supported_md_fields:
+                    del meta[k]
+
+            header += '\n'.join([':%s: %s' % (k, meta[k]) for k in meta])
+            header += '\n\n.. raw:: html\n\n\t'
+            text = text.replace('\n', '\n\t')
+
+            # rst2pdf casts the text to str and will break if it finds
+            # non-escaped characters. Here we nicely escape them to XML/HTML
+            # entities before proceeding
+            text = text.encode('ascii', 'xmlcharrefreplace')
+        else:
+            # We don't support this format
+            logger.warn('Ignoring unsupported file ' + obj.source_path)
+            return
+
+        logger.info(' [ok] writing %s' % output_pdf)
+        self.pdfcreator.createPdf(text=(header+text),
+                                  output=output_pdf)
 
     def generate_context(self):
         pass

+ 12 - 5
pdf/test_pdf.py

@@ -6,17 +6,19 @@ import pdf
 
 from tempfile import mkdtemp
 from pelican import Pelican
+from pelican.readers import MarkdownReader
 from pelican.settings import read_settings
 from shutil import rmtree
 
 CUR_DIR = os.path.dirname(__file__)
 
+
 class TestPdfGeneration(unittest.TestCase):
     def setUp(self, override=None):
-        import pdf
         self.temp_path = mkdtemp(prefix='pelicantests.')
         settings = {
-            'PATH': os.path.join(os.path.dirname(CUR_DIR), '..', 'test_data', 'content'),
+            'PATH': os.path.join(os.path.dirname(CUR_DIR), '..', 'test_data',
+                                 'content'),
             'OUTPUT_PATH': self.temp_path,
             'PLUGINS': [pdf],
             'LOCALE': locale.normalize('en_US'),
@@ -30,12 +32,17 @@ class TestPdfGeneration(unittest.TestCase):
         try:
             pelican.run()
         except ValueError:
-            logging.warn('Relative links in the form of |filename|images/test.png are not yet handled by the pdf generator')
+            logging.warn('Relative links in the form of ' +
+                         '|filename|images/test.png are not yet handled by ' +
+                         ' the pdf generator')
             pass
 
-
     def tearDown(self):
         rmtree(self.temp_path)
 
     def test_existence(self):
-        assert os.path.exists(os.path.join(self.temp_path, 'pdf', 'this-is-a-super-article.pdf'))
+        assert os.path.exists(os.path.join(self.temp_path, 'pdf',
+                                           'this-is-a-super-article.pdf'))
+        if MarkdownReader.enabled:
+            assert os.path.exists(os.path.join(self.temp_path, 'pdf',
+                                  'a-markdown-powered-article.pdf'))