Browse Source

Added JPEG metadata reader to create articles

See README.md for more detail. Includes basic pytest cases. Creates a full-size image and a thumbnail at the destination. Requires the Exiv2 binary to be installed: the Python bindings would not install on Python 3 for me, so the plugin parses the output of the `exiv2` process instead.
Mitchell Currie 6 years ago
parent
commit
b39fe8231e

+ 23 - 9
better_figures_and_images/better_figures_and_images.py

@@ -22,6 +22,7 @@ from pelican import signals
 from bs4 import BeautifulSoup
 from PIL import Image
 import pysvg.parser
+import cssutils
 
 import logging
 logger = logging.getLogger(__name__)
@@ -43,24 +44,37 @@ def content_object_init(instance):
             logger.debug('Better Fig. img_path: %s', img_path)
             logger.debug('Better Fig. img_fname: %s', img_filename)
 
+            # If the image already has attributes... then we can skip it. Assuming it's already optimised
+            if 'style' in img.attrs:
+                sheet = cssutils.parseStyle(img['style'])
+                if len(sheet.width) > 0 or len(sheet.height) > 0:
+                    continue
+
             # Pelican 3.5+ supports {attach} macro for auto copy, in this use case the content does not exist in output
             # due to the fact it has not been copied, hence we take it from the source (same as current document)
             if img_filename.startswith('{attach}'):
                 img_path = os.path.dirname(instance.source_path)
                 img_filename = img_filename[8:]
                 src = os.path.join(img_path, img_filename)
-            else:
+            elif img_path.startswith(('{filename}', '|filename|')):
                 # Strip off {filename}, |filename| or /static
-                if img_path.startswith(('{filename}', '|filename|')):
-                    img_path = img_path[10:]
-                elif img_path.startswith('/static'):
-                    img_path = img_path[7:]
-                elif img_path.startswith('data:image'):
-                    # Image is encoded in-line (not a file).
-                    continue
+                img_path = img_path[10:]
+            elif img_path.startswith('/static'):
+                img_path = img_path[7:]
+            elif img_path.startswith('data:image'):
+                # Image is encoded in-line (not a file).
+                continue
+            else:
+                # Check the location in the output as some plugins create them there.
+                output_path = path.dirname(instance.save_as)
+                image_output_location = path.join(instance.settings['OUTPUT_PATH'], output_path, img_filename)
+                if path.isfile(image_output_location):
+                    src = image_output_location
+                    logger.info('{src} located in output, missing from content.'.format(src=img_filename))
                 else:
-                    logger.warning('Better Fig. Error: img_path should start with either {filename}, |filename| or /static')
+                    logger.warning('Better Fig. Error: img_path should start with either {attach}, {filename}, |filename| or /static')
 
+            if src is None:
                 # search src path list
                 # 1. Build the source image filename from PATH
                 # 2. Build the source image filename from STATIC_PATHS

+ 1 - 0
jpeg_reader/.gitignore

@@ -0,0 +1 @@
+.cache

+ 45 - 0
jpeg_reader/README.md

@@ -0,0 +1,45 @@
+### Pelican JPEG Reader.
+
+Original author: [Mitchell Currie](https://github.com/mitchins)
+
+##### Requirements:
+
+* Python3
+* Pelican
+* Pillow library (PIL for python3)
+* Exiv2 binary accessible by $PATH
+
+To avoid undesired creation of content, the file extension must be `jpeg_article`, e.g. `myPhoto.jpeg_article`. The file is a regular JPEG image; the special extension just keeps your other JPEG images from being picked up. The reader works for both pages and blog articles, and decides which one to create based on whether the file is under `content/pages` or `content/blog` (or whatever you use for content).
+
+#### Most relevant EXIF/IPTC flags from Exiv2 that are used
+
+|  Page/Article Field | Exiv2 Key  |  Description |
+|---|---|---|
+| title  | `Exif.Image.ImageDescription`  |  Defaults to 'Untitled' |
+| author  | `Exif.Image.Artist`  |  Defaults to 'Unknown'. Currently a single (scalar) value |
+| date  |  `Exif.Photo.DateTimeOriginal` |  Required; behaviour is undefined if not present |
+|  slug |  `Iptc.Application2.Headline` |  Defaults to title's value |
+|  body |  `Exif.Photo.UserComment` |  This goes under image in page/article, blank default |
+|  summary |  `Iptc.Application2.Caption` |  Used for article index, defaults to first 140 characters of the body |
+|  category |  `Iptc.Application2.SuppCategory` |  Specifies the category of page/article if `USE_FOLDER_AS_CATEGORY` not set  |
+|  template |  `Iptc.Application2.ObjectName` |  If specified will set the template metadata property to tell pelican where to look  |
+|  tags |  `Iptc.Application2.Keywords` |  For each entry found with this key, a tag is created with the value of the entry |
+|  `metadata['exiv2']` | ***Everything***|  All exiv2 fields from the image are shoved into the metadata dictionary of the item, under `exiv2` key for template usage |
+
+
+
+#### Pelican Settings Added or Honoured:
+
+|  Key in pelicanconf.py |  Description |
+|---|---|
+| `PATH`  |  **Content Path** |
+| `OUTPUT_PATH` |  **Output Path** |
+| `USE_FOLDER_AS_CATEGORY` | **Category from folder name** If enabled, takes the category from the name of the folder the file is in. Otherwise the category will attempt to be read from `Iptc.Application2.SuppCategory` |
+|  `SITEURL` | **Site Url** The optional absolute Url for the site, defaults to '' usually. |
+|  `PAGE_URL` | **Page Url** The format string for the URL of page html files |
+|  `PAGE_SAVE_AS` | **Page Save Path** The format string to specify where page html files are physically written to disk |
+|  `ARTICLE_URL` | **Article Url** The format string for the URL of article html files |
+|  `ARTICLE_SAVE_AS` | **Article Save Path** The format string to specify where article html files are physically written to disk |
+
+
+

+ 1 - 0
jpeg_reader/__init__.py

@@ -0,0 +1 @@
+from .jpeg_reader import *

+ 42 - 0
jpeg_reader/constants.py

@@ -0,0 +1,42 @@
+from enum import Enum
+
+
+class Exiv(Enum):
+    """Exiv2 key names (EXIF and IPTC) that the JPEG reader looks up in an image's metadata."""
+    DESCRIPTION = 'Exif.Image.ImageDescription'  # used as the document title
+    ARTIST = 'Exif.Image.Artist'  # used as the author
+    DATETIME = 'Exif.Photo.DateTimeOriginal'  # used as the document date
+    HEADLINE = 'Iptc.Application2.Headline'  # source text for the slug
+    COMMENT = 'Exif.Photo.UserComment'  # body text shown with the image
+    CAPTION = 'Iptc.Application2.Caption'  # used as the summary
+    KEYWORDS = 'Iptc.Application2.Keywords'  # one tag per entry
+    CATEGORY = 'Iptc.Application2.SuppCategory'  # category when it is not taken from the folder
+    OBJECT_NAME = 'Iptc.Application2.ObjectName'  # optional template name
+
+
+class PelicanConfig(Enum):
+    """Names of the Pelican settings (keys of the settings dict) that the JPEG reader reads."""
+    PATH = 'PATH'  # content root
+    OUTPUT_PATH = 'OUTPUT_PATH'  # output root
+    USE_FOLDER_AS_CATEGORY = 'USE_FOLDER_AS_CATEGORY'
+    SITE_URL = 'SITEURL'  # note: the setting name has no underscore
+    PAGE_URL = 'PAGE_URL'
+    PAGE_SAVE_AS = 'PAGE_SAVE_AS'
+    ARTICLE_URL = 'ARTICLE_URL'
+    ARTICLE_SAVE_AS = 'ARTICLE_SAVE_AS'
+
+
+class PelicanMetadata(Enum):
+    """Keys of the metadata dictionary that the JPEG reader fills in for each document."""
+    TITLE = 'title'
+    AUTHORS = 'authors'
+    DATE = 'date'
+    SLUG = 'slug'
+    TAGS = 'tags'
+    CATEGORY = 'category'
+    SUMMARY = 'summary'
+    FEATURED_IMAGE = 'featured_image'  # Acts as a thumbnail
+    TEMPLATE = 'template'
+    CUSTOM_ALL = 'exiv2'  # Not officially part of metadata, but we add it ourselves
+
+
+class PelicanClass(Enum):
+    """Names of the top-level content folders used to tell pages from blog articles."""
+    BLOG = 'blog'
+    PAGES = 'pages'

+ 61 - 0
jpeg_reader/exiv2_parser.py

@@ -0,0 +1,61 @@
+import re
+import subprocess
+from typing import List, Optional, Tuple
+
+from . import util
+
+
+class Keyword:
+    """One metadata key found in an image, with its type name and number of occurrences."""
+
+    def __init__(self, *, keyword: str, kind: str, count: int):
+        # All three arguments are keyword-only and are stored unchanged as attributes.
+        self.keyword, self.kind, self.count = keyword, kind, count
+
+
+class Exiv2Parser:
+    """Read image metadata by running the ``exiv2`` command-line tool and parsing its output."""
+
+    @classmethod
+    def get_exiv2_version(cls) -> Optional[Tuple[str, str]]:
+        """Return the two tokens that follow ``exiv2`` in the ``exiv2 --version`` output.
+
+        The first token is the dotted version number, the second the word after it.
+        Returns ``None`` when the binary cannot be started (for example because it
+        is not on ``$PATH``) or when the output does not match that pattern, so the
+        result can be used as an availability check.
+        """
+        commands = ['exiv2', '--version']
+        try:
+            process = subprocess.Popen(commands, stdout=subprocess.PIPE)
+        except OSError:
+            # Popen raises OSError (e.g. FileNotFoundError) when the binary is missing or not executable.
+            return None
+        output = util.to_str(process.communicate()[0])
+        match = re.search(r'exiv2 ([\d.]+) (\w+)', output)
+        if match is None:
+            return None
+        return match.groups()
+
+    @classmethod
+    def get_values(cls, file_path: str) -> dict:
+        """Return a dict mapping every metadata key found in *file_path* to its value.
+
+        A key that occurs once maps to a string; a key that occurs several times
+        maps to a list of strings, one per output line. One ``exiv2`` process is
+        started per distinct key.
+        """
+        result = dict()
+        for key in cls.__get_keys(file_path):
+            commands = ['exiv2', '-K', key.keyword, '-P', 't', 'print', file_path]
+            process = subprocess.Popen(commands, stdout=subprocess.PIPE)
+            output = util.to_str(process.communicate()[0]).rstrip('\n')
+            # Check if the key is a list or scalar
+            if key.count > 1:
+                result[key.keyword] = output.split('\n')  # Assume the output is like keywords, one per line
+            else:
+                result[key.keyword] = output  # Assume the whole input is the value
+        return result
+
+    @classmethod
+    def __get_keys(cls, file_path: str) -> List[Keyword]:
+        """List the distinct keys that ``exiv2 -P ky print`` reports for *file_path*.
+
+        Each ``Keyword`` records the second field printed for the first occurrence
+        of the key and how many times the key appeared in the output.
+        """
+        found_keywords = dict()
+        commands = ['exiv2', '-P', 'ky', 'print', file_path]
+        process = subprocess.Popen(commands, stdout=subprocess.PIPE)
+        output = util.to_str(process.communicate()[0])
+        for match in re.finditer(r'([\w.]+)\W+(\w+)\W*\n?', output):
+            code, kind = match.groups()
+            keyword = found_keywords.get(code)
+            if keyword is None:
+                # Only build a Keyword the first time a key is seen.
+                keyword = found_keywords[code] = Keyword(keyword=code, kind=kind, count=0)
+            keyword.count += 1
+
+        return list(found_keywords.values())
+
+if __name__ == '__main__':
+    # Manual smoke check: print what get_exiv2_version() reports for the installed exiv2.
+    print(Exiv2Parser.get_exiv2_version())
+
+

+ 128 - 0
jpeg_reader/jpeg_reader.py

@@ -0,0 +1,128 @@
+"""
+This plugin uses the metadata from JPEG images (EXIF and IPTC) to construct a meaningful page or gallery.
+Possible uses are gallery pages or a blog article that's mainly about an image.
+With this tool, it's possible to just dump an image without any extra data/linkage to create coherent output.
+The note here is that the extension is `jpeg_article` so it doesn't pick up {attach} or other static resources.
+"""
+
+import logging
+from datetime import datetime
+from os import makedirs, sep
+from os.path import join, dirname, isdir, splitext
+from typing import Tuple
+
+from PIL import Image
+from pelican import signals
+from pelican.readers import BaseReader
+from pelican.urlwrappers import URLWrapper, Category, Author, Tag
+
+from .constants import Exiv, PelicanConfig, PelicanMetadata, PelicanClass
+from .exiv2_parser import Exiv2Parser
+
+
+class JpegReader(BaseReader):
+    """Pelican reader that turns a JPEG image (``.jpeg_article``) into a page or article.
+
+    Document fields are taken from the image's EXIF/IPTC metadata as returned by
+    ``Exiv2Parser``. As a side effect the image and a thumbnail are written into
+    the output directory next to where the HTML file will be saved.
+    """
+    logger = logging.getLogger('JpegReader')
+    enabled = True
+    # Must be a real tuple: a bare parenthesised string is iterated character by character.
+    file_extensions = ('jpeg_article',)
+    thumb_size = 250, 250  # bounding box handed to Image.thumbnail()
+
+    def __init__(self, settings):
+        super(JpegReader, self).__init__(settings)
+
+    def read(self, source_path):
+        """Return ``(content, metadata)`` for the image at *source_path*.
+
+        Raises ``RuntimeError`` when no working ``exiv2`` binary is detected.
+        Returns ``None`` (after logging a warning) when the image metadata cannot
+        be parsed, i.e. when ``parse_jpeg`` raises ``ValueError``.
+        """
+        if Exiv2Parser.get_exiv2_version() is None:
+            raise RuntimeError('exiv2 is required to read {0} but was not detected'.format(source_path))
+        try:
+            return self.parse_jpeg(source_path=source_path)
+        except ValueError as error:  # if file can't be parsed, ignore it
+            JpegReader.logger.warning('Ignoring %s: %s', source_path, error)
+            return None
+
+    def parse_jpeg(self, *, source_path: str) -> Tuple[str, dict]:
+        """Build the HTML content and the metadata dictionary for one image.
+
+        Side effects: creates the output directory if needed and writes
+        ``<slug>.jpg`` and ``<slug>_thumb.jpg`` into it.
+
+        :param source_path: path of the image file inside the content tree.
+        :return: ``(content, metadata)``.
+        :raises ValueError: when ``Exif.Photo.DateTimeOriginal`` is missing or is
+            not formatted as ``YYYY:MM:DD HH:MM:SS``.
+        """
+        from html import escape  # function-scope import; only the alt attribute needs it
+
+        JpegReader.logger.info(source_path)
+
+        image_data = Exiv2Parser.get_values(source_path)
+
+        title = image_data.get(Exiv.DESCRIPTION.value, 'Untitled')
+        author = image_data.get(Exiv.ARTIST.value, 'Unknown')
+        date_string = image_data.get(Exiv.DATETIME.value, '')
+
+        # strptime raises ValueError for a missing/odd date; read() treats that as "skip this file".
+        date = datetime.strptime(date_string, "%Y:%m:%d %H:%M:%S")
+        slug = URLWrapper(image_data.get(Exiv.HEADLINE.value, title), self.settings).slug
+        description_long = image_data.get(Exiv.COMMENT.value, '')
+        summary = image_data.get(Exiv.CAPTION.value, description_long[:140])
+
+        keywords = image_data.get(Exiv.KEYWORDS.value, list())
+        if isinstance(keywords, str):
+            # A key that occurs only once comes back as a plain string; without this
+            # the loop below would create one tag per character.
+            keywords = [keywords]
+        tags = [Tag(tag, self.settings) for tag in keywords]
+
+        content_root = self.settings[PelicanConfig.PATH.value]
+        path_output = self.settings[PelicanConfig.OUTPUT_PATH.value]
+        relative_source = dirname(source_path[len(content_root):]).lstrip(sep)
+        if self.settings[PelicanConfig.USE_FOLDER_AS_CATEGORY.value]:
+            category = relative_source.split(sep)[-1]
+        else:
+            category = image_data.get(Exiv.CATEGORY.value, None)
+
+        type_of_content = relative_source.split(sep)[0]  # either 'blog' or 'pages' as far as I know.
+        url_site = self.settings[PelicanConfig.SITE_URL.value]
+
+        if type_of_content.lower() == PelicanClass.PAGES.value:
+            document_save_as = self.settings[PelicanConfig.PAGE_SAVE_AS.value]
+        else:  # Assume PelicanClass.BLOG
+            document_save_as = self.settings[PelicanConfig.ARTICLE_SAVE_AS.value]
+
+        author_wrapper = Author(author, self.settings)
+
+        metadata = {PelicanMetadata.TITLE.value: title, PelicanMetadata.AUTHORS.value: [author_wrapper],
+                    PelicanMetadata.DATE.value: date, PelicanMetadata.SLUG.value: slug,
+                    PelicanMetadata.TAGS.value: tags,
+                    PelicanMetadata.CUSTOM_ALL.value: image_data}
+        if category is not None:
+            metadata[PelicanMetadata.CATEGORY.value] = Category(category, self.settings)
+
+        thumb_name = '{0}_thumb.jpg'.format(slug)
+        original_name = '{0}.jpg'.format(slug)
+
+        # Format only the save-as pattern so braces in the output path are left alone.
+        path_output_html = join(path_output, document_save_as.format(**metadata))
+        path_output_dir = dirname(path_output_html)
+        path_output_original = join(path_output_dir, original_name)
+        path_output_thumb = join(path_output_dir, thumb_name)
+
+        # Here we generate the summary info in case this is used for articles, so we get nice thumbnails and summary
+        metadata[PelicanMetadata.SUMMARY.value] = summary
+        # Build the URL by hand: os.path.join would drop the site URL whenever the
+        # relative part starts with a separator, and would use backslashes on Windows.
+        thumb_relative = path_output_thumb[len(path_output):].replace(sep, '/').lstrip('/')
+        metadata[PelicanMetadata.FEATURED_IMAGE.value] = '{0}/{1}'.format(url_site.rstrip('/'), thumb_relative)
+        if Exiv.OBJECT_NAME.value in image_data:
+            metadata[PelicanMetadata.TEMPLATE.value] = image_data[Exiv.OBJECT_NAME.value]
+
+        # The context manager closes the source file once both copies are written.
+        with Image.open(source_path) as img:
+            # Write the size/HTML out before we reduce the image to a thumb.
+            # The title is escaped because it lands inside a single-quoted attribute.
+            content = "<img src='{src}' alt='{alt}' style='width: {width}px; height: auto; max-width: 100%;'></img><p>{body}</p>" \
+                .format(src=original_name, alt=escape(str(title), quote=True), width=img.width, body=description_long)
+
+            # Ensure the directory levels exist
+            makedirs(path_output_dir, exist_ok=True)
+            img.save(path_output_original)
+            img.thumbnail(self.thumb_size)
+            img.save(path_output_thumb)
+
+        # Debug info if we need it
+        JpegReader.logger.debug(content)
+        JpegReader.logger.debug(str(metadata))
+        JpegReader.logger.debug(path_output_html)
+
+        return content, metadata
+
+
+def add_reader(readers):
+    """Signal handler: map the ``jpeg_article`` extension to ``JpegReader`` in *readers*."""
+    extension = 'jpeg_article'
+    readers.reader_classes[extension] = JpegReader
+
+
+def register():
+    """Plugin entry point: connect ``add_reader`` to Pelican's ``readers_init`` signal."""
+    readers_init = signals.readers_init
+    readers_init.connect(add_reader)

+ 38 - 0
jpeg_reader/test_exiv2_parser.py

@@ -0,0 +1,38 @@
+import subprocess
+
+from .exiv2_parser import Exiv2Parser
+
+
+class MockPopen(object):
+    """Stand-in for ``subprocess.Popen`` whose output is a shell "command not found" message."""
+
+    def __init__(self, cmd, *, stdout):
+        """Accept the command and the keyword-only ``stdout`` argument and ignore both."""
+
+    def communicate(self):
+        """Return canned ``(stdout, stderr)`` bytes in place of real process output."""
+        stdout_data = b'bash: command not found: exiv2'
+        stderr_data = b''
+        return stdout_data, stderr_data
+
+
+class MockPopenSuccess(MockPopen):
+    """``MockPopen`` variant whose output is an exiv2 version banner."""
+
+    def __init__(self, cmd, *, stdout):
+        super().__init__(cmd, stdout=stdout)
+
+    def communicate(self):
+        """Return a canned version banner as stdout and empty stderr."""
+        banner = b'exiv2 0.26 001a00 (64 bit build)'
+        return banner, b''
+
+
+def test_get_version_fail(monkeypatch):
+    """The version lookup yields None when the process output is not an exiv2 banner."""
+    # Only the built-in monkeypatch fixture is used; the unused `mocker` fixture
+    # made the test depend on the pytest-mock plugin for no reason.
+    monkeypatch.setattr(subprocess, 'Popen', MockPopen)
+    version_info = Exiv2Parser.get_exiv2_version()
+    assert version_info is None
+
+
+def test_get_version_success(monkeypatch):
+    """The version lookup returns the two tokens that follow ``exiv2`` in the banner."""
+    # Only the built-in monkeypatch fixture is used; the unused `mocker` fixture
+    # made the test depend on the pytest-mock plugin for no reason.
+    monkeypatch.setattr(subprocess, 'Popen', MockPopenSuccess)
+    version, commit = Exiv2Parser.get_exiv2_version()
+    assert version == '0.26'
+    assert commit == '001a00'
+
+
+

+ 14 - 0
jpeg_reader/util.py

@@ -0,0 +1,14 @@
+def to_str(bytes_or_str):
+    """Return *bytes_or_str* as text: ``bytes`` are decoded as UTF-8, anything else is returned unchanged."""
+    if not isinstance(bytes_or_str, bytes):
+        return bytes_or_str
+    return bytes_or_str.decode('utf-8')
+
+
+def to_bytes(bytes_or_str):
+    """Return *bytes_or_str* as bytes: ``str`` is encoded as UTF-8, anything else is returned unchanged."""
+    return bytes_or_str.encode('utf-8') if isinstance(bytes_or_str, str) else bytes_or_str