Przeglądaj źródła

Merge pull request #967 from mitchins/master

Add jpeg_reader plugin to create articles from JPEG metadata
Justin Mayer 7 lat temu
rodzic
commit
85fe303e6e

+ 23 - 9
better_figures_and_images/better_figures_and_images.py

@@ -22,6 +22,7 @@ from pelican import signals
 from bs4 import BeautifulSoup
 from PIL import Image
 import pysvg.parser
+import cssutils
 
 import logging
 logger = logging.getLogger(__name__)
@@ -43,24 +44,37 @@ def content_object_init(instance):
             logger.debug('Better Fig. img_path: %s', img_path)
             logger.debug('Better Fig. img_fname: %s', img_filename)
 
+            # If the image already has attributes... then we can skip it. Assuming it's already optimised
+            if 'style' in img.attrs:
+                sheet = cssutils.parseStyle(img['style'])
+                if len(sheet.width) > 0 or len(sheet.height) > 0:
+                    continue
+
             # Pelican 3.5+ supports {attach} macro for auto copy, in this use case the content does not exist in output
             # due to the fact it has not been copied, hence we take it from the source (same as current document)
             if img_filename.startswith('{attach}'):
                 img_path = os.path.dirname(instance.source_path)
                 img_filename = img_filename[8:]
                 src = os.path.join(img_path, img_filename)
-            else:
+            elif img_path.startswith(('{filename}', '|filename|')):
                 # Strip off {filename}, |filename| or /static
-                if img_path.startswith(('{filename}', '|filename|')):
-                    img_path = img_path[10:]
-                elif img_path.startswith('/static'):
-                    img_path = img_path[7:]
-                elif img_path.startswith('data:image'):
-                    # Image is encoded in-line (not a file).
-                    continue
+                img_path = img_path[10:]
+            elif img_path.startswith('/static'):
+                img_path = img_path[7:]
+            elif img_path.startswith('data:image'):
+                # Image is encoded in-line (not a file).
+                continue
+            else:
+                # Check the location in the output as some plugins create them there.
+                output_path = path.dirname(instance.save_as)
+                image_output_location = path.join(instance.settings['OUTPUT_PATH'], output_path, img_filename)
+                if path.isfile(image_output_location):
+                    src = image_output_location
+                    logger.info('{src} located in output, missing from content.'.format(src=img_filename))
                 else:
-                    logger.warning('Better Fig. Error: img_path should start with either {filename}, |filename| or /static')
+                    logger.warning('Better Fig. Error: img_path should start with either {attach}, {filename}, |filename| or /static')
 
+            if src is None:
                 # search src path list
                 # 1. Build the source image filename from PATH
                 # 2. Build the source image filename from STATIC_PATHS

+ 1 - 0
jpeg_reader/.gitignore

@@ -0,0 +1 @@
+.cache

+ 45 - 0
jpeg_reader/README.md

@@ -0,0 +1,45 @@
+### Pelican JPEG Reader.
+
+Original author: [Mitchell Currie](https://github.com/mitchins)
+
+##### Requirements:
+
+* Python3
+* Pelican
+* Pillow library (PIL for python3)
+* Exiv2 binary accessible by $PATH
+
+To avoid undesired creation of content, source files must use the specific extension `jpeg_article` (e.g. `myPhoto.jpeg_article`). Such a file is a regular JPEG image; the distinct extension simply keeps your other JPEG images from being picked up. The reader works for both pages and blog articles: the content type is determined by whether the file lives under `content/pages` or `content/blog` (or whatever folders you use for content).
+
+#### Most relevant EXIF/IPTC flags from Exiv2 that are used
+
+|  Page/Article Field | Exiv2 Key  |  Description |
+|---|---|---|
+| title  | `Exif.Image.ImageDescription`  |  Defaults to 'Untitled' |
+| author  | `Exif.Image.Artist`  |  Defaults to 'Unknown'. Currently a scalar (single author only) |
+| date  |  `Exif.Photo.DateTimeOriginal` |  Required; behaviour is undefined if not present |
+|  slug |  `Iptc.Application2.Headline` |  Defaults to title's value |
+|  body |  `Exif.Photo.UserComment` |  This goes under image in page/article, blank default |
+|  summary |  `Iptc.Application2.Caption` |  Used for article index, defaults to first 140 characters of the body |
+|  category |  `Iptc.Application2.SuppCategory` |  Specifies the category of page/article if `USE_FOLDER_AS_CATEGORY` not set  |
+|  template |  `Iptc.Application2.ObjectName` |  If specified will set the template metadata property to tell pelican where to look  |
+|  tags |  `Iptc.Application2.Keywords` |  For each entry found with this key, a tag is created with the value of the entry |
+|  `metadata['exiv2']` | ***Everything***|  All exiv2 fields from the image are shoved into the metadata dictionary of the item, under `exiv2` key for template usage |
+
+
+
+#### Pelican Settings Added or Honoured:
+
+|  Key in pelicanconf.py |  Description |
+|---|---|
+| `PATH`  |  **Content Path** |
+| `OUTPUT_PATH` |  **Output Path** |
+| `USE_FOLDER_AS_CATEGORY` | **Category from folder name** If enabled, takes the category from the name of the folder the file is in. Otherwise the category will attempt to be read from `Iptc.Application2.SuppCategory` |
+|  `SITEURL` | **Site Url** The optional absolute Url for the site, defaults to '' usually. |
+|  `PAGE_URL` | **Page Url** The format string to specify where page html files are saved to |
+|  `PAGE_SAVE_AS` | **Page Save Path** The format string to specify where page html files are physically written to disk |
+|  `ARTICLE_URL` | **Article Url** The format string to specify where article html files are saved to |
+|  `ARTICLE_SAVE_AS` | **Article Save Path** The format string to specify where article html files are physically written to disk |
+
+
+

+ 1 - 0
jpeg_reader/__init__.py

@@ -0,0 +1 @@
+from .jpeg_reader import *

+ 42 - 0
jpeg_reader/constants.py

@@ -0,0 +1,42 @@
+from enum import Enum
+
+
+class Exiv(Enum):
+    """Exiv2 key names (EXIF and IPTC) that the JPEG reader looks up.
+
+    Each member's value is the key string used to index the dictionary of
+    values read from the image.
+    """
+    DESCRIPTION = 'Exif.Image.ImageDescription'  # title
+    ARTIST = 'Exif.Image.Artist'  # author
+    DATETIME = 'Exif.Photo.DateTimeOriginal'  # date
+    HEADLINE = 'Iptc.Application2.Headline'  # slug source
+    COMMENT = 'Exif.Photo.UserComment'  # body text
+    CAPTION = 'Iptc.Application2.Caption'  # summary
+    KEYWORDS = 'Iptc.Application2.Keywords'  # tags
+    CATEGORY = 'Iptc.Application2.SuppCategory'  # category
+    OBJECT_NAME = 'Iptc.Application2.ObjectName'  # template
+
+
+class PelicanConfig(Enum):
+    """Names of the Pelican settings keys referenced by the JPEG reader.
+
+    Each member's value is the key to look up in the settings dictionary.
+    """
+    PATH = 'PATH'
+    OUTPUT_PATH = 'OUTPUT_PATH'
+    USE_FOLDER_AS_CATEGORY = 'USE_FOLDER_AS_CATEGORY'
+    SITE_URL = 'SITEURL'
+    PAGE_URL = 'PAGE_URL'
+    PAGE_SAVE_AS = 'PAGE_SAVE_AS'
+    ARTICLE_URL = 'ARTICLE_URL'
+    ARTICLE_SAVE_AS = 'ARTICLE_SAVE_AS'
+
+
+class PelicanMetadata(Enum):
+    """Keys of the metadata dictionary produced by the JPEG reader.
+
+    Each member's value is the dictionary key under which the corresponding
+    piece of metadata is stored.
+    """
+    TITLE = 'title'
+    AUTHORS = 'authors'
+    DATE = 'date'
+    SLUG = 'slug'
+    TAGS = 'tags'
+    CATEGORY = 'category'
+    SUMMARY = 'summary'
+    FEATURED_IMAGE = 'featured_image'  # Acts as a thumbnail
+    TEMPLATE = 'template'
+    CUSTOM_ALL = 'exiv2'  # Not officially part of metadata, but we add it ourselves
+
+
+class PelicanClass(Enum):
+    """Top-level content folder names used to tell pages from blog articles."""
+    BLOG = 'blog'
+    PAGES = 'pages'

+ 61 - 0
jpeg_reader/exiv2_parser.py

@@ -0,0 +1,61 @@
+import re
+import subprocess
+from typing import List, Optional, Tuple
+
+from . import util
+
+
+class Keyword:
+    def __init__(self, *, keyword:str, kind: str, count: int):
+        self.keyword = keyword
+        self.kind = kind
+        self.count = count
+
+
+class Exiv2Parser:
+    @classmethod
+    def get_exiv2_version(cls) -> Tuple[str, str]:
+        commands = ['exiv2', '--version']
+        process = subprocess.Popen(commands, stdout=subprocess.PIPE)
+        output = util.to_str(process.communicate()[0])
+        match = re.search(r'exiv2 ([\d.]+) (\w+)', output)
+        if match is not None:
+            return match.groups()
+        return None
+
+    @classmethod
+    def get_values(cls, file_path: str) -> dict:
+        keywords = cls.__get_keys(file_path)
+        result = dict()
+        for key in keywords:
+            commands = ['exiv2', '-K', key.keyword, '-P', 't', 'print', file_path]
+            process = subprocess.Popen(commands, stdout=subprocess.PIPE)
+            output = util.to_str(process.communicate()[0]).rstrip('\n')
+            # Check if the key is a list or scalar
+            if key.count > 1:
+                result[key.keyword] = output.split('\n')  # Assume the output is like keywords, one per line
+            else:
+                result[key.keyword] = output  # Assume the whole input is the value
+        return result
+
+    @classmethod
+    def __get_keys(cls, file_path: str) -> List[Keyword]:
+        found_keywords = dict()
+        commands = ['exiv2', '-P', 'ky', 'print', file_path]
+        process = subprocess.Popen(commands, stdout=subprocess.PIPE)
+        output = util.to_str(process.communicate()[0])
+        for match in re.finditer(r'([\w.]+)\W+(\w+)\W*\n?', output):
+            code, kind = match.groups()
+            keyword = found_keywords.get(code, Keyword(keyword=code, kind=kind, count=0))
+            keyword.count += 1
+            found_keywords[code] = keyword
+
+        return list(found_keywords.values())
+
+if __name__ == '__main__':
+    # Manual smoke check: print the version of the exiv2 binary that is found.
+    # NOTE(review): this module uses a relative import, so it presumably has to
+    # be started as a module of its package (``python -m ...``) - confirm.
+    # To dump the metadata of an image instead, uncomment:
+    #data = Exiv2Parser.get_values('content/blog/terms2.jpeg')
+    #print(data)
+    version_info = Exiv2Parser.get_exiv2_version()
+    print(version_info)
+
+

+ 128 - 0
jpeg_reader/jpeg_reader.py

@@ -0,0 +1,128 @@
+"""
+This plugin uses the metadata from JPEG images (EXIF and IPTC) to construct a meaningful page or gallery.
+Possible uses are gallery pages or a blog article that's mainly about an image.
+With this tool, it's possible to just dump an image without any extra data/linkage to create coherent output.
+The note here is that the extension is `jpeg_article` so it doesn't pick up {attach} or other static resources.
+"""
+
+import logging
+from datetime import datetime
+from os import makedirs, sep
+from os.path import join, dirname, isdir, splitext
+from typing import Tuple
+
+from PIL import Image
+from pelican import signals
+from pelican.readers import BaseReader
+from pelican.urlwrappers import URLWrapper, Category, Author, Tag
+
+from .constants import Exiv, PelicanConfig, PelicanMetadata, PelicanClass
+from .exiv2_parser import Exiv2Parser
+
+
+class JpegReader(BaseReader):
+    logger = logging.getLogger('JpegReader')
+    enabled = True
+    file_extensions = ('jpeg_article')
+    thumb_size = 250, 250
+
+    def __init__(self, settings):
+        super(JpegReader, self).__init__(settings)
+
+    def read(self, source_path):
+        try:
+            if Exiv2Parser.get_exiv2_version() is not None:
+                content, metadata = self.parse_jpeg(source_path=source_path)
+
+        except ValueError:      # if file can't be parsed, ignore it
+            pass
+        else:
+            return content, metadata
+
+    def parse_jpeg(self, *, source_path: str) -> Tuple[str, dict]:
+        JpegReader.logger.info(source_path)
+
+        img = Image.open(source_path)
+
+        image_data = Exiv2Parser.get_values(source_path)
+
+        title = image_data.get(Exiv.DESCRIPTION.value, 'Untitled')
+        author = image_data.get(Exiv.ARTIST.value, 'Unknown')
+        date_string = image_data.get(Exiv.DATETIME.value, '')
+
+        date = datetime.strptime(date_string, "%Y:%m:%d %H:%M:%S")
+        slug = URLWrapper(image_data.get(Exiv.HEADLINE.value, title), self.settings).slug
+        description_long = image_data.get(Exiv.COMMENT.value, '')
+        summary = image_data.get(Exiv.CAPTION.value, description_long[:140])
+
+        tags = [Tag(tag, self.settings) for tag in image_data.get(Exiv.KEYWORDS.value, list())]
+
+        content_root = self.settings[PelicanConfig.PATH.value]
+        path_output = self.settings[PelicanConfig.OUTPUT_PATH.value]
+        relative_source = dirname(source_path[len(content_root):]).lstrip(sep)
+        if self.settings[PelicanConfig.USE_FOLDER_AS_CATEGORY.value]:
+            category = relative_source.split(sep)[-1]
+        else:
+            category = image_data.get(Exiv.CATEGORY.value, None)
+
+        type_of_content = relative_source.split(sep)[0]  # either 'blog' or 'pages' as far as I know.
+        url_site = self.settings[PelicanConfig.SITE_URL.value]
+
+        if type_of_content.lower() == PelicanClass.PAGES.value:
+            url_document = self.settings[PelicanConfig.PAGE_URL.value]
+            document_save_as = self.settings[PelicanConfig.PAGE_SAVE_AS.value]
+        else:  # Assume PelicanClass.BLOG
+            url_document = self.settings[PelicanConfig.ARTICLE_URL.value]
+            document_save_as = self.settings[PelicanConfig.ARTICLE_SAVE_AS.value]
+
+        page_url_complete = join(url_site, url_document)
+
+        author_wrapper = Author(author, self.settings)
+
+        # Move image in place:
+        metadata = {PelicanMetadata.TITLE.value: title, PelicanMetadata.AUTHORS.value: [author_wrapper],
+                    PelicanMetadata.DATE.value: date, PelicanMetadata.SLUG.value: slug,
+                    PelicanMetadata.TAGS.value: tags,
+                    PelicanMetadata.CUSTOM_ALL.value: image_data}
+        if category is not None:
+            metadata[PelicanMetadata.CATEGORY.value] = Category(category, self.settings)
+
+        thumb_name = '{0}_thumb.jpg'.format(slug)
+        original_name = '{0}.jpg'.format(slug)
+
+        path_output_html = join(path_output, document_save_as).format(**metadata)
+        path_output_dir = dirname(path_output_html)
+        path_output_original = join(path_output_dir, original_name)
+        path_output_thumb = join(path_output_dir, thumb_name)
+
+        # Here we generate the summary info incase this is used for articles we get nice thumbnails and summary
+        metadata[PelicanMetadata.SUMMARY.value] = summary
+        metadata[PelicanMetadata.FEATURED_IMAGE.value] = join(url_site, path_output_thumb[len(path_output):])
+        if Exiv.OBJECT_NAME.value in image_data:
+            metadata[PelicanMetadata.TEMPLATE.value] = image_data[Exiv.OBJECT_NAME.value]
+
+        # Write the size/HTML out before we reduce the image to a thumb
+        content = "<img src='{src}' alt='{alt}' style='width: {width}px; height: auto; max-width: 100%;'></img><p>{body}</p>" \
+            .format(src=original_name, alt=title, width=img.width, height=img.height, body=description_long)
+
+        # Ensure the directory levels exist
+        if not isdir(path_output_dir):
+            makedirs(path_output_dir)
+        img.save(path_output_original)
+        img.thumbnail(self.thumb_size)
+        img.save(path_output_thumb)
+
+        # Debug info if we need it
+        JpegReader.logger.debug(content)
+        JpegReader.logger.debug(str(metadata))
+        JpegReader.logger.debug(path_output_html)
+
+        return content, metadata
+
+
+def add_reader(readers):
+    """Register ``JpegReader`` for the ``jpeg_article`` extension on *readers*."""
+    readers.reader_classes['jpeg_article'] = JpegReader
+
+
+def register():
+    """Connect ``add_reader`` to Pelican's ``readers_init`` signal."""
+    signals.readers_init.connect(add_reader)

+ 38 - 0
jpeg_reader/test_exiv2_parser.py

@@ -0,0 +1,38 @@
+import subprocess
+
+from .exiv2_parser import Exiv2Parser
+
+
+class MockPopen(object):
+    """Stand-in for ``subprocess.Popen`` that simulates a missing exiv2.
+
+    Accepts the same call shape the parser uses (a command list plus a
+    keyword-only ``stdout``) and ignores both.
+    """
+    def __init__(self, cmd, *, stdout):
+        pass
+
+    def communicate(self):
+        """Return a ``(stdout, stderr)`` pair holding a shell-style error message."""
+        return b'bash: command not found: exiv2', b''
+
+
+class MockPopenSuccess(MockPopen):
+    def __init__(self, cmd, *, stdout):
+        MockPopen.__init__(self, cmd, stdout=stdout)
+
+    def communicate(self):
+        """Mock communicate method of Popen"""
+        return b'exiv2 0.26 001a00 (64 bit build)', b''
+
+
+def test_get_version_fail(mocker, monkeypatch):
+    monkeypatch.setattr(subprocess, 'Popen', MockPopen)
+    version_info = Exiv2Parser.get_exiv2_version()
+    assert version_info is None
+
+
+def test_get_version_success(mocker, monkeypatch):
+    monkeypatch.setattr(subprocess, 'Popen', MockPopenSuccess)
+    version, commit = Exiv2Parser.get_exiv2_version()
+    assert version == '0.26'
+    assert commit == '001a00'
+
+
+

+ 14 - 0
jpeg_reader/util.py

@@ -0,0 +1,14 @@
+def to_str(bytes_or_str):
+    if isinstance(bytes_or_str, bytes):
+        value = bytes_or_str.decode('utf-8')
+    else:
+        value = bytes_or_str
+    return value  # Instance of Str
+
+
+def to_bytes(bytes_or_str):
+    if isinstance(bytes_or_str, str):
+        value = bytes_or_str.encode('utf-8')
+    else:
+        value = bytes_or_str
+    return value  # Instance of Bytes