소스 검색

Update gzip_cache to produce consistent .gz files.

The gzip file format stores a modification timestamp in the
file header, and Python's gzip module sets it to the current
time by default. This causes the contents of the file to
be slightly different every time it is generated, causing
problems for some long-term caching strategies.

This patch sets the timestamp to 0 (which the gzip spec defines
as "no timestamp"), allowing gzip to produce the same file
from the same source every time.
Paul McLanahan 10 년 전
부모
커밋
dda2e8c19b
2개의 변경된 파일, 33개의 추가 그리고 7개의 삭제
  1. 12 7
      gzip_cache/gzip_cache.py
  2. 21 0
      gzip_cache/test_gzip_cache.py

+ 12 - 7
gzip_cache/gzip_cache.py

@@ -39,6 +39,7 @@ EXCLUDE_TYPES = [
     '.mp4',
 ]
 
+
 def create_gzip_cache(pelican):
     '''Create a gzip cache file for every file that a webserver would
     reasonably want to cache (e.g., text type files).
@@ -51,6 +52,7 @@ def create_gzip_cache(pelican):
                 filepath = os.path.join(dirpath, name)
                 create_gzip_file(filepath)
 
+
 def should_compress(filename):
     '''Check if the filename is a type of file that should be compressed.
 
@@ -62,6 +64,7 @@ def should_compress(filename):
 
     return True
 
+
 def create_gzip_file(filepath):
     '''Create a gzipped file in the same directory with a filepath.gz name.
 
@@ -70,14 +73,16 @@ def create_gzip_file(filepath):
     compressed_path = filepath + '.gz'
 
     with open(filepath, 'rb') as uncompressed:
-        try:
+        # Explicitly set mtime to 0 so gzip content is fully determined
+        # by file content (0 = "no timestamp" according to gzip spec)
+        with gzip.GzipFile(compressed_path, 'wb',
+                           compresslevel=9, mtime=0) as compressed:
             logger.debug('Compressing: %s' % filepath)
-            compressed = gzip.open(compressed_path, 'wb')
-            compressed.writelines(uncompressed)
-        except Exception as ex:
-            logger.critical('Gzip compression failed: %s' % ex)
-        finally:
-            compressed.close()
+            try:
+                compressed.writelines(uncompressed)
+            except Exception as ex:
+                logger.critical('Gzip compression failed: %s' % ex)
+
 
 def register():
     signals.finalized.connect(create_gzip_cache)

+ 21 - 0
gzip_cache/test_gzip_cache.py

@@ -4,10 +4,12 @@
 import os
 import tempfile
 import unittest
+import time
 
 from contextlib import contextmanager
 from tempfile import mkdtemp
 from shutil import rmtree
+from hashlib import md5
 
 import gzip_cache
 
@@ -52,3 +54,22 @@ class TestGzipCache(unittest.TestCase):
             gzip_cache.create_gzip_file(a_html_filename)
             self.assertTrue(os.path.exists(a_html_filename + '.gz'))
 
+    def test_creates_same_gzip_file(self):
+        # Should create the same gzip file from the same contents.
+
+        # gzip will create a slightly different file because it includes
+        # a timestamp in the compressed file by default. This can cause
+        # problems for some caching strategies.
+        with temporary_folder() as tempdir:
+            _, a_html_filename = tempfile.mkstemp(suffix='.html', dir=tempdir)
+            a_gz_filename = a_html_filename + '.gz'
+            gzip_cache.create_gzip_file(a_html_filename)
+            gzip_hash = get_md5(a_gz_filename)
+            time.sleep(1)
+            gzip_cache.create_gzip_file(a_html_filename)
+            self.assertEqual(gzip_hash, get_md5(a_gz_filename))
+
+
+def get_md5(filepath):
+    with open(filepath, 'rb') as fh:
+        return md5(fh.read()).hexdigest()