Browse Source

Merge pull request #226 from pmclanahan/deterministic-gzip

Update gzip_cache to produce consistent .gz files.
Alexis Metaireau 10 years ago
parent
commit
f1805be031
2 changed files with 33 additions and 7 deletions
  1. 12 7
      gzip_cache/gzip_cache.py
  2. 21 0
      gzip_cache/test_gzip_cache.py

+ 12 - 7
gzip_cache/gzip_cache.py

@@ -39,6 +39,7 @@ EXCLUDE_TYPES = [
     '.mp4',
 ]
 
+
 def create_gzip_cache(pelican):
     '''Create a gzip cache file for every file that a webserver would
     reasonably want to cache (e.g., text type files).
@@ -51,6 +52,7 @@ def create_gzip_cache(pelican):
                 filepath = os.path.join(dirpath, name)
                 create_gzip_file(filepath)
 
+
 def should_compress(filename):
     '''Check if the filename is a type of file that should be compressed.
 
@@ -62,6 +64,7 @@ def should_compress(filename):
 
     return True
 
+
 def create_gzip_file(filepath):
     '''Create a gzipped file in the same directory with a filepath.gz name.
 
@@ -70,14 +73,16 @@ def create_gzip_file(filepath):
     compressed_path = filepath + '.gz'
 
     with open(filepath, 'rb') as uncompressed:
-        try:
+        # Explicitly set mtime to 0 so gzip content is fully determined
+        # by file content (0 = "no timestamp" according to gzip spec)
+        with gzip.GzipFile(compressed_path, 'wb',
+                           compresslevel=9, mtime=0) as compressed:
             logger.debug('Compressing: %s' % filepath)
-            compressed = gzip.open(compressed_path, 'wb')
-            compressed.writelines(uncompressed)
-        except Exception as ex:
-            logger.critical('Gzip compression failed: %s' % ex)
-        finally:
-            compressed.close()
+            try:
+                compressed.writelines(uncompressed)
+            except Exception as ex:
+                logger.critical('Gzip compression failed: %s' % ex)
+
 
 def register():
     signals.finalized.connect(create_gzip_cache)

+ 21 - 0
gzip_cache/test_gzip_cache.py

@@ -4,10 +4,12 @@
 import os
 import tempfile
 import unittest
+import time
 
 from contextlib import contextmanager
 from tempfile import mkdtemp
 from shutil import rmtree
+from hashlib import md5
 
 import gzip_cache
 
@@ -52,3 +54,22 @@ class TestGzipCache(unittest.TestCase):
             gzip_cache.create_gzip_file(a_html_filename)
             self.assertTrue(os.path.exists(a_html_filename + '.gz'))
 
+    def test_creates_same_gzip_file(self):
+        # Should create the same gzip file from the same contents.
+
+        # By default gzip stores a timestamp in the compressed file, so the
+        # same input compressed at different times yields different bytes.
+        # This can cause problems for some caching strategies.
+        with temporary_folder() as tempdir:
+            _, a_html_filename = tempfile.mkstemp(suffix='.html', dir=tempdir)
+            a_gz_filename = a_html_filename + '.gz'
+            gzip_cache.create_gzip_file(a_html_filename)
+            gzip_hash = get_md5(a_gz_filename)
+            time.sleep(1)
+            gzip_cache.create_gzip_file(a_html_filename)
+            self.assertEqual(gzip_hash, get_md5(a_gz_filename))
+
+
+def get_md5(filepath):
+    with open(filepath, 'rb') as fh:
+        return md5(fh.read()).hexdigest()