Calculate MD5 for each S3 multipart upload incrementally #672

Open · wants to merge 1 commit into master
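This PR has no description, but the diff below is self-explanatory: each part of an S3 multipart upload gets an MD5 that is accumulated incrementally as content is written, then sent as the base64-encoded `ContentMD5` of the part upload so S3 can verify the bytes it received. A minimal standalone sketch of the hashing idea (the byte strings are illustrative, not from the PR):

```python
import base64
import hashlib

# Feeding chunks to a running hashlib.md5() yields the same digest as
# hashing the concatenated bytes, so a part's MD5 can be accumulated
# while the buffer is written instead of re-reading the buffer later.
incremental = hashlib.md5()
for chunk in (b'first write', b'second write'):
    incremental.update(chunk)

assert incremental.digest() == hashlib.md5(b'first writesecond write').digest()

# S3 expects the Content-MD5 header as the base64-encoded 128-bit digest.
print(base64.b64encode(incremental.digest()).decode('utf-8'))
```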
1 change: 1 addition & 0 deletions AUTHORS
@@ -36,6 +36,7 @@ By order of apparition, thanks:
 * Alex Watt (Google Cloud Storage patch)
 * Jumpei Yoshimura (S3 docs)
 * Jon Dufresne
+* Michal Charemza
13 changes: 11 additions & 2 deletions storages/backends/s3boto3.py
@@ -1,3 +1,5 @@
+import base64
+import hashlib
 import io
 import mimetypes
 import os
@@ -72,6 +74,7 @@ def __init__(self, name, mode, storage, buffer_size=None):
         self.obj.load()
         self._is_dirty = False
         self._file = None
+        self._file_md5 = hashlib.md5()
         self._multipart = None
         # 5 MB is the minimum part size (if there is more than one part).
         # Amazon allows up to 10,000 parts. The default supports uploads
@@ -127,7 +130,9 @@ def write(self, content):
             self._multipart = self.obj.initiate_multipart_upload(**parameters)
         if self.buffer_size <= self._buffer_file_size:
             self._flush_write_buffer()
-        return super(S3Boto3StorageFile, self).write(force_bytes(content))
+        content_bytes = force_bytes(content)
+        self._file_md5.update(content_bytes)
+        return super(S3Boto3StorageFile, self).write(content_bytes)

     @property
     def _buffer_file_size(self):
@@ -145,9 +150,13 @@ def _flush_write_buffer(self):
             self._write_counter += 1
             self.file.seek(0)
             part = self._multipart.Part(self._write_counter)
-            part.upload(Body=self.file.read())
+            part.upload(
+                Body=self.file.read(),
+                ContentMD5=base64.b64encode(self._file_md5.digest()).decode('utf-8')
+            )
             self.file.seek(0)
             self.file.truncate()
+            self._file_md5 = hashlib.md5()

     def close(self):
         if self._is_dirty:
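For context, the same per-part integrity check can be exercised directly against boto3's low-level client, whose `upload_part` also accepts `ContentMD5` (the PR itself goes through the resource API's `MultipartUploadPart.upload`). A hedged sketch; `my-bucket` and `my-key` are placeholders and require configured AWS credentials and an existing bucket:

```python
import base64
import hashlib

import boto3

s3 = boto3.client('s3')
upload = s3.create_multipart_upload(Bucket='my-bucket', Key='my-key')

# Parts other than the last must be at least 5 MB.
body = b'x' * (5 * 1024 * 1024)
part = s3.upload_part(
    Bucket='my-bucket',
    Key='my-key',
    UploadId=upload['UploadId'],
    PartNumber=1,
    Body=body,
    # S3 rejects the part with BadDigest if the body does not match this MD5.
    ContentMD5=base64.b64encode(hashlib.md5(body).digest()).decode('utf-8'),
)

s3.complete_multipart_upload(
    Bucket='my-bucket',
    Key='my-key',
    UploadId=upload['UploadId'],
    MultipartUpload={'Parts': [{'ETag': part['ETag'], 'PartNumber': 1}]},
)
```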
15 changes: 14 additions & 1 deletion tests/test_s3boto3.py
@@ -1,7 +1,9 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals

+import base64
 import gzip
+import hashlib
 import pickle
 import threading
 import warnings
@@ -269,7 +271,9 @@ def test_storage_open_write(self):
         file.close()
         multipart.Part.assert_called_with(1)
         part = multipart.Part.return_value
-        part.upload.assert_called_with(Body=content.encode('utf-8'))
+        content_bytes = content.encode('utf-8')
+        content_md5 = base64.b64encode(hashlib.md5(content_bytes).digest()).decode('utf-8')
+        part.upload.assert_called_with(Body=content_bytes, ContentMD5=content_md5)
         multipart.complete.assert_called_once_with(
             MultipartUpload={'Parts': [{'ETag': '123', 'PartNumber': 1}]})

@@ -328,6 +332,15 @@ def test_storage_write_beyond_buffer_size(self):
             for args_list in part.upload.call_args_list)
         )
         self.assertEqual(uploaded_content, written_content)
+        uploaded_md5s = [
+            args_list[1]['ContentMD5']
+            for args_list in part.upload.call_args_list
+        ]
+        correct_md5s = [
+            base64.b64encode(hashlib.md5(args_list[1]['Body']).digest()).decode('utf-8')
+            for args_list in part.upload.call_args_list
+        ]
+        self.assertListEqual(uploaded_md5s, correct_md5s)
         multipart.complete.assert_called_once_with(
             MultipartUpload={'Parts': [
                 {'ETag': '123', 'PartNumber': 1},
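The assertion pattern in the multi-part test reduces to a small self-contained sketch using `unittest.mock`; `part` below is a stand-in mock rather than the fixture from the real test module:

```python
import base64
import hashlib
from unittest import mock

# Record a couple of upload calls, each carrying a ContentMD5 that matches
# its Body, then verify the pairing the same way the test above does.
part = mock.MagicMock()
for body in (b'part one', b'part two'):
    content_md5 = base64.b64encode(hashlib.md5(body).digest()).decode('utf-8')
    part.upload(Body=body, ContentMD5=content_md5)

uploaded_md5s = [call[1]['ContentMD5'] for call in part.upload.call_args_list]
correct_md5s = [
    base64.b64encode(hashlib.md5(call[1]['Body']).digest()).decode('utf-8')
    for call in part.upload.call_args_list
]
assert uploaded_md5s == correct_md5s
```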