Cache S3 key metadata to avoid unnecessary calls, especially with collectstatic #1367
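S3Storage currently issues a separate head_object request for every exists(), size() and get_modified_time() call, and collectstatic repeats those calls for each file it processes. This change caches the two fields those methods need (ContentLength and LastModified) in a small per-instance LRU cache and evicts the entry whenever the key is opened, saved or deleted. A rough sketch of the request pattern this is meant to produce (bucket and key names are illustrative, not from the diff):

from django.core.files.base import ContentFile
from storages.backends.s3 import S3Storage

storage = S3Storage(bucket_name="example-bucket")  # illustrative bucket name

storage.exists("static/app.css")             # one head_object call; metadata cached
storage.size("static/app.css")               # answered from the cache, no S3 request
storage.get_modified_time("static/app.css")  # answered from the cache, no S3 request

storage.save("static/app.css", ContentFile(b"body {}"))  # evicts the cached entry
storage.size("static/app.css")               # cache miss; one fresh head_object call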

Open · wants to merge 2 commits into master
87 changes: 70 additions & 17 deletions storages/backends/s3.py
@@ -3,13 +3,15 @@
import posixpath
import tempfile
import threading
+import typing
import warnings
from datetime import datetime
from datetime import timedelta
from urllib.parse import parse_qsl
from urllib.parse import urlencode
from urllib.parse import urlsplit

+from cachetools import LRUCache
from django.contrib.staticfiles.storage import ManifestFilesMixin
from django.core.exceptions import ImproperlyConfigured
from django.core.exceptions import SuspiciousOperation
@@ -98,6 +100,12 @@ def _filter_download_params(params):
    }


+# Use a named tuple to make it clear what the cached values are
+class _S3Meta(typing.NamedTuple):
+    content_length: int
+    last_modified: datetime
+
+
@deconstructible
class S3File(CompressedFileMixin, File):
    """
@@ -474,6 +482,8 @@ def _normalize_name(self, name):

    def _open(self, name, mode="rb"):
        name = self._normalize_name(clean_name(name))
+        # Evict cached metadata since the file may be opened for writing
+        self._evict_from_cache(name)
        try:
            f = S3File(name, mode, self)
        except ClientError as err:
@@ -485,6 +495,8 @@ def _open(self, name, mode="rb"):
    def _save(self, name, content):
        cleaned_name = clean_name(name)
        name = self._normalize_name(cleaned_name)
+        # Remove cached metadata since writing will alter it
+        self._evict_from_cache(name)
        params = self._get_write_parameters(name, content)

        if is_seekable(content):
@@ -516,6 +528,8 @@ def _save(self, name, content):
    def delete(self, name):
        try:
            name = self._normalize_name(clean_name(name))
+            # Remove cached metadata since the key will be deleted
+            self._evict_from_cache(name)
            self.bucket.Object(name).delete()
        except ClientError as err:
            if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
@@ -527,15 +541,11 @@ def delete(self, name):

    def exists(self, name):
        name = self._normalize_name(clean_name(name))
-        try:
-            self.connection.meta.client.head_object(Bucket=self.bucket_name, Key=name)
+        key_meta = self._get_from_cache(name)  # type: _S3Meta
+        if key_meta:
            return True
-        except ClientError as err:
-            if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
-                return False
-
-            # Some other error was encountered. Re-raise it.
-            raise
+        key_meta = self._fetch_to_cache(name)
+        return key_meta is not None

    def listdir(self, name):
        path = self._normalize_name(clean_name(name))
@@ -560,12 +570,14 @@ def listdir(self, name):

    def size(self, name):
        name = self._normalize_name(clean_name(name))
-        try:
-            return self.bucket.Object(name).content_length
-        except ClientError as err:
-            if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
-                raise FileNotFoundError("File does not exist: %s" % name)
-            raise  # Let it bubble up if it was some other error
+        key_meta = self._get_from_cache(name)  # type: _S3Meta
+        if key_meta:
+            # return the size from the cache
+            return key_meta.content_length
+        key_meta = self._fetch_to_cache(name)
+        if key_meta:
+            return key_meta.content_length
+        raise FileNotFoundError("File does not exist: %s" % name)

    def _get_write_parameters(self, name, content=None):
        params = self.get_object_parameters(name)
@@ -601,12 +613,20 @@ def get_modified_time(self, name):
        USE_TZ is True, otherwise returns a naive datetime in the local timezone.
        """
        name = self._normalize_name(clean_name(name))
-        entry = self.bucket.Object(name)
+        key_meta = self._get_from_cache(name)  # type: _S3Meta
+        if key_meta:
+            # use the cached last modified time
+            last_modified = key_meta.last_modified
+        else:
+            key_meta = self._fetch_to_cache(name)
+            if not key_meta:
+                raise FileNotFoundError("File does not exist: %s" % name)
+            last_modified = key_meta.last_modified
        if setting("USE_TZ"):
            # boto3 returns TZ aware timestamps
-            return entry.last_modified
+            return last_modified
        else:
-            return make_naive(entry.last_modified)
+            return make_naive(last_modified)

    def _strip_signing_parameters(self, url):
        # Boto3 does not currently support generating URLs that are unsigned. Instead
@@ -678,6 +698,39 @@ def get_available_name(self, name, max_length=None):
            return get_available_overwrite_name(name, max_length)
        return super().get_available_name(name, max_length)

+    @property
+    def _cache(self):
+        # Note: a literal "__cache" attribute would never match a name-mangled
+        # self.__cache assignment, so use an unmangled attribute name here.
+        cache = getattr(self, "_s3_meta_cache", None)
+        if cache is None:
+            # Even a small cache can be very helpful in reducing the number of
+            # requests to S3 and speeding up commands such as collectstatic
+            cache = self._s3_meta_cache = LRUCache(maxsize=32)
+        return cache
+
+    def _add_to_cache(self, name, value):
+        content_length = value["ContentLength"]
+        last_modified = value["LastModified"]
+        self._cache[name] = _S3Meta(content_length, last_modified)
+
+    def _evict_from_cache(self, name):
+        self._cache.pop(name, None)
+
+    def _get_from_cache(self, name):
+        return self._cache.get(name, None)
+
+    def _fetch_to_cache(self, name):
+        try:
+            key_meta = self.connection.meta.client.head_object(
+                Bucket=self.bucket_name, Key=name
+            )
+            self._add_to_cache(name, key_meta)
+            return self._get_from_cache(name)
+        except ClientError as err:
+            if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
+                return None
+            raise


class S3StaticStorage(S3Storage):
    """Querystring auth must be disabled so that url() returns a consistent output."""
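Before the test changes, a note on the bound chosen above: cachetools.LRUCache is a size-capped mapping that discards the least recently used entry once maxsize is exceeded, so maxsize=32 keeps memory flat during long collectstatic runs while the temporally clustered lookups for each file still hit the cache. A standalone sketch of that eviction behaviour (keys and values are illustrative):

from cachetools import LRUCache

cache = LRUCache(maxsize=2)
cache["a.txt"] = "meta-a"
cache["b.txt"] = "meta-b"
cache["a.txt"]             # a lookup marks "a.txt" as most recently used
cache["c.txt"] = "meta-c"  # over capacity: "b.txt", the LRU entry, is dropped
assert "a.txt" in cache and "c.txt" in cache and "b.txt" not in cache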
75 changes: 69 additions & 6 deletions tests/test_s3.py
@@ -642,14 +642,16 @@ def test_storage_listdir_empty(self):
        self.assertEqual(files, [])

    def test_storage_size(self):
-        obj = self.storage.bucket.Object.return_value
-        obj.content_length = 4098
+        self.storage.connection.meta.client.head_object.return_value = {
+            "ContentLength": 4098,
+            "LastModified": datetime.datetime.now(datetime.timezone.utc),
+        }

        name = "file.txt"
-        self.assertEqual(self.storage.size(name), obj.content_length)
+        self.assertEqual(self.storage.size(name), 4098)

    def test_storage_size_not_exists(self):
-        self.storage.bucket.Object.side_effect = ClientError(
+        self.storage.connection.meta.client.head_object.side_effect = ClientError(
            {"Error": {}, "ResponseMetadata": {"HTTPStatusCode": 404}},
            "HeadObject",
        )
@@ -666,8 +668,10 @@ def test_storage_mtime(self):
            self._test_storage_mtime(use_tz)

    def _test_storage_mtime(self, use_tz):
-        obj = self.storage.bucket.Object.return_value
-        obj.last_modified = datetime.datetime.now(datetime.timezone.utc)
+        self.storage.connection.meta.client.head_object.return_value = {
+            "ContentLength": 4098,
+            "LastModified": datetime.datetime.now(datetime.timezone.utc),
+        }

        name = "file.txt"
        self.assertIs(
@@ -982,6 +986,65 @@ def test_auth_config(self):
access_key="foo", secret_key="boo", session_profile="moo"
)

def test_open_should_evict_from_cache(self):
self._setup_test_data_to_cache()
self.assertTrue(self.storage.exists("file.txt"))
self.assertIsNotNone(self.storage._get_from_cache("file.txt"))
with self.storage.open("file.txt") as _:
self.assertIsNone(self.storage._get_from_cache("file.txt"))
self.assertIsNone(self.storage._get_from_cache("file.txt"))

def test_save_should_evict_from_cache(self):
self._setup_test_data_to_cache()
self.assertTrue(self.storage.exists("file.txt"))
self.assertIsNotNone(self.storage._get_from_cache("file.txt"))
self.storage.save("file.txt", ContentFile(b"abc"))
self.assertIsNone(self.storage._get_from_cache("file.txt"))

def test_delete_should_evict_from_cache(self):
self._setup_test_data_to_cache()
self.assertTrue(self.storage.exists("file.txt"))
self.assertIsNotNone(self.storage._get_from_cache("file.txt"))
self.storage.delete("file.txt")
self.assertIsNone(self.storage._get_from_cache("file.txt"))

def test_exists_should_use_cached_value(self):
self._setup_test_data_to_cache()
self.assertTrue(self.storage.exists("file.txt"))
self.assertTrue(self.storage.exists("file.txt"))

def test_size_should_use_cached_value(self):
self._setup_test_data_to_cache()
self.assertEqual(self.storage.size("file.txt"), 123)
self.assertEqual(self.storage.size("file.txt"), 123)

def test_get_modified_time_should_use_cached_value(self):
self._setup_test_data_to_cache()
modified_at = datetime.datetime(2021, 1, 1, 0, 0, 0)
self.assertEqual(self.storage.get_modified_time("file.txt"), modified_at)
self.assertEqual(self.storage.get_modified_time("file.txt"), modified_at)

def test_get_modified_time_not_exists(self):
self.storage.connection.meta.client.head_object.side_effect = ClientError(
{"Error": {}, "ResponseMetadata": {"HTTPStatusCode": 404}},
"HeadObject",
)
with self.assertRaisesMessage(
FileNotFoundError, "File does not exist: file.txt"
):
self.storage.get_modified_time("file.txt")

def _setup_test_data_to_cache(self):
mock_connection = mock.MagicMock()
self.storage._connections.connection = mock_connection
mock_connection.meta.client.head_object.side_effect = [
{
"ContentLength": 123,
"LastModified": datetime.datetime(2021, 1, 1, 0, 0, 0),
},
Exception("Should not be called"),
]


class S3StaticStorageTests(TestCase):
    def setUp(self):
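A closing note on the test harness: the _setup_test_data_to_cache helper relies on mock's side_effect list, so the first head_object call returns metadata and any further call raises, which fails the test loudly if a code path bypasses the cache. A minimal standalone version of that guard pattern (values are illustrative):

from unittest import mock

client = mock.MagicMock()
client.head_object.side_effect = [
    {"ContentLength": 123},             # first call returns usable metadata
    Exception("Should not be called"),  # any second call would fail the test
]

assert client.head_object()["ContentLength"] == 123  # first call succeeds
try:
    client.head_object()  # second call raises the guard exception
except Exception as exc:
    assert str(exc) == "Should not be called"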