Skip to content

Commit

Permalink
Change how the key ring cache is updated (#54675)
Browse files Browse the repository at this point in the history
* Change how the key ring cache is updated

I wanted to address two concerns:

1. If there is a cached key ring, but it is expired, the first thread to discover this will _synchronously_ refresh the cache - other threads will continue to use the old value.
2. If a key ring refresh is forced, it will always hit the backing repository, even if several threads want a refresh at the same time.

With this change, _all_ key ring updates are computed on a thread-pool thread and callers block exactly when there is no cached key ring for them to fall back on (first run, for example) or if they are forcing a refresh (an in-flight refresh is considered satisfactory).

Moving this work to a background thread will give us more room to be generous with retries when (e.g.) Azure KeyVault is unreachable or a file is locked.

The old behavior can be restored using the appcontext switch `Microsoft.AspNetCore.DataProtection.KeyManagement.DisableAsyncKeyRingUpdate`.  This is a safety valve and not part of configuration - we'll remove the switch and the old code path in the next release if nothing blows up.

Micro-benchmarking (on 9.0 on x64) shows that the new version has comparable performance in the common case of finding an unexpired key ring in the cache.
  • Loading branch information
amcasey committed Apr 18, 2024
1 parent 8321d83 commit c7aae8f
Show file tree
Hide file tree
Showing 4 changed files with 310 additions and 44 deletions.
5 changes: 5 additions & 0 deletions src/DataProtection/DataProtection/src/Error.cs
Original file line number Diff line number Diff line change
Expand Up @@ -97,4 +97,9 @@ public static InvalidOperationException KeyRingProvider_DefaultKeyRevoked(Guid i
var message = string.Format(CultureInfo.CurrentCulture, Resources.KeyRingProvider_DefaultKeyRevoked, id);
return new InvalidOperationException(message);
}

public static InvalidOperationException KeyRingProvider_RefreshFailedOnOtherThread()
{
return new InvalidOperationException(Resources.KeyRingProvider_RefreshFailedOnOtherThread);
}
}
163 changes: 163 additions & 0 deletions src/DataProtection/DataProtection/src/KeyManagement/KeyRingProvider.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Cryptography;
using Microsoft.AspNetCore.DataProtection.KeyManagement.Internal;
using Microsoft.Extensions.Logging;
Expand All @@ -16,13 +17,17 @@ namespace Microsoft.AspNetCore.DataProtection.KeyManagement;

internal sealed class KeyRingProvider : ICacheableKeyRingProvider, IKeyRingProvider
{
private const string DisableAsyncKeyRingUpdateSwitchKey = "Microsoft.AspNetCore.DataProtection.KeyManagement.DisableAsyncKeyRingUpdate";

private CacheableKeyRing? _cacheableKeyRing;
private readonly object _cacheableKeyRingLockObj = new object();
private Task<CacheableKeyRing>? _cacheableKeyRingTask; // Also covered by _cacheableKeyRingLockObj
private readonly IDefaultKeyResolver _defaultKeyResolver;
private readonly bool _autoGenerateKeys;
private readonly TimeSpan _newKeyLifetime;
private readonly IKeyManager _keyManager;
private readonly ILogger _logger;
private readonly bool _disableAsyncKeyRingUpdate;

public KeyRingProvider(
IKeyManager keyManager,
Expand Down Expand Up @@ -52,6 +57,8 @@ internal sealed class KeyRingProvider : ICacheableKeyRingProvider, IKeyRingProvi

// We will automatically refresh any unknown keys for 2 minutes see https://github.com/dotnet/aspnetcore/issues/3975
AutoRefreshWindowEnd = DateTime.UtcNow.AddMinutes(2);

AppContext.TryGetSwitch(DisableAsyncKeyRingUpdateSwitchKey, out _disableAsyncKeyRingUpdate);
}

// for testing
Expand Down Expand Up @@ -195,6 +202,19 @@ internal IKeyRing RefreshCurrentKeyRing()

internal IKeyRing GetCurrentKeyRingCore(DateTime utcNow, bool forceRefresh = false)
{
// We're making a big, scary change to the way this cache is updated: now threads
// only block during computation of the new value if no old value is available
// (or if they force it). We'll leave the old code in place, behind an appcontext
// switch in case it turns out to have unwelcome emergent behavior.
// TODO: Delete one of these codepaths in 10.0.
return _disableAsyncKeyRingUpdate
? GetCurrentKeyRingCoreOld(utcNow, forceRefresh)
: GetCurrentKeyRingCoreNew(utcNow, forceRefresh);
}

private IKeyRing GetCurrentKeyRingCoreOld(DateTime utcNow, bool forceRefresh)
{
// DateTimes are only meaningfully comparable if they share the same Kind - require Utc for consistency
Debug.Assert(utcNow.Kind == DateTimeKind.Utc);

// Can we return the cached keyring to the caller?
Expand Down Expand Up @@ -292,6 +312,149 @@ internal IKeyRing GetCurrentKeyRingCore(DateTime utcNow, bool forceRefresh = fal
}
}

private IKeyRing GetCurrentKeyRingCoreNew(DateTime utcNow, bool forceRefresh)
{
// DateTimes are only meaningfully comparable if they share the same Kind - require Utc for consistency
Debug.Assert(utcNow.Kind == DateTimeKind.Utc);

// The 99% and perf-critical case is that there is no task in-flight and the cached
// key ring is valid. We do what we can to avoid unnecessary overhead (locking,
// context switching, etc) on this path.

CacheableKeyRing? existingCacheableKeyRing = null;

// Can we return the cached keyring to the caller?
if (!forceRefresh)
{
existingCacheableKeyRing = Volatile.Read(ref _cacheableKeyRing);
if (CacheableKeyRing.IsValid(existingCacheableKeyRing, utcNow))
{
return existingCacheableKeyRing.KeyRing;
}
}

// If work kicked off by a previous caller has completed, we should use those results
// We check this outside the lock to reduce contention in the common case (no task).
// Logically, it would probably make more sense to check this before checking whether
// the cache is valid - there could be a newer value available - but keeping that path
// fast is more important. The next forced refresh or cache expiration will cause the
// new value to be picked up.
var existingTask = Volatile.Read(ref _cacheableKeyRingTask);
if (existingTask is not null && existingTask.IsCompleted)
{
var taskKeyRing = GetKeyRingFromCompletedTask(existingTask, utcNow); // Throws if the task failed
if (taskKeyRing is not null)
{
return taskKeyRing;
}
}

// The cached keyring hasn't been created or must be refreshed. We'll allow one thread to
// create a task to update the keyring, and all threads will continue to use the existing cached
// keyring while the first thread performs the update. There is an exception: if there
// is no usable existing cached keyring, all callers must block until the keyring exists.
lock (_cacheableKeyRingLockObj)
{
// Update existingTask, in case we're not the first to acquire the lock
existingTask = Volatile.Read(ref _cacheableKeyRingTask);
if (existingTask is null)
{
// If there's no existing task, make one now
// PERF: Closing over utcNow substantially slows down the fast case (valid cache) in micro-benchmarks
// (closing over `this` for CacheableKeyRingProvider doesn't seem impactful)
existingTask = Task.Factory.StartNew(
utcNow => CacheableKeyRingProvider.GetCacheableKeyRing((DateTime)utcNow!),
utcNow,
CancellationToken.None, // GetKeyRingFromCompletedTask will need to react if this becomes cancellable
TaskCreationOptions.DenyChildAttach,
TaskScheduler.Default);
Volatile.Write(ref _cacheableKeyRingTask, existingTask);
}
}

if (existingCacheableKeyRing is not null)
{
Debug.Assert(!forceRefresh, "Read cached key ring even though forceRefresh is true");
Debug.Assert(!CacheableKeyRing.IsValid(existingCacheableKeyRing, utcNow), "Should already have returned a valid cached key ring");
return existingCacheableKeyRing.KeyRing;
}

// Since there's no cached key ring we can return, we have to wait. It's not ideal to wait for a task we
// just scheduled, but it makes the code a lot simpler (compared to having a separate, synchronous code path).
// Cleverness: swallow any exceptions - they'll be surfaced by GetKeyRingFromCompletedTask, if appropriate.
existingTask
.ContinueWith(static _ => { }, TaskScheduler.Default)
.Wait();

var newKeyRing = GetKeyRingFromCompletedTask(existingTask, utcNow); // Throws if the task failed (winning thread only)
if (newKeyRing is null)
{
// Another thread won - check whether it cached a new key ring
var newCacheableKeyRing = Volatile.Read(ref _cacheableKeyRing);
if (newCacheableKeyRing is null)
{
// There will have been a better exception from the winning thread
throw Error.KeyRingProvider_RefreshFailedOnOtherThread();
}

newKeyRing = newCacheableKeyRing.KeyRing;
}

return newKeyRing;
}

/// <summary>
/// Returns null if another thread already processed the completed task.
/// Otherwise, if the given completed task completed successfully, clears the task
/// and either caches and returns the resulting key ring or throws, according to the
/// successfulness of the task.
/// </summary>
private IKeyRing? GetKeyRingFromCompletedTask(Task<CacheableKeyRing> task, DateTime utcNow)
{
Debug.Assert(task.IsCompleted);

lock (_cacheableKeyRingLockObj)
{
// If the parameter doesn't match the field, another thread has already consumed the task (and it's reflected in _cacheableKeyRing)
if (!ReferenceEquals(task, Volatile.Read(ref _cacheableKeyRingTask)))
{
return null;
}

Volatile.Write(ref _cacheableKeyRingTask, null);

if (task.Status == TaskStatus.RanToCompletion)
{
var newCacheableKeyRing = task.Result;
Volatile.Write(ref _cacheableKeyRing, newCacheableKeyRing);

// An unconsumed task result is considered to satisfy forceRefresh. One could quibble that this isn't really
// a forced refresh, but we'll still return a key ring newer than the one the caller was dissatisfied with.
return newCacheableKeyRing.KeyRing;
}

Debug.Assert(!task.IsCanceled, "How did a task with no cancellation token get canceled?");
Debug.Assert(task.Exception is not null, "Task should have either completed successfully or with an exception");
var exception = task.Exception;

var existingCacheableKeyRing = Volatile.Read(ref _cacheableKeyRing);
if (existingCacheableKeyRing is not null && !CacheableKeyRing.IsValid(existingCacheableKeyRing, utcNow))
{
// If reading failed, we probably don't want to try again for a little bit, so slightly extend the
// lifetime of the current cache entry
Volatile.Write(ref _cacheableKeyRing, existingCacheableKeyRing.WithTemporaryExtendedLifetime(utcNow));

_logger.ErrorOccurredWhileRefreshingKeyRing(exception); // This one mentions the no-retry window
}
else
{
_logger.ErrorOccurredWhileReadingKeyRing(exception);
}

throw exception.InnerExceptions.Count == 1 ? exception.InnerExceptions[0] : exception;
}
}

private static TimeSpan GetRefreshPeriodWithJitter(TimeSpan refreshPeriod)
{
// We'll fudge the refresh period up to -20% so that multiple applications don't try to
Expand Down
3 changes: 3 additions & 0 deletions src/DataProtection/DataProtection/src/Resources.resx
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,9 @@
<data name="KeyRingProvider_DefaultKeyRevoked" xml:space="preserve">
<value>The key ring's default data protection key {0:B} has been revoked.</value>
</data>
<data name="KeyRingProvider_RefreshFailedOnOtherThread" xml:space="preserve">
<value>Another thread failed to refresh the key ring.</value>
</data>
<data name="LifetimeMustNotBeNegative" xml:space="preserve">
<value>{0} must not be negative.</value>
</data>
Expand Down

0 comments on commit c7aae8f

Please sign in to comment.