From da8f7427c27f0d26092d0536d1343421250fc4cf Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 29 Sep 2022 15:46:03 +0100 Subject: [PATCH] Fix S3 query canonicalization (#2800) (#2801) * Fix S3 query canonicalization (#2800) * Disable listing with spaces on azurite and localstack --- object_store/src/aws/client.rs | 16 ++----------- object_store/src/aws/credential.rs | 37 +++++++++++++++++++++++++++++- object_store/src/aws/mod.rs | 20 ++++++++++++++-- object_store/src/azure/mod.rs | 15 +++++------- object_store/src/lib.rs | 22 ++++++++++++++++++ object_store/src/path/mod.rs | 11 +++++++++ 6 files changed, 95 insertions(+), 26 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index f800fec3dc5..5ec9390ec89 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -16,6 +16,7 @@ // under the License. use crate::aws::credential::{AwsCredential, CredentialExt, CredentialProvider}; +use crate::aws::STRICT_PATH_ENCODE_SET; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; use crate::multipart::UploadPart; @@ -26,26 +27,13 @@ use crate::{ }; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; -use percent_encoding::{utf8_percent_encode, AsciiSet, PercentEncode, NON_ALPHANUMERIC}; +use percent_encoding::{utf8_percent_encode, PercentEncode}; use reqwest::{Client as ReqwestClient, Method, Response, StatusCode}; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; use std::ops::Range; use std::sync::Arc; -// http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html -// -// Do not URI-encode any of the unreserved characters that RFC 3986 defines: -// A-Z, a-z, 0-9, hyphen ( - ), underscore ( _ ), period ( . ), and tilde ( ~ ). -const STRICT_ENCODE_SET: AsciiSet = NON_ALPHANUMERIC - .remove(b'-') - .remove(b'.') - .remove(b'_') - .remove(b'~'); - -/// This struct is used to maintain the URI path encoding -const STRICT_PATH_ENCODE_SET: AsciiSet = STRICT_ENCODE_SET.remove(b'/'); - /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index 1abf42be910..d4461645f3c 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use crate::aws::STRICT_ENCODE_SET; use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; use crate::util::hmac_sha256; @@ -22,6 +23,7 @@ use crate::{Result, RetryConfig}; use bytes::Buf; use chrono::{DateTime, Utc}; use futures::TryFutureExt; +use percent_encoding::utf8_percent_encode; use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{Client, Method, Request, RequestBuilder, StatusCode}; use serde::Deserialize; @@ -29,6 +31,7 @@ use std::collections::BTreeMap; use std::sync::Arc; use std::time::Instant; use tracing::warn; +use url::Url; type StdError = Box; @@ -103,13 +106,14 @@ impl<'a> RequestSigner<'a> { request.headers_mut().insert(HASH_HEADER, header_digest); let (signed_headers, canonical_headers) = canonicalize_headers(request.headers()); + let canonical_query = canonicalize_query(request.url()); // https://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html let canonical_request = format!( "{}\n{}\n{}\n{}\n{}\n{}", request.method().as_str(), request.url().path(), // S3 doesn't percent encode this like other services - request.url().query().unwrap_or(""), // This assumes the query pairs are in order + canonical_query, canonical_headers, signed_headers, digest @@ -207,6 +211,37 @@ fn hex_encode(bytes: &[u8]) -> String { out } +/// Canonicalizes query parameters into the AWS canonical form +/// +/// +fn canonicalize_query(url: &Url) -> String { + use std::fmt::Write; + + let capacity = match url.query() { + Some(q) if !q.is_empty() => q.len(), + _ => return String::new(), + }; + let mut encoded = String::with_capacity(capacity + 1); + + let mut headers = url.query_pairs().collect::>(); + headers.sort_unstable_by(|(a, _), (b, _)| a.cmp(b)); + + let mut first = true; + for (k, v) in headers { + if !first { + encoded.push('&'); + } + first = false; + let _ = write!( + encoded, + "{}={}", + utf8_percent_encode(k.as_ref(), &STRICT_ENCODE_SET), + utf8_percent_encode(v.as_ref(), &STRICT_ENCODE_SET) + ); + } + encoded +} + /// Canonicalizes headers into the AWS Canonical Form. /// /// diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index d1d0a12cdaf..d186c7f47e3 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -58,6 +58,20 @@ use crate::{ mod client; mod credential; +// http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html +// +// Do not URI-encode any of the unreserved characters that RFC 3986 defines: +// A-Z, a-z, 0-9, hyphen ( - ), underscore ( _ ), period ( . ), and tilde ( ~ ). +pub(crate) const STRICT_ENCODE_SET: percent_encoding::AsciiSet = + percent_encoding::NON_ALPHANUMERIC + .remove(b'-') + .remove(b'.') + .remove(b'_') + .remove(b'~'); + +/// This struct is used to maintain the URI path encoding +const STRICT_PATH_ENCODE_SET: percent_encoding::AsciiSet = STRICT_ENCODE_SET.remove(b'/'); + /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -551,7 +565,7 @@ mod tests { use super::*; use crate::tests::{ get_nonexistent_object, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list, rename_and_copy, stream_get, + put_get_delete_list_opts, rename_and_copy, stream_get, }; use bytes::Bytes; use std::env; @@ -677,9 +691,11 @@ mod tests { #[tokio::test] async fn s3_test() { let config = maybe_skip_integration!(); + let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); let integration = config.build().unwrap(); - put_get_delete_list(&integration).await; + // Localstack doesn't support listing with spaces https://github.com/localstack/localstack/issues/6328 + put_get_delete_list_opts(&integration, is_local).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index dd1cde9c7a2..f7ca4cf4e8c 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -595,7 +595,7 @@ mod tests { use super::*; use crate::tests::{ copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list, rename_and_copy, stream_get, + put_get_delete_list, put_get_delete_list_opts, rename_and_copy, stream_get, }; use std::env; @@ -663,9 +663,10 @@ mod tests { #[tokio::test] async fn azure_blob_test() { + let use_emulator = env::var("AZURE_USE_EMULATOR").is_ok(); let integration = maybe_skip_integration!().build().unwrap(); - - put_get_delete_list(&integration).await; + // Azurite doesn't support listing with spaces - https://github.com/localstack/localstack/issues/6328 + put_get_delete_list_opts(&integration, use_emulator).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; @@ -687,13 +688,9 @@ mod tests { .with_container_name( env::var("OBJECT_STORE_BUCKET").expect("must be set OBJECT_STORE_BUCKET"), ) - .with_client_secret_authorization( - env::var("AZURE_STORAGE_CLIENT_ID") + .with_access_key( + env::var("AZURE_STORAGE_ACCESS_KEY") .expect("must be set AZURE_STORAGE_CLIENT_ID"), - env::var("AZURE_STORAGE_CLIENT_SECRET") - .expect("must be set AZURE_STORAGE_CLIENT_SECRET"), - env::var("AZURE_STORAGE_TENANT_ID") - .expect("must be set AZURE_STORAGE_TENANT_ID"), ); let integration = builder.build().unwrap(); diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 16f0c6f3a2a..5eaaabaf294 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -506,6 +506,13 @@ mod tests { use tokio::io::AsyncWriteExt; pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { + put_get_delete_list_opts(storage, false).await + } + + pub(crate) async fn put_get_delete_list_opts( + storage: &DynObjectStore, + skip_list_with_spaces: bool, + ) { delete_fixtures(storage).await; let content_list = flatten_list_stream(storage, None).await.unwrap(); @@ -701,6 +708,21 @@ mod tests { assert_eq!(files, vec![path.clone()]); storage.delete(&path).await.unwrap(); + + let path = Path::parse("foo bar/I contain spaces.parquet").unwrap(); + storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap(); + storage.head(&path).await.unwrap(); + + if !skip_list_with_spaces { + let files = flatten_list_stream(storage, Some(&Path::from("foo bar"))) + .await + .unwrap(); + assert_eq!(files, vec![path.clone()]); + } + storage.delete(&path).await.unwrap(); + + let files = flatten_list_stream(storage, None).await.unwrap(); + assert!(files.is_empty(), "{:?}", files); } fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec { diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs index e5a7b6443bb..80e0f792aa5 100644 --- a/object_store/src/path/mod.rs +++ b/object_store/src/path/mod.rs @@ -534,4 +534,15 @@ mod tests { needle ); } + + #[test] + fn path_containing_spaces() { + let a = Path::from_iter(["foo bar", "baz"]); + let b = Path::from("foo bar/baz"); + let c = Path::parse("foo bar/baz").unwrap(); + + assert_eq!(a.raw, "foo bar/baz"); + assert_eq!(a.raw, b.raw); + assert_eq!(b.raw, c.raw); + } }