Commit

Add s3 support to I/O operations. (#126)
fabiocaccamo committed Oct 12, 2022
1 parent a1e5b9f commit 5ac299d
Showing 13 changed files with 194 additions and 59 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test-package.yml
@@ -27,7 +27,7 @@ jobs:
python-version: ${{ matrix.python-version }}
cache: 'pip'

- name: Install dependencies
- name: Install requirements
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
8 changes: 6 additions & 2 deletions README.md
@@ -21,6 +21,7 @@ python-benedict is a dict subclass with **keylist/keypath** support, **I/O** sho
- **Keypath** support using **keypath-separator** *(dot syntax by default)*.
- Keypath **list-index** support *(also negative)* using the standard `[n]` suffix.
- Normalized **I/O operations** with most common formats: `base64`, `csv`, `ini`, `json`, `pickle`, `plist`, `query-string`, `toml`, `xls`, `xml`, `yaml`.
- `NEW` Multiple **I/O operations** backends: `filepath` *(read/write)*, `url` *(read-only)*, `s3` *(read/write)*.
- Many **utility** and **parse methods** to retrieve data as needed *(check the [API](#api) section)*.
- Well **tested**. ;)

@@ -437,7 +438,7 @@ d.unique()

### I/O methods

It is possible to create a `benedict` instance directly from data source (filepath, url or data-string) by passing the data source and the data format (default 'json') in the constructor.
It is possible to create a `benedict` instance directly from a data source (`filepath`, `url`, `s3` or `data-string`) by passing the data source and the data format (optional, default 'json') in the constructor.

```python
# filepath
@@ -446,11 +447,14 @@ d = benedict('/root/data.yml', format='yaml')
# url
d = benedict('https://localhost:8000/data.xml', format='xml')

# s3
d = benedict('s3://my-bucket/data.xml', s3_options={"aws_access_key_id": "...", "aws_secret_access_key": "..."})

# data-string
d = benedict('{"a": 1, "b": 2, "c": 3, "x": 7, "y": 8, "z": 9}')
```
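
The new `s3` backend is listed above as read/write, so writing back should go through the same `filepath` argument that the `to_*` methods already accept. A minimal, hypothetical sketch (the bucket name and credentials are placeholders, and it assumes `s3_options` is forwarded down to the new `write_content()` helper in `io_util.py`):

```python
# hypothetical write-back to s3; "my-bucket" and the credentials are placeholders
d = benedict({"a": 1, "b": 2, "c": 3})
d.to_json(
    filepath="s3://my-bucket/data.json",
    s3_options={"aws_access_key_id": "...", "aws_secret_access_key": "..."},
)
```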

These methods simplify I/O operations with most common formats: `base64`, `csv`, `json`, `pickle`, `plist`, `query-string`, `toml`, `xml`, `yaml`.
These methods simplify I/O operations with the most common formats: `base64`, `csv`, `ini`, `json`, `pickle`, `plist`, `query-string`, `toml`, `xls`, `xml`, `yaml`.

In all `from_*` methods, the first argument can be: **url**, **filepath** or **data-string**.
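
With this change the first argument can also be an **s3 url**, as exercised by the new xls test further down. A minimal sketch, assuming valid AWS credentials and a readable bucket (bucket and key mirror the test; credentials are placeholders):

```python
# mirrors tests/dicts/io/test_io_dict_xls.py; s3_options is passed to boto3.client("s3", ...)
s3_options = {"aws_access_key_id": "...", "aws_secret_access_key": "..."}
d = benedict.from_xls("s3://python-benedict/valid-content.xlsx", s3_options=s3_options)
```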

3 changes: 0 additions & 3 deletions benedict/dicts/io/io_dict.py
@@ -44,10 +44,7 @@ def _decode(s, format, **kwargs):

@staticmethod
def _encode(d, format, **kwargs):
filepath = kwargs.pop("filepath", None)
s = io_util.encode(d, format, **kwargs)
if filepath:
io_util.write_file(filepath, s)
return s

@classmethod
101 changes: 75 additions & 26 deletions benedict/dicts/io/io_util.py
@@ -3,15 +3,18 @@
from benedict.serializers import (
get_format_by_path,
get_serializer_by_format,
get_serializers_extensions,
)

# from botocore.exceptions import ClientError
from urllib.parse import urlparse

import boto3
import fsutil
import tempfile


def autodetect_format(s):
if is_url(s) or is_filepath(s):
if any([is_url(s), is_s3(s), is_filepath(s)]):
return get_format_by_path(s)
return None

@@ -20,20 +23,23 @@ def decode(s, format, **kwargs):
serializer = get_serializer_by_format(format)
if not serializer:
raise ValueError(f"Invalid format: {format}.")
decode_opts = kwargs.copy()
options = kwargs.copy()
if format in ["b64", "base64"]:
decode_opts.setdefault("subformat", "json")
content = read_content(s, format)
data = serializer.decode(content, **decode_opts)
options.setdefault("subformat", "json")
content = read_content(s, format, **options)
data = serializer.decode(content, **options)
return data


def encode(d, format, **kwargs):
def encode(d, format, filepath=None, **kwargs):
serializer = get_serializer_by_format(format)
if not serializer:
raise ValueError(f"Invalid format: {format}.")
s = serializer.encode(d, **kwargs)
return s
options = kwargs.copy()
content = serializer.encode(d, **options)
if filepath:
write_content(filepath, content, **options)
return content


def is_binary_format(format):
@@ -49,51 +55,94 @@ def is_data(s):


def is_filepath(s):
if any([s.endswith(ext) for ext in get_serializers_extensions()]):
return True
return fsutil.is_file(s)
return fsutil.is_file(s) or get_format_by_path(s)


def is_s3(s):
return s.startswith("s3://") and get_format_by_path(s)


def is_url(s):
return any([s.startswith(protocol) for protocol in ["http://", "https://"]])


def read_content(s, format):
def parse_s3_url(url):
parsed = urlparse(url, allow_fragments=False)
bucket = parsed.netloc
key = parsed.path.lstrip("/")
if parsed.query:
key += "?" + self._parsed.query
url = parsed.geturl()
return {
"url": url,
"bucket": bucket,
"key": key,
}


def read_content(s, format=None, **options):
# s -> filepath or url or data
options.setdefault("format", format)
s = s.strip()
if is_data(s):
return s
elif is_url(s):
return read_content_from_url(s, format)
return read_content_from_url(s, **options)
elif is_s3(s):
return read_content_from_s3(s, **options)
elif is_filepath(s):
return read_content_from_file(s, format)
return read_content_from_file(s, **options)
# one-line data?!
return s


def read_content_from_file(filepath, format):
def read_content_from_file(filepath, format=None, **options):
binary_format = is_binary_format(format)
if binary_format:
return filepath
return read_file(filepath)
return fsutil.read_file(filepath)


def read_content_from_s3(url, s3_options, format=None, **options):
s3_url = parse_s3_url(url)
dirpath = tempfile.gettempdir()
filename = fsutil.get_filename(s3_url["key"])
filepath = fsutil.join_path(dirpath, filename)
s3 = boto3.client("s3", **s3_options)
s3.download_file(s3_url["bucket"], s3_url["key"], filepath)
s3.close()
content = read_content_from_file(filepath, format, **options)
return content


def read_content_from_url(url, format, **options):
def read_content_from_url(url, requests_options=None, format=None, **options):
requests_options = requests_options or {}
binary_format = is_binary_format(format)
if binary_format:
dirpath = tempfile.gettempdir()
filepath = fsutil.download_file(url, dirpath, **options)
filepath = fsutil.download_file(url, dirpath, **requests_options)
return filepath
return read_url(url, **options)
return fsutil.read_file_from_url(url, **requests_options)


def read_file(filepath, **options):
return fsutil.read_file(filepath, **options)
def write_content(filepath, content, **options):
if is_s3(filepath):
write_content_to_s3(filepath, content, **options)
else:
write_content_to_file(filepath, content, **options)


def read_url(url, **options):
return fsutil.read_file_from_url(url, **options)
def write_content_to_file(filepath, content, **options):
fsutil.write_file(filepath, content)


def write_file(filepath, content, **options):
fsutil.write_file(filepath, content, **options)
def write_content_to_s3(url, content, s3_options, **options):
s3_url = parse_s3_url(url)
dirpath = tempfile.gettempdir()
filename = fsutil.get_filename(s3_url["key"])
filepath = fsutil.join_path(dirpath, filename)
fsutil.write_file(filepath, content)
s3 = boto3.client("s3", **s3_options)
s3.upload_file(filepath, s3_url["bucket"], s3_url["key"])
s3.close()
fsutil.remove_file(filepath)
6 changes: 0 additions & 6 deletions benedict/serializers/__init__.py
@@ -79,10 +79,4 @@ def get_serializer_by_format(format):
format_key = (format or "").lower().strip()
format_key = re.sub(r"[\s\-\_]*", "", format_key)
serializer = _SERIALIZERS_BY_EXTENSION.get(format_key, None)
if not serializer:
raise ValueError(f"Invalid format: {format}.")
return serializer


def get_serializers_extensions():
return list(_SERIALIZERS_EXTENSIONS)
1 change: 1 addition & 0 deletions requirements-test.txt
@@ -2,4 +2,5 @@ codecov == 2.1.12
coverage == 6.5.0
flake8 == 5.0.4
orjson == 3.8.0
python-decouple == 3.6
tox == 3.26.0
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
boto3 == 1.24.89
ftfy == 6.1.1
mailchecker == 5.0.2
openpyxl == 3.0.10
1 change: 1 addition & 0 deletions setup.py
@@ -94,6 +94,7 @@
"unique",
],
install_requires=[
"boto3 >= 1.24.89, < 2.0.0",
"ftfy >= 6.0.0, < 7.0.0",
"mailchecker >= 4.1.0, < 6.0.0",
"openpyxl >= 3.0.0, < 4.0.0",
4 changes: 2 additions & 2 deletions tests/dicts/base/test_base_dict.py
@@ -234,12 +234,12 @@ def test__str__with_pointer(self):
def test__unicode__(self):
d = BaseDict()
d["name"] = "pythòn-bènèdìçt"
print(unicode(d))
# print(unicode(d))

@unittest.skipIf(sys.version_info[0] > 2, "No unicode in Python > 2")
def test__unicode__with_pointer(self):
d = BaseDict({"name": "pythòn-bènèdìçt"})
print(unicode(d))
# print(unicode(d))

def test_clear(self):
d = {
64 changes: 63 additions & 1 deletion tests/dicts/io/test_io_dict_xls.py
@@ -2,6 +2,8 @@

from benedict.dicts.io import IODict

from decouple import config

from .test_io_dict import io_dict_test_case


@@ -104,7 +106,8 @@ def test_from_xls_with_valid_url_valid_content(self):
with self.subTest(
msg=f"test_from_xls_({extension})_with_valid_url_valid_content"
):
url = f"https://github.com/fabiocaccamo/python-benedict/raw/xls/tests/dicts/io/input/valid-content.{extension}"
# url = f"https://github.com/fabiocaccamo/python-benedict/raw/s3/tests/dicts/io/input/valid-content.{extension}"
url = f"https://github.com/fabiocaccamo/python-benedict/raw/master/tests/dicts/io/input/valid-content.{extension}"
# static method
d = IODict.from_xls(url)
self.assertTrue(isinstance(d, dict))
@@ -118,6 +121,65 @@ def test_from_xls_with_valid_url_valid_content(self):
self.assertTrue(isinstance(d, dict))
self.assertEqual(d, expected_dict)

def test_from_xls_with_valid_s3_url_valid_content(self):
aws_access_key_id = config("AWS_ACCESS_KEY_ID", default=None)
aws_secret_access_key = config("AWS_SECRET_ACCESS_KEY", default=None)
if not all([aws_access_key_id, aws_secret_access_key]):
# don't use s3 on GH CI
return
s3_options = {
"aws_access_key_id": aws_access_key_id,
"aws_secret_access_key": aws_secret_access_key,
}
expected_dict = {
"values": [
{
"mon": 10,
"tue": 11,
"wed": 12,
"thu": 13,
"fri": 14,
"sat": 15,
"sun": 16,
},
{
"mon": 20,
"tue": 21,
"wed": 22,
"thu": 23,
"fri": 24,
"sat": 25,
"sun": 26,
},
{
"mon": 30,
"tue": 31,
"wed": 32,
"thu": 33,
"fri": 34,
"sat": 35,
"sun": 36,
},
]
}
for extension in self._extensions:
with self.subTest(
msg=f"test_from_xls_({extension})_with_valid_s3_url_valid_content"
):
url = f"s3://python-benedict/valid-content.{extension}"
# static method
d = IODict.from_xls(url, s3_options=s3_options)
self.assertTrue(isinstance(d, dict))
self.assertEqual(d, expected_dict)
# constructor explicit format
d = IODict(url, format=extension, s3_options=s3_options)
self.assertTrue(isinstance(d, dict))
self.assertEqual(d, expected_dict)
# constructor implicit format
d = IODict(url, s3_options=s3_options)
self.assertTrue(isinstance(d, dict))
self.assertEqual(d, expected_dict)

def test_from_xls_with_valid_file_valid_content_custom_sheet_by_index_and_columns(
self,
):
