Commit

Add s3 support to I/O operations. (#126)
fabiocaccamo committed Oct 12, 2022
1 parent a1e5b9f commit 5ac299d
Showing 13 changed files with 194 additions and 59 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test-package.yml
@@ -27,7 +27,7 @@ jobs:
python-version: ${{ matrix.python-version }}
cache: 'pip'

- name: Install dependencies
- name: Install requirements
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
8 changes: 6 additions & 2 deletions README.md
@@ -21,6 +21,7 @@ python-benedict is a dict subclass with **keylist/keypath** support, **I/O** sho
- **Keypath** support using **keypath-separator** *(dot syntax by default)*.
- Keypath **list-index** support *(also negative)* using the standard `[n]` suffix.
- Normalized **I/O operations** with most common formats: `base64`, `csv`, `ini`, `json`, `pickle`, `plist`, `query-string`, `toml`, `xls`, `xml`, `yaml`.
- `NEW` Multiple **I/O operations** backends: `filepath` *(read/write)*, `url` *(read-only)*, `s3` *(read/write)*.
- Many **utility** and **parse methods** to retrieve data as needed *(check the [API](#api) section)*.
- Well **tested**. ;)

@@ -437,7 +438,7 @@ d.unique()

### I/O methods

It is possible to create a `benedict` instance directly from data source (filepath, url or data-string) by passing the data source and the data format (default 'json') in the constructor.
It is possible to create a `benedict` instance directly from a data source (`filepath`, `url`, `s3` or `data-string`) by passing the data source and the data format (optional, default 'json') in the constructor.

```python
# filepath
@@ -446,11 +447,14 @@ d = benedict('/root/data.yml', format='yaml')
# url
d = benedict('https://localhost:8000/data.xml', format='xml')

# s3
d = benedict('s3://my-bucket/data.xml', s3_options={"aws_access_key_id": "...", "aws_secret_access_key": "..."})

# data-string
d = benedict('{"a": 1, "b": 2, "c": 3, "x": 7, "y": 8, "z": 9}')
```
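
The new `s3` backend is listed above as read/write, so writing back should go through the same `filepath` argument that the `to_*` methods already accept. A minimal, hypothetical sketch (the bucket name and credentials are placeholders, and it assumes `s3_options` is forwarded down to the new `write_content()` helper in `io_util.py`):

```python
# hypothetical write-back to s3; "my-bucket" and the credentials are placeholders
d = benedict({"a": 1, "b": 2, "c": 3})
d.to_json(
    filepath="s3://my-bucket/data.json",
    s3_options={"aws_access_key_id": "...", "aws_secret_access_key": "..."},
)
```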

These methods simplify I/O operations with most common formats: `base64`, `csv`, `json`, `pickle`, `plist`, `query-string`, `toml`, `xml`, `yaml`.
These methods simplify I/O operations with the most common formats: `base64`, `csv`, `ini`, `json`, `pickle`, `plist`, `query-string`, `toml`, `xls`, `xml`, `yaml`.

In all `from_*` methods, the first argument can be: **url**, **filepath** or **data-string**.
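
With this change the first argument can also be an **s3 url**, as exercised by the new xls test further down. A minimal sketch, assuming valid AWS credentials and a readable bucket (bucket and key mirror the test; credentials are placeholders):

```python
# mirrors tests/dicts/io/test_io_dict_xls.py; s3_options is passed to boto3.client("s3", ...)
s3_options = {"aws_access_key_id": "...", "aws_secret_access_key": "..."}
d = benedict.from_xls("s3://python-benedict/valid-content.xlsx", s3_options=s3_options)
```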

3 changes: 0 additions & 3 deletions benedict/dicts/io/io_dict.py
@@ -44,10 +44,7 @@ def _decode(s, format, **kwargs):

@staticmethod
def _encode(d, format, **kwargs):
filepath = kwargs.pop("filepath", None)
s = io_util.encode(d, format, **kwargs)
if filepath:
io_util.write_file(filepath, s)
return s

@classmethod
101 changes: 75 additions & 26 deletions benedict/dicts/io/io_util.py
@@ -3,15 +3,18 @@
from benedict.serializers import (
get_format_by_path,
get_serializer_by_format,
get_serializers_extensions,
)

# from botocore.exceptions import ClientError
from urllib.parse import urlparse

import boto3
import fsutil
import tempfile


def autodetect_format(s):
if is_url(s) or is_filepath(s):
if any([is_url(s), is_s3(s), is_filepath(s)]):
return get_format_by_path(s)
return None

@@ -20,20 +23,23 @@ def decode(s, format, **kwargs):
serializer = get_serializer_by_format(format)
if not serializer:
raise ValueError(f"Invalid format: {format}.")
decode_opts = kwargs.copy()
options = kwargs.copy()
if format in ["b64", "base64"]:
decode_opts.setdefault("subformat", "json")
content = read_content(s, format)
data = serializer.decode(content, **decode_opts)
options.setdefault("subformat", "json")
content = read_content(s, format, **options)
data = serializer.decode(content, **options)
return data


def encode(d, format, **kwargs):
def encode(d, format, filepath=None, **kwargs):
serializer = get_serializer_by_format(format)
if not serializer:
raise ValueError(f"Invalid format: {format}.")
s = serializer.encode(d, **kwargs)
return s
options = kwargs.copy()
content = serializer.encode(d, **options)
if filepath:
write_content(filepath, content, **options)
return content


def is_binary_format(format):
@@ -49,51 +55,94 @@ def is_data(s):


def is_filepath(s):
if any([s.endswith(ext) for ext in get_serializers_extensions()]):
return True
return fsutil.is_file(s)
return fsutil.is_file(s) or get_format_by_path(s)


def is_s3(s):
return s.startswith("s3://") and get_format_by_path(s)


def is_url(s):
return any([s.startswith(protocol) for protocol in ["http://", "https://"]])


def read_content(s, format):
def parse_s3_url(url):
parsed = urlparse(url, allow_fragments=False)
bucket = parsed.netloc
key = parsed.path.lstrip("/")
if parsed.query:
key += "?" + self._parsed.query
url = parsed.geturl()
return {
"url": url,
"bucket": bucket,
"key": key,
}


def read_content(s, format=None, **options):
# s -> filepath or url or data
options.setdefault("format", format)
s = s.strip()
if is_data(s):
return s
elif is_url(s):
return read_content_from_url(s, format)
return read_content_from_url(s, **options)
elif is_s3(s):
return read_content_from_s3(s, **options)
elif is_filepath(s):
return read_content_from_file(s, format)
return read_content_from_file(s, **options)
# one-line data?!
return s


def read_content_from_file(filepath, format):
def read_content_from_file(filepath, format=None, **options):
binary_format = is_binary_format(format)
if binary_format:
return filepath
return read_file(filepath)
return fsutil.read_file(filepath)


def read_content_from_s3(url, s3_options, format=None, **options):
s3_url = parse_s3_url(url)
dirpath = tempfile.gettempdir()
filename = fsutil.get_filename(s3_url["key"])
filepath = fsutil.join_path(dirpath, filename)
s3 = boto3.client("s3", **s3_options)
s3.download_file(s3_url["bucket"], s3_url["key"], filepath)
s3.close()
content = read_content_from_file(filepath, format, **options)
return content


def read_content_from_url(url, format, **options):
def read_content_from_url(url, requests_options=None, format=None, **options):
requests_options = requests_options or {}
binary_format = is_binary_format(format)
if binary_format:
dirpath = tempfile.gettempdir()
filepath = fsutil.download_file(url, dirpath, **options)
filepath = fsutil.download_file(url, dirpath, **requests_options)
return filepath
return read_url(url, **options)
return fsutil.read_file_from_url(url, **requests_options)


def read_file(filepath, **options):
return fsutil.read_file(filepath, **options)
def write_content(filepath, content, **options):
if is_s3(filepath):
write_content_to_s3(filepath, content, **options)
else:
write_content_to_file(filepath, content, **options)


def read_url(url, **options):
return fsutil.read_file_from_url(url, **options)
def write_content_to_file(filepath, content, **options):
fsutil.write_file(filepath, content)


def write_file(filepath, content, **options):
fsutil.write_file(filepath, content, **options)
def write_content_to_s3(url, content, s3_options, **options):
s3_url = parse_s3_url(url)
dirpath = tempfile.gettempdir()
filename = fsutil.get_filename(s3_url["key"])
filepath = fsutil.join_path(dirpath, filename)
fsutil.write_file(filepath, content)
s3 = boto3.client("s3", **s3_options)
s3.upload_file(filepath, s3_url["bucket"], s3_url["key"])
s3.close()
fsutil.remove_file(filepath)
6 changes: 0 additions & 6 deletions benedict/serializers/__init__.py
@@ -79,10 +79,4 @@ def get_serializer_by_format(format):
format_key = (format or "").lower().strip()
format_key = re.sub(r"[\s\-\_]*", "", format_key)
serializer = _SERIALIZERS_BY_EXTENSION.get(format_key, None)
if not serializer:
raise ValueError(f"Invalid format: {format}.")
return serializer


def get_serializers_extensions():
return list(_SERIALIZERS_EXTENSIONS)
1 change: 1 addition & 0 deletions requirements-test.txt
@@ -2,4 +2,5 @@ codecov == 2.1.12
coverage == 6.5.0
flake8 == 5.0.4
orjson == 3.8.0
python-decouple == 3.6
tox == 3.26.0
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
boto3 == 1.24.89
ftfy == 6.1.1
mailchecker == 5.0.2
openpyxl == 3.0.10
1 change: 1 addition & 0 deletions setup.py
@@ -94,6 +94,7 @@
"unique",
],
install_requires=[
"boto3 >= 1.24.89, < 2.0.0",
"ftfy >= 6.0.0, < 7.0.0",
"mailchecker >= 4.1.0, < 6.0.0",
"openpyxl >= 3.0.0, < 4.0.0",
4 changes: 2 additions & 2 deletions tests/dicts/base/test_base_dict.py
@@ -234,12 +234,12 @@ def test__str__with_pointer(self):
def test__unicode__(self):
d = BaseDict()
d["name"] = "pythòn-bènèdìçt"
print(unicode(d))
# print(unicode(d))

@unittest.skipIf(sys.version_info[0] > 2, "No unicode in Python > 2")
def test__unicode__with_pointer(self):
d = BaseDict({"name": "pythòn-bènèdìçt"})
print(unicode(d))
# print(unicode(d))

def test_clear(self):
d = {
64 changes: 63 additions & 1 deletion tests/dicts/io/test_io_dict_xls.py
@@ -2,6 +2,8 @@

from benedict.dicts.io import IODict

from decouple import config

from .test_io_dict import io_dict_test_case


@@ -104,7 +106,8 @@ def test_from_xls_with_valid_url_valid_content(self):
with self.subTest(
msg=f"test_from_xls_({extension})_with_valid_url_valid_content"
):
url = f"https://github.com/fabiocaccamo/python-benedict/raw/xls/tests/dicts/io/input/valid-content.{extension}"
# url = f"https://github.com/fabiocaccamo/python-benedict/raw/s3/tests/dicts/io/input/valid-content.{extension}"
url = f"https://github.com/fabiocaccamo/python-benedict/raw/master/tests/dicts/io/input/valid-content.{extension}"
# static method
d = IODict.from_xls(url)
self.assertTrue(isinstance(d, dict))
@@ -118,6 +121,65 @@ def test_from_xls_with_valid_url_valid_content(self):
self.assertTrue(isinstance(d, dict))
self.assertEqual(d, expected_dict)

def test_from_xls_with_valid_s3_url_valid_content(self):
aws_access_key_id = config("AWS_ACCESS_KEY_ID", default=None)
aws_secret_access_key = config("AWS_SECRET_ACCESS_KEY", default=None)
if not all([aws_access_key_id, aws_secret_access_key]):
# don't use s3 on GH CI
return
s3_options = {
"aws_access_key_id": aws_access_key_id,
"aws_secret_access_key": aws_secret_access_key,
}
expected_dict = {
"values": [
{
"mon": 10,
"tue": 11,
"wed": 12,
"thu": 13,
"fri": 14,
"sat": 15,
"sun": 16,
},
{
"mon": 20,
"tue": 21,
"wed": 22,
"thu": 23,
"fri": 24,
"sat": 25,
"sun": 26,
},
{
"mon": 30,
"tue": 31,
"wed": 32,
"thu": 33,
"fri": 34,
"sat": 35,
"sun": 36,
},
]
}
for extension in self._extensions:
with self.subTest(
msg=f"test_from_xls_({extension})_with_valid_s3_url_valid_content"
):
url = f"s3://python-benedict/valid-content.{extension}"
# static method
d = IODict.from_xls(url, s3_options=s3_options)
self.assertTrue(isinstance(d, dict))
self.assertEqual(d, expected_dict)
# constructor explicit format
d = IODict(url, format=extension, s3_options=s3_options)
self.assertTrue(isinstance(d, dict))
self.assertEqual(d, expected_dict)
# constructor implicit format
d = IODict(url, s3_options=s3_options)
self.assertTrue(isinstance(d, dict))
self.assertEqual(d, expected_dict)

def test_from_xls_with_valid_file_valid_content_custom_sheet_by_index_and_columns(
self,
):
