Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add s3 support to I/O operations. #126

Merged
merged 20 commits into from Oct 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test-package.yml
Expand Up @@ -27,7 +27,7 @@ jobs:
python-version: ${{ matrix.python-version }}
cache: 'pip'

- name: Install dependencies
- name: Install requirements
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
Expand Down
8 changes: 6 additions & 2 deletions README.md
Expand Up @@ -21,6 +21,7 @@ python-benedict is a dict subclass with **keylist/keypath** support, **I/O** sho
- **Keypath** support using **keypath-separator** *(dot syntax by default)*.
- Keypath **list-index** support *(also negative)* using the standard `[n]` suffix.
- Normalized **I/O operations** with most common formats: `base64`, `csv`, `ini`, `json`, `pickle`, `plist`, `query-string`, `toml`, `xls`, `xml`, `yaml`.
- `NEW` Multiple **I/O operations** backends: `filepath` *(read/write)*, `url` *(read-only)*, `s3` *(read/write)*.
- Many **utility** and **parse methods** to retrieve data as needed *(check the [API](#api) section)*.
- Well **tested**. ;)

Expand Down Expand Up @@ -437,7 +438,7 @@ d.unique()

### I/O methods

It is possible to create a `benedict` instance directly from data source (filepath, url or data-string) by passing the data source and the data format (default 'json') in the constructor.
It is possible to create a `benedict` instance directly from a data source (`filepath`, `url`, `s3` or `data-string`) by passing the data source and, optionally, the data format (default 'json') in the constructor.

```python
# filepath
Expand All @@ -446,11 +447,14 @@ d = benedict('/root/data.yml', format='yaml')
# url
d = benedict('https://localhost:8000/data.xml', format='xml')

# s3
d = benedict('s3://my-bucket/data.xml', s3_options={"aws_access_key_id": "...", "aws_secret_access_key": "..."})

# data-string
d = benedict('{"a": 1, "b": 2, "c": 3, "x": 7, "y": 8, "z": 9}')
```

These methods simplify I/O operations with most common formats: `base64`, `csv`, `json`, `pickle`, `plist`, `query-string`, `toml`, `xml`, `yaml`.
These methods simplify I/O operations with most common formats: `base64`, `csv`, `ini`, `json`, `pickle`, `plist`, `query-string`, `toml`, `xls`, `xml`, `yaml`.

In all `from_*` methods, the first argument can be: **url**, **filepath** or **data-string**.

Expand Down
3 changes: 0 additions & 3 deletions benedict/dicts/io/io_dict.py
Expand Up @@ -44,10 +44,7 @@ def _decode(s, format, **kwargs):

@staticmethod
def _encode(d, format, **kwargs):
filepath = kwargs.pop("filepath", None)
s = io_util.encode(d, format, **kwargs)
if filepath:
io_util.write_file(filepath, s)
return s

@classmethod
Expand Down
101 changes: 75 additions & 26 deletions benedict/dicts/io/io_util.py
Expand Up @@ -3,15 +3,18 @@
from benedict.serializers import (
get_format_by_path,
get_serializer_by_format,
get_serializers_extensions,
)

# from botocore.exceptions import ClientError
from urllib.parse import urlparse

import boto3
import fsutil
import tempfile


def autodetect_format(s):
if is_url(s) or is_filepath(s):
if any([is_url(s), is_s3(s), is_filepath(s)]):
return get_format_by_path(s)
return None

Expand All @@ -20,20 +23,23 @@ def decode(s, format, **kwargs):
serializer = get_serializer_by_format(format)
if not serializer:
raise ValueError(f"Invalid format: {format}.")
decode_opts = kwargs.copy()
options = kwargs.copy()
if format in ["b64", "base64"]:
decode_opts.setdefault("subformat", "json")
content = read_content(s, format)
data = serializer.decode(content, **decode_opts)
options.setdefault("subformat", "json")
content = read_content(s, format, **options)
data = serializer.decode(content, **options)
return data


def encode(d, format, **kwargs):
def encode(d, format, filepath=None, **kwargs):
serializer = get_serializer_by_format(format)
if not serializer:
raise ValueError(f"Invalid format: {format}.")
s = serializer.encode(d, **kwargs)
return s
options = kwargs.copy()
content = serializer.encode(d, **options)
if filepath:
write_content(filepath, content, **options)
return content


def is_binary_format(format):
Expand All @@ -49,51 +55,94 @@ def is_data(s):


def is_filepath(s):
if any([s.endswith(ext) for ext in get_serializers_extensions()]):
return True
return fsutil.is_file(s)
return fsutil.is_file(s) or get_format_by_path(s)


def is_s3(s):
    """
    Tell whether *s* looks like an S3 location.

    Returns False when *s* does not start with "s3://"; otherwise returns
    whatever get_format_by_path(s) yields for it (truthy when a format is
    recognised from the path).
    """
    if not s.startswith("s3://"):
        return False
    return get_format_by_path(s)


def is_url(s):
    """Return True when *s* begins with the "http://" or "https://" prefix."""
    # str.startswith accepts a tuple of prefixes and returns a plain bool.
    return s.startswith(("http://", "https://"))


def read_content(s, format):
def parse_s3_url(url):
    """
    Split an ``s3://<bucket>/<key>`` URL into its components.

    :param url: the S3 URL to parse.
    :return: a dict with the keys ``"url"`` (the URL as re-assembled by
        ``urlparse``), ``"bucket"`` (the network-location part) and
        ``"key"`` (the path without its leading slash, plus the query
        string when one is present).
    """
    parsed = urlparse(url, allow_fragments=False)
    bucket = parsed.netloc
    key = parsed.path.lstrip("/")
    if parsed.query:
        # urlparse splits on "?", so re-attach the query to keep the key whole.
        # (The original referenced `self._parsed`, which does not exist in a
        # module-level function and raised NameError.)
        key += "?" + parsed.query
    return {
        "url": parsed.geturl(),
        "bucket": bucket,
        "key": key,
    }


def read_content(s, format=None, **options):
# s -> filepath or url or data
options.setdefault("format", format)
s = s.strip()
if is_data(s):
return s
elif is_url(s):
return read_content_from_url(s, format)
return read_content_from_url(s, **options)
elif is_s3(s):
return read_content_from_s3(s, **options)
elif is_filepath(s):
return read_content_from_file(s, format)
return read_content_from_file(s, **options)
# one-line data?!
return s


def read_content_from_file(filepath, format):
def read_content_from_file(filepath, format=None, **options):
binary_format = is_binary_format(format)
if binary_format:
return filepath
return read_file(filepath)
return fsutil.read_file(filepath)


def read_content_from_s3(url, s3_options, format=None, **options):
    """
    Download an S3 object to the system temp directory and read it.

    :param url: the ``s3://`` URL of the object.
    :param s3_options: keyword arguments forwarded to ``boto3.client("s3", ...)``.
    :param format: data format, forwarded to ``read_content_from_file``.
    :param options: extra options forwarded to ``read_content_from_file``.
    :return: whatever ``read_content_from_file`` returns for the downloaded file.
    """
    s3_url = parse_s3_url(url)
    dirpath = tempfile.gettempdir()
    filename = fsutil.get_filename(s3_url["key"])
    filepath = fsutil.join_path(dirpath, filename)
    s3 = boto3.client("s3", **s3_options)
    try:
        s3.download_file(s3_url["bucket"], s3_url["key"], filepath)
    finally:
        # Release the client even when the download fails.
        s3.close()
    # The downloaded file is not removed here: for binary formats
    # read_content_from_file returns the file path rather than its content.
    return read_content_from_file(filepath, format, **options)


def read_content_from_url(url, format, **options):
def read_content_from_url(url, requests_options=None, format=None, **options):
requests_options = requests_options or {}
binary_format = is_binary_format(format)
if binary_format:
dirpath = tempfile.gettempdir()
filepath = fsutil.download_file(url, dirpath, **options)
filepath = fsutil.download_file(url, dirpath, **requests_options)
return filepath
return read_url(url, **options)
return fsutil.read_file_from_url(url, **requests_options)


def read_file(filepath, **options):
return fsutil.read_file(filepath, **options)
def write_content(filepath, content, **options):
    """
    Write *content* to *filepath*, choosing the backend from the destination.

    Destinations recognised by is_s3 are uploaded with write_content_to_s3;
    anything else is written with write_content_to_file. All *options* are
    forwarded to the selected writer.
    """
    writer = write_content_to_s3 if is_s3(filepath) else write_content_to_file
    writer(filepath, content, **options)


def read_url(url, **options):
return fsutil.read_file_from_url(url, **options)
def write_content_to_file(filepath, content, **options):
    """
    Write *content* to the local file at *filepath* through fsutil.

    *options* is accepted so every write backend can be called the same way;
    none of it is forwarded to fsutil.write_file.
    """
    destination = filepath
    fsutil.write_file(destination, content)


def write_file(filepath, content, **options):
fsutil.write_file(filepath, content, **options)
def write_content_to_s3(url, content, s3_options, **options):
    """
    Upload *content* to the S3 object addressed by *url*.

    The content is first written to a file in the system temp directory,
    uploaded with boto3, and the temp file is then removed.

    :param url: the destination ``s3://`` URL.
    :param content: the encoded content to upload.
    :param s3_options: keyword arguments forwarded to ``boto3.client("s3", ...)``.
    :param options: accepted for signature parity with the other writers; unused.
    """
    s3_url = parse_s3_url(url)
    dirpath = tempfile.gettempdir()
    filename = fsutil.get_filename(s3_url["key"])
    filepath = fsutil.join_path(dirpath, filename)
    fsutil.write_file(filepath, content)
    try:
        s3 = boto3.client("s3", **s3_options)
        try:
            s3.upload_file(filepath, s3_url["bucket"], s3_url["key"])
        finally:
            # Release the client even when the upload fails.
            s3.close()
    finally:
        # Never leave the staged copy behind, whether or not the upload worked.
        fsutil.remove_file(filepath)
6 changes: 0 additions & 6 deletions benedict/serializers/__init__.py
Expand Up @@ -79,10 +79,4 @@ def get_serializer_by_format(format):
format_key = (format or "").lower().strip()
format_key = re.sub(r"[\s\-\_]*", "", format_key)
serializer = _SERIALIZERS_BY_EXTENSION.get(format_key, None)
if not serializer:
raise ValueError(f"Invalid format: {format}.")
return serializer


def get_serializers_extensions():
return list(_SERIALIZERS_EXTENSIONS)
1 change: 1 addition & 0 deletions requirements-test.txt
Expand Up @@ -2,4 +2,5 @@ codecov == 2.1.12
coverage == 6.5.0
flake8 == 5.0.4
orjson == 3.8.0
python-decouple == 3.6
tox == 3.26.0
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
boto3 == 1.24.89
ftfy == 6.1.1
mailchecker == 5.0.2
openpyxl == 3.0.10
Expand Down
1 change: 1 addition & 0 deletions setup.py
Expand Up @@ -94,6 +94,7 @@
"unique",
],
install_requires=[
"boto3 >= 1.24.89, < 2.0.0",
"ftfy >= 6.0.0, < 7.0.0",
"mailchecker >= 4.1.0, < 6.0.0",
"openpyxl >= 3.0.0, < 4.0.0",
Expand Down
4 changes: 2 additions & 2 deletions tests/dicts/base/test_base_dict.py
Expand Up @@ -234,12 +234,12 @@ def test__str__with_pointer(self):
def test__unicode__(self):
d = BaseDict()
d["name"] = "pythòn-bènèdìçt"
print(unicode(d))
# print(unicode(d))

@unittest.skipIf(sys.version_info[0] > 2, "No unicode in Python > 2")
def test__unicode__with_pointer(self):
d = BaseDict({"name": "pythòn-bènèdìçt"})
print(unicode(d))
# print(unicode(d))

def test_clear(self):
d = {
Expand Down
64 changes: 63 additions & 1 deletion tests/dicts/io/test_io_dict_xls.py
Expand Up @@ -2,6 +2,8 @@

from benedict.dicts.io import IODict

from decouple import config

from .test_io_dict import io_dict_test_case


Expand Down Expand Up @@ -104,7 +106,8 @@ def test_from_xls_with_valid_url_valid_content(self):
with self.subTest(
msg=f"test_from_xls_({extension})_with_valid_url_valid_content"
):
url = f"https://github.com/fabiocaccamo/python-benedict/raw/xls/tests/dicts/io/input/valid-content.{extension}"
# url = f"https://github.com/fabiocaccamo/python-benedict/raw/s3/tests/dicts/io/input/valid-content.{extension}"
url = f"https://github.com/fabiocaccamo/python-benedict/raw/master/tests/dicts/io/input/valid-content.{extension}"
# static method
d = IODict.from_xls(url)
self.assertTrue(isinstance(d, dict))
Expand All @@ -118,6 +121,65 @@ def test_from_xls_with_valid_url_valid_content(self):
self.assertTrue(isinstance(d, dict))
self.assertEqual(d, expected_dict)

def test_from_xls_with_valid_s3_url_valid_content(self):
    """
    Load the xls fixtures from S3 via the static method and the constructor
    (explicit and implicit format) and compare against the expected dict.

    Requires AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY to be configured.
    """
    aws_access_key_id = config("AWS_ACCESS_KEY_ID", default=None)
    aws_secret_access_key = config("AWS_SECRET_ACCESS_KEY", default=None)
    if not all([aws_access_key_id, aws_secret_access_key]):
        # don't use s3 on GH CI: report the test as skipped rather than
        # letting it silently count as passed.
        self.skipTest("AWS credentials not configured.")
    s3_options = {
        "aws_access_key_id": aws_access_key_id,
        "aws_secret_access_key": aws_secret_access_key,
    }
    expected_dict = {
        "values": [
            {
                "mon": 10,
                "tue": 11,
                "wed": 12,
                "thu": 13,
                "fri": 14,
                "sat": 15,
                "sun": 16,
            },
            {
                "mon": 20,
                "tue": 21,
                "wed": 22,
                "thu": 23,
                "fri": 24,
                "sat": 25,
                "sun": 26,
            },
            {
                "mon": 30,
                "tue": 31,
                "wed": 32,
                "thu": 33,
                "fri": 34,
                "sat": 35,
                "sun": 36,
            },
        ]
    }
    for extension in self._extensions:
        with self.subTest(
            msg=f"test_from_xls_({extension})_with_valid_s3_url_valid_content"
        ):
            url = f"s3://python-benedict/valid-content.{extension}"
            # static method
            d = IODict.from_xls(url, s3_options=s3_options)
            self.assertTrue(isinstance(d, dict))
            self.assertEqual(d, expected_dict)
            # constructor explicit format
            d = IODict(url, format=extension, s3_options=s3_options)
            self.assertTrue(isinstance(d, dict))
            self.assertEqual(d, expected_dict)
            # constructor implicit format
            d = IODict(url, s3_options=s3_options)
            self.assertTrue(isinstance(d, dict))
            self.assertEqual(d, expected_dict)

def test_from_xls_with_valid_file_valid_content_custom_sheet_by_index_and_columns(
self,
):
Expand Down