Skip to content

Commit

Permalink
Drop File limits
Browse files Browse the repository at this point in the history
  • Loading branch information
shnela committed Nov 4, 2022
1 parent 6fca974 commit 5e00f9a
Show file tree
Hide file tree
Showing 9 changed files with 249 additions and 148 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
- Building a package with Poetry ([#1069](https://github.com/neptune-ai/neptune-client/pull/1069))
- Automatically convert image and html like assignments to uploads ([#1006](https://github.com/neptune-ai/neptune-client/pull/1006))
- File.from_stream does not load content into memory ([#1065](https://github.com/neptune-ai/neptune-client/pull/1065))
- Drop limits for in-memory Files ([#1070](https://github.com/neptune-ai/neptune-client/pull/1070))

## neptune-client 0.16.11

Expand Down
137 changes: 71 additions & 66 deletions e2e_tests/standard/test_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import io
import os
import random
import uuid
Expand All @@ -29,6 +30,8 @@
fake,
)
from e2e_tests.utils import (
SIZE_1KB,
SIZE_1MB,
initialize_container,
tmp_context,
)
Expand All @@ -43,6 +46,7 @@
File,
FileSet,
)
from neptune.new.types.atoms.file import FileType


class TestUpload(BaseE2ETest):
Expand All @@ -52,66 +56,51 @@ def test_using_new_api(self, container: MetadataContainer):
assert container._backend._client_config.has_feature(OptionalFeatures.MULTIPART_UPLOAD)
assert isinstance(container._backend._client_config.multipart_config, MultipartConfig)

@pytest.mark.parametrize("container", AVAILABLE_CONTAINERS, indirect=True)
@pytest.mark.parametrize(
"file_size",
[
pytest.param(10 * 2**20, id="big"), # 10 MB, multipart
pytest.param(100 * 2**10, id="small"), # 100 kB, single upload
],
)
def test_single_file(self, container: MetadataContainer, file_size: int):
def _test_upload(self, container: MetadataContainer, file_type: FileType, file_size: int):
key = self.gen_key()
extension = fake.file_extension()
filename = fake.file_name(extension=extension)
downloaded_filename = fake.file_name()
content = os.urandom(file_size)

with tmp_context():
# create file_size file
if file_type is FileType.LOCAL_FILE:
filename = fake.file_name(extension=extension)
with open(filename, "wb") as file:
file.write(b"\0" * file_size)
container[key].upload(filename)

container.sync()
container[key].download(downloaded_filename)

assert container[key].fetch_extension() == extension
assert os.path.getsize(downloaded_filename) == file_size
with open(downloaded_filename, "rb") as file:
content = file.read()
assert len(content) == file_size
assert content == b"\0" * file_size

@pytest.mark.parametrize("container", AVAILABLE_CONTAINERS, indirect=True)
@pytest.mark.parametrize(
"file_size",
[
pytest.param(10 * 2**20, id="big"), # 10 MB, multipart
pytest.param(100 * 2**10, id="small"), # 100 kB, single upload
],
)
def test_in_memory_file(self, container: MetadataContainer, file_size: int):
key = self.gen_key()
extension = fake.file_extension()
downloaded_filename = fake.file_name()
expected_content = os.urandom(file_size)
file.write(content)

container[key].upload(File.from_content(expected_content, extension))
file = File.from_path(filename)
elif file_type is FileType.IN_MEMORY:
file = File.from_content(content, extension)
elif file_type is FileType.STREAM:
file = File.from_stream(io.BytesIO(content), extension=extension)
else:
raise ValueError()

container[key].upload(file)
container.sync()
container[key].download(downloaded_filename)

assert container[key].fetch_extension() == extension
assert os.path.getsize(downloaded_filename) == file_size
with open(downloaded_filename, "rb") as file:
content = file.read()
downloaded_content = file.read()
assert len(content) == file_size
assert content == expected_content
assert downloaded_content == content

@pytest.mark.parametrize("container", AVAILABLE_CONTAINERS, indirect=True)
@pytest.mark.parametrize("file_type", list(FileType))
def test_single_upload(self, container: MetadataContainer, file_type: FileType):
    """Exercise the single-part upload path with a 100 kB file of every file type."""
    self._test_upload(container, file_type, 100 * SIZE_1KB)

@pytest.mark.parametrize("container", ["run"], indirect=True)
def test_multipart_upload(self, container: MetadataContainer):
    """Exercise the multipart upload path with a 10 MB in-memory file."""
    self._test_upload(container, FileType.IN_MEMORY, 10 * SIZE_1MB)

def test_single_file_changed_during_upload(self, environment, monkeypatch):
def test_file_changed_during_upload(self, environment, monkeypatch):
key = self.gen_key()
file_size = 11 * 2**20 # 11 MB, multipart with 3 parts
intermediate_size = 6 * 2**20 # 6 MB, second part < 5MB
file_size = 11 * SIZE_1MB # 11 MB, multipart with 3 parts
intermediate_size = 6 * SIZE_1MB # 6 MB, second part < 5MB
filename = fake.file_name()
downloaded_filename = fake.file_name()

Expand Down Expand Up @@ -167,7 +156,7 @@ def __call__(self, *args, **kwargs):
@pytest.mark.parametrize("container", ["run"], indirect=True)
def test_replace_float_attribute_with_uploaded_file(self, container: MetadataContainer):
key = self.gen_key()
file_size = 100 * 2**10 # 100 kB
file_size = 100 * SIZE_1KB # 100 kB
filename = fake.file_name()
downloaded_filename = fake.file_name()

Expand All @@ -193,17 +182,19 @@ def test_replace_float_attribute_with_uploaded_file(self, container: MetadataCon
assert len(content) == file_size
assert content == b"\0" * file_size

@pytest.mark.parametrize("container", AVAILABLE_CONTAINERS, indirect=True)
def test_fileset(self, container: MetadataContainer):

class TestFileSet(BaseE2ETest):
def _test_fileset(self, container: MetadataContainer, large_file_size: int, small_files_no: int):
key = self.gen_key()
large_filesize = 10 * 2**20 # 10MB
large_filename = fake.file_name()
small_files = [(f"{uuid.uuid4()}.{fake.file_extension()}", fake.sentence().encode("utf-8")) for _ in range(100)]
small_files = [
(f"{uuid.uuid4()}.{fake.file_extension()}", fake.sentence().encode("utf-8")) for _ in range(small_files_no)
]

with tmp_context():
# create single large file (multipart) and a lot of very small files
with open(large_filename, "wb") as file:
file.write(b"\0" * large_filesize)
file.write(b"\0" * large_file_size)
for filename, contents in small_files:
with open(filename, "wb") as file:
file.write(contents)
Expand All @@ -223,8 +214,8 @@ def test_fileset(self, container: MetadataContainer):
assert set(zipped.namelist()) == {large_filename, "/"}
with zipped.open(large_filename, "r") as file:
content = file.read()
assert len(content) == large_filesize
assert content == b"\0" * large_filesize
assert len(content) == large_file_size
assert content == b"\0" * large_file_size

# when small files as fileset uploaded
container[key].upload_files(small_filenames)
Expand All @@ -237,8 +228,8 @@ def test_fileset(self, container: MetadataContainer):
assert set(zipped.namelist()) == {large_filename, "/", *small_filenames}
with zipped.open(large_filename, "r") as file:
content = file.read()
assert len(content) == large_filesize
assert content == b"\0" * large_filesize
assert len(content) == large_file_size
assert content == b"\0" * large_file_size
for filename, expected_content in small_files:
with zipped.open(filename, "r") as file:
content = file.read()
Expand All @@ -260,8 +251,22 @@ def test_fileset(self, container: MetadataContainer):
assert len(content) == len(expected_content)
assert content == expected_content

@pytest.mark.parametrize("container", AVAILABLE_CONTAINERS, indirect=True)
def test_fileset(self, container: MetadataContainer):
    """Small fileset scenario: a 100 kB large file (single upload) plus 10 small files."""
    self._test_fileset(container, large_file_size=100 * SIZE_1KB, small_files_no=10)

@pytest.mark.parametrize("container", ["run"], indirect=True)
def test_fileset_with_multipart(self, container: MetadataContainer):
    """Large fileset scenario: a 10 MB large file (multipart upload) plus 100 small files."""
    self._test_fileset(container, large_file_size=10 * SIZE_1MB, small_files_no=100)

@classmethod
def _gen_tree_paths(cls, depth, width=3) -> Set:
def _gen_tree_paths(cls, depth, width=2) -> Set:
"""Generates all subdirectories of some random tree directory structure"""
this_level_dirs = (fake.word() + "/" for _ in range(width))
if depth == 1:
Expand All @@ -272,15 +277,15 @@ def _gen_tree_paths(cls, depth, width=3) -> Set:
subpaths.update(new_paths)
return subpaths

@pytest.mark.parametrize("container", ["project", "run"], indirect=True)
@pytest.mark.parametrize("container", ["run"], indirect=True)
def test_fileset_nested_structure(self, container: MetadataContainer):
key = self.gen_key()
possible_paths = self._gen_tree_paths(depth=3)

small_files = [
(
f"{path}{uuid.uuid4()}.{fake.file_extension()}",
os.urandom(random.randint(10**3, 10**6)),
os.urandom(random.randint(SIZE_1KB, 100 * SIZE_1KB)),
)
for path in possible_paths
]
Expand Down Expand Up @@ -317,13 +322,13 @@ def test_fileset_nested_structure(self, container: MetadataContainer):
assert len(content) == len(expected_content)
assert content == expected_content

@pytest.mark.parametrize("container", ["project", "run"], indirect=True)
@pytest.mark.parametrize("container", ["run"], indirect=True)
def test_reset_fileset(self, container: MetadataContainer):
key = self.gen_key()
filename1 = fake.file_name()
filename2 = fake.file_name()
content1 = os.urandom(random.randint(10**3, 10**6))
content2 = os.urandom(random.randint(10**3, 10**6))
content1 = os.urandom(random.randint(SIZE_1KB, 100 * SIZE_1KB))
content2 = os.urandom(random.randint(SIZE_1KB, 100 * SIZE_1KB))

with tmp_context():
# create file1 and file2
Expand All @@ -348,14 +353,14 @@ def test_reset_fileset(self, container: MetadataContainer):
assert len(content) == len(content2)
assert content == content2

@pytest.mark.parametrize("container", ["project", "run"], indirect=True)
@pytest.mark.parametrize("container", ["run"], indirect=True)
@pytest.mark.parametrize("delete_attribute", [True, False])
def test_single_file_override(self, container: MetadataContainer, delete_attribute: bool):
key = self.gen_key()
filename1 = fake.file_name()
filename2 = fake.file_name()
content1 = os.urandom(random.randint(10**3, 10**6))
content2 = os.urandom(random.randint(10**3, 10**6))
content1 = os.urandom(random.randint(SIZE_1KB, 100 * SIZE_1KB))
content2 = os.urandom(random.randint(SIZE_1KB, 100 * SIZE_1KB))
downloaded_filename = fake.file_name()

with tmp_context():
Expand Down Expand Up @@ -386,13 +391,13 @@ def test_single_file_override(self, container: MetadataContainer, delete_attribu
assert len(content) == len(content2)
assert content == content2

@pytest.mark.parametrize("container", ["project", "run"], indirect=True)
@pytest.mark.parametrize("container", ["run"], indirect=True)
@pytest.mark.parametrize("delete_attribute", [True, False])
def test_fileset_file_override(self, container: MetadataContainer, delete_attribute: bool):
key = self.gen_key()
filename = fake.file_name()
content1 = os.urandom(random.randint(10**3, 10**6))
content2 = os.urandom(random.randint(10**3, 10**6))
content1 = os.urandom(random.randint(SIZE_1KB, 100 * SIZE_1KB))
content2 = os.urandom(random.randint(SIZE_1KB, 100 * SIZE_1KB))

with tmp_context():
# create file
Expand Down
4 changes: 2 additions & 2 deletions e2e_tests/standard/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
fake,
)
from e2e_tests.utils import (
SIZE_1KB,
generate_image,
image_to_png,
tmp_context,
Expand Down Expand Up @@ -63,8 +64,7 @@ def test_log_strings(self, container: MetadataContainer):
@pytest.mark.parametrize("container", AVAILABLE_CONTAINERS, indirect=True)
def test_log_images(self, container: MetadataContainer):
key = self.gen_key()
# images with size between 200KB - 12MB
images = list(generate_image(size=2**n) for n in range(8, 12))
images = [generate_image(size=32 * SIZE_1KB) for _ in range(4)]

container[key].log(images[0])
container[key].log(images[1:])
Expand Down
10 changes: 9 additions & 1 deletion e2e_tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
"reinitialize_container",
"modified_environ",
"catch_time",
"SIZE_1KB",
"SIZE_1MB",
]

import io
Expand All @@ -32,6 +34,7 @@
import tempfile
from contextlib import contextmanager
from datetime import datetime
from math import sqrt
from time import perf_counter

import numpy
Expand All @@ -51,6 +54,9 @@ def _remove_file_if_exists(filepath):
pass


# Byte-size constants shared by the e2e tests.
SIZE_1KB = 1 << 10  # 1 KiB
SIZE_1MB = 1 << 20  # 1 MiB

# init kwargs which significantly reduce operations noise
DISABLE_SYSLOG_KWARGS = {
"capture_stdout": False,
Expand Down Expand Up @@ -90,7 +96,9 @@ def tmp_context():


def generate_image(*, size: int) -> Image:
random_numbers = numpy.random.rand(size, size, 3) * 255
"""generate image of size in bytes"""
width = int(sqrt(size / 3)) # 3 bytes per one pixel in square image
random_numbers = numpy.random.rand(width, width, 3) * 255
return Image.fromarray(random_numbers.astype("uint8")).convert("RGB")


Expand Down
7 changes: 1 addition & 6 deletions src/neptune/new/internal/types/file_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,7 @@

from neptune.common.exceptions import NeptuneException
from neptune.new.exceptions import StreamAlreadyUsedException
from neptune.new.internal.utils import (
limits,
verify_type,
)
from neptune.new.internal.utils import verify_type


class FileType(enum.Enum):
Expand Down Expand Up @@ -98,8 +95,6 @@ class InMemoryComposite(FileComposite):
file_type = FileType.IN_MEMORY

def __init__(self, content: Union[str, bytes], extension: Optional[str] = None):
if limits.file_size_exceeds_limit(len(content)):
content = b""
if isinstance(content, str):
ext = "txt"
content = content.encode("utf-8")
Expand Down
10 changes: 0 additions & 10 deletions src/neptune/new/internal/utils/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
from pandas import DataFrame

from neptune.new.exceptions import PlotlyIncompatibilityException
from neptune.new.internal.utils import limits
from neptune.new.internal.utils.logger import logger

_logger = logging.getLogger(__name__)
Expand All @@ -55,27 +54,18 @@ def pilimage_fromarray():
def get_image_content(image) -> Optional[bytes]:
content = _image_to_bytes(image)

if limits.image_size_exceeds_limit(len(content)):
return None

return content


def get_html_content(chart) -> Optional[str]:
content = _to_html(chart)

if limits.file_size_exceeds_limit(len(content)):
return None

return content


def get_pickle_content(obj) -> Optional[bytes]:
content = _export_pickle(obj)

if limits.file_size_exceeds_limit(len(content)):
return None

return content


Expand Down

0 comments on commit 5e00f9a

Please sign in to comment.