Auto track binary files #828

Merged (9 commits, Apr 21, 2022)
Changes from 2 commits
20 changes: 8 additions & 12 deletions src/huggingface_hub/hf_api.py
@@ -569,23 +569,19 @@ def _validate_or_retrieve_token(
         function_name: Optional[str] = None,
     ):
         """
-        Args:
         Retrieves and validates stored token or validates passed token.
+        Args:
             token (``str``, `optional`):
-                Hugging Face token. Will default to the locally saved token if
-                not provided.
+                Hugging Face token. Will default to the locally saved token if not provided.
             name (``str``, `optional`):
-                Name of the repository. This is deprecated in favor of repo_id
-                and will be removed in v0.7.
+                Name of the repository. This is deprecated in favor of repo_id and will be removed in v0.7.
             function_name (``str``, `optional`):
-                If _validate_or_retrieve_token is called from a function, name
-                of that function to be passed inside deprecation warning.
+                If _validate_or_retrieve_token is called from a function, name of that function to be passed inside deprecation warning.
         Returns:
             Validated token and the name of the repository.
         Raises:
-            :class:`EnvironmentError`: If the token is not passed and there's no
-            token saved locally. :class:`ValueError`: If organization token or
-            invalid token is passed.
+            :class:`EnvironmentError`: If the token is not passed and there's no token saved locally.
+            :class:`ValueError`: If organization token or invalid token is passed.
         """
         if token is None or token is True:
             token = HfFolder.get_token()

@@ -1851,8 +1847,8 @@ def get_token(cls) -> Optional[str]:
         """
         Get token or None if not existent.

-        Note that a token can be also provided using the
-        `HUGGING_FACE_HUB_TOKEN` environment variable.
+        Note that a token can be also provided using the `HUGGING_FACE_HUB_TOKEN`
+        environment variable.

         Returns:
             `str` or `None`: The token, `None` if it doesn't exist.
35 changes: 22 additions & 13 deletions src/huggingface_hub/repository.py
@@ -239,7 +239,7 @@ def is_binary_file(filename: Union[str, Path]) -> bool:
     """
     try:
         with open(filename) as f:
-            content = f.read(512)
+            content = f.read()

Contributor:

Suggested change:
-            content = f.read()
+            content = f.read(1024)  # or 512 if we want to be consistent with the backend

Member Author:

This was discussed above here: #828 (comment)

Do you disagree with the conclusion?

Contributor:

Didn't see that. Yeah, I don't think we should ever read 11GB of data into memory. This will most certainly crash most people's systems. I'd be happy if we read something like 10MB instead of 1MB to reduce the probability of a false detection, which should address those concerns. If we really do want to read a lot more, we should read in chunks. As Python's docs state:

To read a file’s contents, call f.read(size), which reads some quantity of data and returns it as a string (in text mode) or bytes object (in binary mode). size is an optional numeric argument. When size is omitted or negative, the entire contents of the file will be read and returned; it’s your problem if the file is twice as large as your machine’s memory.

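(For illustration only, not part of the PR or the thread: a chunked null-byte check along the lines the reviewer describes could look like the sketch below; the helper name and chunk size are arbitrary.)

def contains_null_byte(path, chunk_size=1024 * 1024):
    # Read in bounded chunks so memory use stays constant no matter how
    # large the file is; stop as soon as a null byte is found.
    with open(path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                return False
            if b"\x00" in chunk:
                return True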
Member Author:

The solution currently implemented loads a maximum of 10MB in memory when calling git_add: it tracks large files before tracking binary files.

When tracking binary files, it looks at files which are not yet tracked with LFS, eliminating large files.

Instead of the 1MB limit that you propose here, we could instead put a max of 10MB here, which will only be triggered when auto_track_binary_files is called independently of git_add (which is a possibility!).

Contributor:

Yes, but you also want to have this method public, which means people can call it before having tracked large files.

I'm happy with your suggestion of 10MB.

Member Author:

Sounds good, thanks for your review. Addressed in cbfdce5.


     # Check for the presence of the null character in the string
     return "\x00" in content
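The direction agreed on in this thread, reading at most a bounded prefix of the file instead of the whole content, would make is_binary_file look roughly like the sketch below. The exact change made in cbfdce5 is not shown in this view, and the sketch reads bytes rather than text (which sidesteps the UnicodeDecodeError handling implied by the try block above), so treat it as an approximation rather than the merged code:

from pathlib import Path
from typing import Union

def is_binary_file(filename: Union[str, Path]) -> bool:
    # Heuristic: a file is considered binary if its first 10MB contain a
    # null byte. Reading only a bounded prefix keeps memory use predictable
    # even for multi-gigabyte files.
    with open(filename, "rb") as f:
        content = f.read(10 * 1024 * 1024)
    return b"\x00" in content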
@@ -1023,15 +1023,22 @@ def auto_track_binary_files(self, pattern: Optional[str] = ".") -> List[str]:
                 continue

             path_to_file = os.path.join(os.getcwd(), self.local_dir, filename)
-            is_binary = is_binary_file(path_to_file)

-            if (
-                is_binary
-                and not is_tracked_with_lfs(path_to_file)
-                and not is_git_ignored(path_to_file)
-            ):
-                self.lfs_track(filename)
-                files_to_be_tracked_with_lfs.append(filename)
+            if not (is_tracked_with_lfs(path_to_file) or is_git_ignored(path_to_file)):
+                size_in_mb = os.path.getsize(path_to_file) / (1024 * 1024)
+
+                if size_in_mb >= 10:
+                    logger.warning(
+                        "Parsing a large file to check if binary or not. Tracking large "
+                        "files using `repository.auto_track_large_files` is recommended "
+                        "so as to not load the full file in memory."
+                    )
+
+                is_binary = is_binary_file(path_to_file)
+
+                if is_binary:
+                    self.lfs_track(filename)
+                    files_to_be_tracked_with_lfs.append(filename)

         # Cleanup the .gitattributes if files were deleted
         self.lfs_untrack(deleted_files)
@@ -1152,10 +1159,12 @@ def git_add(
                 be automatically tracked.
         """
         if auto_lfs_track:
-            tracked_files = [
-                *self.auto_track_large_files(pattern),
-                *self.auto_track_binary_files(pattern),
-            ]
+            # Track files according to their size (>=10MB)
+            tracked_files = self.auto_track_large_files(pattern)
+
+            # Read the remaining files and track them if they're binary
+            tracked_files.extend(self.auto_track_binary_files(pattern))
+
             if tracked_files:
                 logger.warning(
                     f"Adding files tracked by Git LFS: {tracked_files}. This may take a bit of time if the files are large."
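The net effect of the two hunks above is that a single git_add(auto_lfs_track=True) call first routes files of 10MB or more to Git LFS and only then scans the remaining, not-yet-tracked files for binary content. An illustrative usage sketch (the local path, file names and commit message are hypothetical, and the repository is assumed to already be cloned locally):

from huggingface_hub import Repository

repo = Repository(local_dir="./my-model")

# Large files (>=10MB) are tracked with LFS first, then the remaining
# untracked files are scanned for a null byte and tracked if binary,
# so no huge file is read into memory just for binary detection.
repo.git_add(auto_lfs_track=True)
repo.git_commit("Add model weights and tokenizer")
repo.git_push()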
152 changes: 96 additions & 56 deletions tests/test_repository.py
@@ -1075,18 +1075,12 @@ def test_is_tracked_with_lfs_with_pattern(self):
# This content is 20MB (over 10MB)
large_file = [100] * int(4e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f:
f.write(json.dumps(large_file))

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(small_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

os.makedirs(f"{WORKING_REPO_DIR}/dir", exist_ok=True)

with open(f"{WORKING_REPO_DIR}/dir/large_file.txt", "w+") as f:
@@ -1095,30 +1089,20 @@ def test_is_tracked_with_lfs_with_pattern(self):
with open(f"{WORKING_REPO_DIR}/dir/small_file.txt", "w+") as f:
f.write(json.dumps(small_file))

with open(f"{WORKING_REPO_DIR}/dir/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.auto_track_large_files("dir")
repo.auto_track_binary_files("dir")

self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "large_file.txt"))
)
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
)
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
)
self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "dir/large_file.txt"))
)
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "dir/small_file.txt"))
)
self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "dir/binary_file.txt"))
)

def test_auto_track_large_files(self):
repo = Repository(WORKING_REPO_DIR)
@@ -1129,27 +1113,41 @@ def test_auto_track_large_files(self):
# This content is 20MB (over 10MB)
large_file = [100] * int(4e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f:
f.write(json.dumps(large_file))

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(small_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.auto_track_large_files()
repo.auto_track_binary_files()

self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "large_file.txt"))
)
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
)

def test_auto_track_binary_files(self):
repo = Repository(WORKING_REPO_DIR)

# This content is non-binary
non_binary_file = [100] * int(1e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/non_binary_file.txt", "w+") as f:

Contributor:

We shouldn't write to the working dir; we should work with Python's temp files and folders.

Member Author:

The WORKING_REPO_DIR is a folder that is created at the start of the test, and which is reset at the beginning of every test, see here:

    def setUp(self):
        if os.path.exists(WORKING_REPO_DIR):
            shutil.rmtree(WORKING_REPO_DIR, onerror=set_write_permission_and_retry)
        logger.info(
            f"Does {WORKING_REPO_DIR} exist: {os.path.exists(WORKING_REPO_DIR)}"
        )
        self.REPO_NAME = repo_name()
        self._repo_url = self._api.create_repo(
            repo_id=self.REPO_NAME, token=self._token
        )
        self._api.upload_file(
            path_or_fileobj=BytesIO(b"some initial binary data: \x00\x01"),
            path_in_repo="random_file.txt",
            repo_id=f"{USER}/{self.REPO_NAME}",
            token=self._token,
        )

This isn't the working directory in which the test is run, but a folder nested in the fixtures folder:

    WORKING_REPO_DIR = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "fixtures/working_repo_2"
    )

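(Editorial aside: the reviewer's alternative, Python temporary directories instead of a shared fixtures folder, would look something like the sketch below; the class name is illustrative and this is not what the test suite actually does.)

import tempfile
import unittest

class RepositoryCommonTest(unittest.TestCase):
    def setUp(self):
        # Each test gets a fresh working directory that is removed
        # automatically, instead of resetting a shared fixtures folder.
        self._tmp_dir = tempfile.TemporaryDirectory()
        self.working_dir = self._tmp_dir.name

    def tearDown(self):
        self._tmp_dir.cleanup()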
f.write(json.dumps(non_binary_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.auto_track_binary_files()

self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "non_binary)file.txt"))
)
self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
)
@@ -1160,17 +1158,14 @@ def test_auto_track_large_files_ignored_with_gitignore(self):
# This content is 20MB (over 10MB)
large_file = [100] * int(4e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

# Test nested gitignores
os.makedirs(f"{WORKING_REPO_DIR}/directory")

with open(f"{WORKING_REPO_DIR}/.gitignore", "w+") as f:
f.write("large_file.txt\nbinary_file.txt")
f.write("large_file.txt")

with open(f"{WORKING_REPO_DIR}/directory/.gitignore", "w+") as f:
f.write("large_file_3.txt\nbinary_file_3.txt")
f.write("large_file_3.txt")

with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f:
f.write(json.dumps(large_file))
@@ -1186,20 +1181,6 @@ def test_auto_track_large_files_ignored_with_gitignore(self):

repo.auto_track_large_files()

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/binary_file_2.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/directory/binary_file_3.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/directory/binary_file_4.txt", "w+") as f:
f.write(binary_file)

repo.auto_track_binary_files()

# Large files
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "large_file.txt"))
@@ -1219,6 +1200,35 @@ def test_auto_track_large_files_ignored_with_gitignore(self):
)
)

def test_auto_track_binary_files_ignored_with_gitignore(self):
repo = Repository(WORKING_REPO_DIR)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

# Test nested gitignores
os.makedirs(f"{WORKING_REPO_DIR}/directory")

Contributor:

Same here.

Member Author:

Same answer as above :)

with open(f"{WORKING_REPO_DIR}/.gitignore", "w+") as f:
f.write("binary_file.txt")

with open(f"{WORKING_REPO_DIR}/directory/.gitignore", "w+") as f:
f.write("binary_file_3.txt")

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/binary_file_2.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/directory/binary_file_3.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/directory/binary_file_4.txt", "w+") as f:
f.write(binary_file)

repo.auto_track_binary_files()

# Binary files
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
@@ -1238,7 +1248,7 @@ def test_auto_track_large_files_ignored_with_gitignore(self):
)
)

def test_auto_track_files_through_git_add(self):
def test_auto_track_large_files_through_git_add(self):
repo = Repository(WORKING_REPO_DIR)

# This content is 5MB (under 10MB)
@@ -1247,18 +1257,12 @@ def test_auto_track_files_through_git_add(self):
# This content is 20MB (over 10MB)
large_file = [100] * int(4e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f:
f.write(json.dumps(large_file))

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(small_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.git_add(auto_lfs_track=True)

self.assertTrue(
@@ -1267,11 +1271,32 @@ def test_auto_track_files_through_git_add(self):
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
)

def test_auto_track_binary_files_through_git_add(self):
repo = Repository(WORKING_REPO_DIR)

# This content is non binary
non_binary_file = [100] * int(1e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(non_binary_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.git_add(auto_lfs_track=True)

self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "non_binary_file.txt"))
)
self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
)

def test_auto_no_track_files_through_git_add(self):
def test_auto_no_track_large_files_through_git_add(self):
repo = Repository(WORKING_REPO_DIR)

# This content is 5MB (under 10MB)
@@ -1280,18 +1305,12 @@ def test_auto_no_track_files_through_git_add(self):
# This content is 20MB (over 10MB)
large_file = [100] * int(4e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f:
f.write(json.dumps(large_file))

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(small_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.git_add(auto_lfs_track=False)

self.assertFalse(
@@ -1300,6 +1319,27 @@ def test_auto_no_track_files_through_git_add(self):
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
)

def test_auto_no_track_binary_files_through_git_add(self):
repo = Repository(WORKING_REPO_DIR)

# This content is non-binary
non_binary_file = [100] * int(1e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(non_binary_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.git_add(auto_lfs_track=False)

self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "non_binary_file.txt"))
)
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
)