Skip to content

Commit

Permalink
ENH Auto track binary files in Repository (#828)
Browse files Browse the repository at this point in the history
* Auto track binary files

* Add null character check

* Tests

* Apply suggestions from code review

Co-authored-by: Omar Sanseviero <osanseviero@gmail.com>

* Only read 512 bytes

* Address Eliott's comments

* Split test

* Address review comments

* 10MB maximum

Co-authored-by: Omar Sanseviero <osanseviero@gmail.com>
  • Loading branch information
LysandreJik and osanseviero committed Apr 21, 2022
1 parent 18e6e87 commit 45aec8e
Show file tree
Hide file tree
Showing 2 changed files with 199 additions and 4 deletions.
82 changes: 78 additions & 4 deletions src/huggingface_hub/repository.py
Expand Up @@ -226,6 +226,31 @@ def is_git_ignored(filename: Union[str, Path]) -> bool:
return is_ignored


def is_binary_file(filename: Union[str, Path]) -> bool:
"""
Check if file is a binary file.
Args:
filename (`str` or `Path`):
The filename to check.
Returns:
`bool`: `True` if the file passed is a binary file, `False` otherwise.
"""
try:
with open(filename, "rb") as f:
content = f.read(10 * (1024**2)) # Read a maximum of 10MB

# Code sample taken from the following stack overflow thread
# https://stackoverflow.com/questions/898669/how-can-i-detect-if-a-file-is-binary-non-text-in-python/7392391#7392391
text_chars = bytearray(
{7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F}
)
return bool(content.translate(None, text_chars))
except UnicodeDecodeError:
return True


def files_to_be_staged(pattern: str, folder: Union[str, Path]) -> List[str]:
"""
Returns a list of filenames that are to be staged.
Expand Down Expand Up @@ -485,8 +510,8 @@ def __init__(
skip_lfs_files (`bool`, *optional*, defaults to `False`):
whether to skip git-LFS files or not.
client (`HfApi`, *optional*):
Instance of HfApi to use when calling the HF Hub API.
A new instance will be created if this is left to `None`.
Instance of HfApi to use when calling the HF Hub API. A new
instance will be created if this is left to `None`.
"""

os.makedirs(local_dir, exist_ok=True)
Expand Down Expand Up @@ -981,6 +1006,49 @@ def lfs_enable_largefiles(self):
except subprocess.CalledProcessError as exc:
raise EnvironmentError(exc.stderr)

def auto_track_binary_files(self, pattern: Optional[str] = ".") -> List[str]:
"""
Automatically track binary files with git-lfs.
Args:
pattern (`str`, *optional*, defaults to "."):
The pattern with which to track files that are binary.
Returns:
`List[str]`: List of filenames that are now tracked due to being
binary files
"""
files_to_be_tracked_with_lfs = []

deleted_files = self.list_deleted_files()

for filename in files_to_be_staged(pattern, folder=self.local_dir):
if filename in deleted_files:
continue

path_to_file = os.path.join(os.getcwd(), self.local_dir, filename)

if not (is_tracked_with_lfs(path_to_file) or is_git_ignored(path_to_file)):
size_in_mb = os.path.getsize(path_to_file) / (1024 * 1024)

if size_in_mb >= 10:
logger.warning(
"Parsing a large file to check if binary or not. Tracking large "
"files using `repository.auto_track_large_files` is recommended "
"so as to not load the full file in memory."
)

is_binary = is_binary_file(path_to_file)

if is_binary:
self.lfs_track(filename)
files_to_be_tracked_with_lfs.append(filename)

# Cleanup the .gitattributes if files were deleted
self.lfs_untrack(deleted_files)

return files_to_be_tracked_with_lfs

def auto_track_large_files(self, pattern: Optional[str] = ".") -> List[str]:
"""
Automatically track large files (files that weigh more than 10MBs) with
Expand Down Expand Up @@ -1090,11 +1158,17 @@ def git_add(
pattern (`str`, *optional*, defaults to "."):
The pattern with which to add files to staging.
auto_lfs_track (`bool`, *optional*, defaults to `False`):
Whether to automatically track large files with git-lfs. Any
file over 10MB in size will be automatically tracked.
Whether to automatically track large and binary files with
git-lfs. Any file over 10MB in size, or in binary format, will
be automatically tracked.
"""
if auto_lfs_track:
# Track files according to their size (>=10MB)
tracked_files = self.auto_track_large_files(pattern)

# Read the remaining files and track them if they're binary
tracked_files.extend(self.auto_track_binary_files(pattern))

if tracked_files:
logger.warning(
f"Adding files tracked by Git LFS: {tracked_files}. This may take a bit of time if the files are large."
Expand Down
121 changes: 121 additions & 0 deletions tests/test_repository.py
Expand Up @@ -1128,6 +1128,30 @@ def test_auto_track_large_files(self):
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
)

def test_auto_track_binary_files(self):
repo = Repository(WORKING_REPO_DIR)

# This content is non-binary
non_binary_file = [100] * int(1e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/non_binary_file.txt", "w+") as f:
f.write(json.dumps(non_binary_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.auto_track_binary_files()

self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "non_binary)file.txt"))
)
self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
)

def test_auto_track_large_files_ignored_with_gitignore(self):
repo = Repository(WORKING_REPO_DIR)

Expand Down Expand Up @@ -1157,6 +1181,7 @@ def test_auto_track_large_files_ignored_with_gitignore(self):

repo.auto_track_large_files()

# Large files
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "large_file.txt"))
)
Expand All @@ -1175,6 +1200,54 @@ def test_auto_track_large_files_ignored_with_gitignore(self):
)
)

def test_auto_track_binary_files_ignored_with_gitignore(self):
repo = Repository(WORKING_REPO_DIR)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

# Test nested gitignores
os.makedirs(f"{WORKING_REPO_DIR}/directory")

with open(f"{WORKING_REPO_DIR}/.gitignore", "w+") as f:
f.write("binary_file.txt")

with open(f"{WORKING_REPO_DIR}/directory/.gitignore", "w+") as f:
f.write("binary_file_3.txt")

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/binary_file_2.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/directory/binary_file_3.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/directory/binary_file_4.txt", "w+") as f:
f.write(binary_file)

repo.auto_track_binary_files()

# Binary files
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
)
self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file_2.txt"))
)

self.assertFalse(
is_tracked_with_lfs(
os.path.join(WORKING_REPO_DIR, "directory/binary_file_3.txt")
)
)
self.assertTrue(
is_tracked_with_lfs(
os.path.join(WORKING_REPO_DIR, "directory/binary_file_4.txt")
)
)

def test_auto_track_large_files_through_git_add(self):
repo = Repository(WORKING_REPO_DIR)

Expand All @@ -1199,6 +1272,30 @@ def test_auto_track_large_files_through_git_add(self):
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
)

def test_auto_track_binary_files_through_git_add(self):
repo = Repository(WORKING_REPO_DIR)

# This content is non binary
non_binary_file = [100] * int(1e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(non_binary_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.git_add(auto_lfs_track=True)

self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "non_binary_file.txt"))
)
self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
)

def test_auto_no_track_large_files_through_git_add(self):
repo = Repository(WORKING_REPO_DIR)

Expand All @@ -1223,6 +1320,30 @@ def test_auto_no_track_large_files_through_git_add(self):
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
)

def test_auto_no_track_binary_files_through_git_add(self):
repo = Repository(WORKING_REPO_DIR)

# This content is non-binary
non_binary_file = [100] * int(1e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(non_binary_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.git_add(auto_lfs_track=False)

self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "non_binary_file.txt"))
)
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
)

def test_auto_track_updates_removed_gitattributes(self):
repo = Repository(WORKING_REPO_DIR)

Expand Down

0 comments on commit 45aec8e

Please sign in to comment.