Auto track binary files #828

Merged
merged 9 commits into from Apr 21, 2022
20 changes: 12 additions & 8 deletions src/huggingface_hub/hf_api.py
@@ -569,19 +569,23 @@ def _validate_or_retrieve_token(
function_name: Optional[str] = None,
):
"""
-    Retrieves and validates stored token or validates passed token.
     Args:
+    Retrieves and validates stored token or validates passed token.
         token (``str``, `optional`):
-            Hugging Face token. Will default to the locally saved token if not provided.
+            Hugging Face token. Will default to the locally saved token if
+            not provided.
         name (``str``, `optional`):
-            Name of the repository. This is deprecated in favor of repo_id and will be removed in v0.7.
+            Name of the repository. This is deprecated in favor of repo_id
+            and will be removed in v0.7.
         function_name (``str``, `optional`):
-            If _validate_or_retrieve_token is called from a function, name of that function to be passed inside deprecation warning.
+            If _validate_or_retrieve_token is called from a function, name
+            of that function to be passed inside deprecation warning.
     Returns:
         Validated token and the name of the repository.
     Raises:
-        :class:`EnvironmentError`: If the token is not passed and there's no token saved locally.
-        :class:`ValueError`: If organization token or invalid token is passed.
+        :class:`EnvironmentError`: If the token is not passed and there's no
+        token saved locally. :class:`ValueError`: If organization token or
+        invalid token is passed.
"""
if token is None or token is True:
token = HfFolder.get_token()
@@ -1847,8 +1851,8 @@ def get_token(cls) -> Optional[str]:
"""
Get token or None if not existent.

-    Note that a token can be also provided using the `HUGGING_FACE_HUB_TOKEN`
-    environment variable.
+    Note that a token can be also provided using the
+    `HUGGING_FACE_HUB_TOKEN` environment variable.

Returns:
`str` or `None`: The token, `None` if it doesn't exist.
71 changes: 66 additions & 5 deletions src/huggingface_hub/repository.py
@@ -226,6 +226,27 @@ def is_git_ignored(filename: Union[str, Path]) -> bool:
return is_ignored


def is_binary_file(filename: Union[str, Path]) -> bool:
Contributor:
Other than reading the first few bytes, I think this answer gives a more accurate way of checking if the file is binary or not: https://stackoverflow.com/a/7392391/2536294
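The heuristic in that linked answer can be sketched as follows (a standalone illustration, not code from this PR; the function names and the 1024-byte sample size are my own):

```python
# Heuristic from the linked answer: sample some bytes and treat the file as
# binary if the sample contains bytes outside the usual "text" range.
textchars = bytearray(
    {7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F}
)

def is_binary_string(data: bytes) -> bool:
    # translate(None, textchars) deletes every text-range byte; anything
    # left over is a non-text byte, so we call the input binary.
    return bool(data.translate(None, textchars))

def looks_binary(path: str, sample_size: int = 1024) -> bool:
    # Read only a small prefix of the file, in binary mode.
    with open(path, "rb") as f:
        return is_binary_string(f.read(sample_size))
```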

Member Author:
I implemented what you propose in 9fe6bb6, let me know if that's what you had in mind. Cc @coyotte508

"""
Check if file is a binary file.

Args:
filename (`str` or `Path`):
The filename to check.

Returns:
`bool`: `True` if the file passed is a binary file, `False` otherwise.
"""
try:
with open(filename) as f:
content = f.read()
LysandreJik marked this conversation as resolved.
Contributor (suggested change):
- content = f.read()
+ content = f.read(1024)  # or 512 if we want to be consistent with the backend

Member Author:
This was discussed above here: #828 (comment)

Do you disagree with the conclusion?

Contributor:
Didn't see that. Yeah I don't think we should ever read 11GB of data into memory. This will most certainly crash most people's systems. I'd be happy if we read like 10MB instead of 1MB to reduce the probability of a false detection, which should address those concerns. If we really do want to read a lot more, we should read in chunks. As Python's docs state:

> To read a file's contents, call f.read(size), which reads some quantity of data and returns it as a string (in text mode) or bytes object (in binary mode). size is an optional numeric argument. When size is omitted or negative, the entire contents of the file will be read and returned; it's your problem if the file is twice as large as your machine's memory.

Member Author:
The solution currently implemented loads a maximum of 10MB in memory when calling git_add: it tracks large files before tracking binary files.

When tracking binary files, it looks at files which are not yet tracked with lfs, eliminating large files.

Instead of the 1MB limit that you propose here, we could instead put a max of 10MB here, which will only be triggered when auto_track_binary_files is called independently of git_add (which is a possibility!).

Contributor:
Yes, but you also want this method to be public, which means people can call it before having tracked large files.

I'm happy with your suggestion of 10MB.

Member Author:
Sounds good, thanks for your review. Addressed in cbfdce5


# Check for the presence of the null character in the string
return "\x00" in content
except UnicodeDecodeError:
return True
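The null-character heuristic above can be exercised standalone (the helper mirrors the diff at this commit; the sample files are illustrative):

```python
import os
import tempfile

def is_binary_file(filename) -> bool:
    # Mirrors the diff above: a decode failure or an embedded null
    # character is treated as evidence of binary content.
    try:
        with open(filename) as f:
            content = f.read()
        return "\x00" in content
    except UnicodeDecodeError:
        return True

with tempfile.TemporaryDirectory() as tmp:
    text_path = os.path.join(tmp, "notes.txt")
    with open(text_path, "w") as f:
        f.write("plain text, no null characters")

    bin_path = os.path.join(tmp, "blob.bin")
    with open(bin_path, "wb") as f:
        f.write(b"\x00\x01\x02\x03")

    print(is_binary_file(text_path), is_binary_file(bin_path))
```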


def files_to_be_staged(pattern: str, folder: Union[str, Path]) -> List[str]:
"""
Returns a list of filenames that are to be staged.
@@ -485,8 +506,8 @@ def __init__(
skip_lfs_files (`bool`, *optional*, defaults to `False`):
whether to skip git-LFS files or not.
client (`HfApi`, *optional*):
-                Instance of HfApi to use when calling the HF Hub API.
-                A new instance will be created if this is left to `None`.
+                Instance of HfApi to use when calling the HF Hub API. A new
+                instance will be created if this is left to `None`.
"""

os.makedirs(local_dir, exist_ok=True)
@@ -981,6 +1002,42 @@ def lfs_enable_largefiles(self):
except subprocess.CalledProcessError as exc:
raise EnvironmentError(exc.stderr)

def auto_track_binary_files(self, pattern: Optional[str] = ".") -> List[str]:
adrinjalali marked this conversation as resolved.
"""
Automatically track binary files with git-lfs.

Args:
pattern (`str`, *optional*, defaults to "."):
The pattern with which to track binary files.
LysandreJik marked this conversation as resolved.

Returns:
`List[str]`: List of filenames that are now tracked due to being
binary files
"""
files_to_be_tracked_with_lfs = []

deleted_files = self.list_deleted_files()

for filename in files_to_be_staged(pattern, folder=self.local_dir):
if filename in deleted_files:
continue

path_to_file = os.path.join(os.getcwd(), self.local_dir, filename)
is_binary = is_binary_file(path_to_file)

if (
is_binary
and not is_tracked_with_lfs(path_to_file)
and not is_git_ignored(path_to_file)
):
self.lfs_track(filename)
files_to_be_tracked_with_lfs.append(filename)

# Cleanup the .gitattributes if files were deleted
self.lfs_untrack(deleted_files)

return files_to_be_tracked_with_lfs
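The filtering logic of auto_track_binary_files can be sketched independently of git, with the predicates stubbed out (everything below is an illustrative stand-in, not the library's API):

```python
from typing import Callable, List

def select_files_to_track(
    candidates: List[str],
    deleted: List[str],
    is_binary: Callable[[str], bool],
    is_lfs_tracked: Callable[[str], bool],
    is_ignored: Callable[[str], bool],
) -> List[str]:
    # Mirror of the method's checks: skip deleted files, then track anything
    # binary that is neither already LFS-tracked nor git-ignored.
    selected = []
    for name in candidates:
        if name in deleted:
            continue
        if is_binary(name) and not is_lfs_tracked(name) and not is_ignored(name):
            selected.append(name)
    return selected
```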

def auto_track_large_files(self, pattern: Optional[str] = ".") -> List[str]:
"""
Automatically track large files (files that weigh more than 10MBs) with
@@ -1090,11 +1147,15 @@ def git_add(
pattern (`str`, *optional*, defaults to "."):
The pattern with which to add files to staging.
auto_lfs_track (`bool`, *optional*, defaults to `False`):
-                Whether to automatically track large files with git-lfs. Any
-                file over 10MB in size will be automatically tracked.
+                Whether to automatically track large and binary files with
+                git-lfs. Any file over 10MB in size, or in binary format, will
+                be automatically tracked.
LysandreJik marked this conversation as resolved.
"""
if auto_lfs_track:
-            tracked_files = self.auto_track_large_files(pattern)
+            tracked_files = [
+                *self.auto_track_large_files(pattern),
+                *self.auto_track_binary_files(pattern),
+            ]
if tracked_files:
logger.warning(
f"Adding files tracked by Git LFS: {tracked_files}. This may take a bit of time if the files are large."
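The change in git_add simply concatenates the two trackers' results via list unpacking; a toy sketch (stand-in functions and filenames, not the real methods):

```python
def auto_track_large_files(pattern: str):
    # Stand-in: pretend one large file was newly tracked.
    return ["pytorch_model.bin"]

def auto_track_binary_files(pattern: str):
    # Stand-in: pretend one binary file was newly tracked.
    return ["tokenizer.model"]

# Same shape as the diff: merge both result lists by unpacking.
tracked_files = [
    *auto_track_large_files("."),
    *auto_track_binary_files("."),
]
if tracked_files:
    print(f"Adding files tracked by Git LFS: {tracked_files}")
```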
89 changes: 85 additions & 4 deletions tests/test_repository.py
@@ -1075,12 +1075,18 @@ def test_is_tracked_with_lfs_with_pattern(self):
# This content is 20MB (over 10MB)
large_file = [100] * int(4e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f:
f.write(json.dumps(large_file))

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(small_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

os.makedirs(f"{WORKING_REPO_DIR}/dir", exist_ok=True)

with open(f"{WORKING_REPO_DIR}/dir/large_file.txt", "w+") as f:
@@ -1089,20 +1095,30 @@ def test_is_tracked_with_lfs_with_pattern(self):
with open(f"{WORKING_REPO_DIR}/dir/small_file.txt", "w+") as f:
f.write(json.dumps(small_file))

with open(f"{WORKING_REPO_DIR}/dir/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.auto_track_large_files("dir")
repo.auto_track_binary_files("dir")

self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "large_file.txt"))
)
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
)
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
)
self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "dir/large_file.txt"))
)
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "dir/small_file.txt"))
)
self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "dir/binary_file.txt"))
)

def test_auto_track_large_files(self):
repo = Repository(WORKING_REPO_DIR)
@@ -1113,35 +1129,48 @@ def test_auto_track_large_files(self):
# This content is 20MB (over 10MB)
large_file = [100] * int(4e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"
Member:
I would consider moving this to another test to keep this test focused on large files (or rename this test, although I think different tests would be better)

Member Author:
Right, what I was mentioning in the description is that this effectively doubles the time spent on these tests, which is already significant. We're currently sitting at 10+ minutes for the tests, which isn't ideal.

I'm open to moving it around, but would like to emphasize that it will likely end up taking even longer. Let me know if this compromise is okay for you and I'll move it to its own separate test.

Member:
Good point! Let's keep as is then 😄

with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f:
f.write(json.dumps(large_file))

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(small_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.auto_track_large_files()
repo.auto_track_binary_files()

self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "large_file.txt"))
)
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
)
self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
)

def test_auto_track_large_files_ignored_with_gitignore(self):
repo = Repository(WORKING_REPO_DIR)

# This content is 20MB (over 10MB)
large_file = [100] * int(4e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

# Test nested gitignores
os.makedirs(f"{WORKING_REPO_DIR}/directory")

with open(f"{WORKING_REPO_DIR}/.gitignore", "w+") as f:
-            f.write("large_file.txt")
+            f.write("large_file.txt\nbinary_file.txt")

with open(f"{WORKING_REPO_DIR}/directory/.gitignore", "w+") as f:
-            f.write("large_file_3.txt")
+            f.write("large_file_3.txt\nbinary_file_3.txt")

with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f:
f.write(json.dumps(large_file))
@@ -1157,6 +1186,21 @@ def test_auto_track_large_files_ignored_with_gitignore(self):

repo.auto_track_large_files()

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/binary_file_2.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/directory/binary_file_3.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/directory/binary_file_4.txt", "w+") as f:
f.write(binary_file)

repo.auto_track_binary_files()

# Large files
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "large_file.txt"))
)
@@ -1175,7 +1219,26 @@ def test_auto_track_large_files_ignored_with_gitignore(self):
)
)

-    def test_auto_track_large_files_through_git_add(self):
# Binary files
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
)
self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file_2.txt"))
)

self.assertFalse(
is_tracked_with_lfs(
os.path.join(WORKING_REPO_DIR, "directory/binary_file_3.txt")
)
)
self.assertTrue(
is_tracked_with_lfs(
os.path.join(WORKING_REPO_DIR, "directory/binary_file_4.txt")
)
)

+    def test_auto_track_files_through_git_add(self):
repo = Repository(WORKING_REPO_DIR)

# This content is 5MB (under 10MB)
@@ -1184,12 +1247,18 @@ def test_auto_track_large_files_through_git_add(self):
# This content is 20MB (over 10MB)
large_file = [100] * int(4e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f:
f.write(json.dumps(large_file))

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(small_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.git_add(auto_lfs_track=True)

self.assertTrue(
@@ -1198,8 +1267,11 @@ def test_auto_track_large_files_through_git_add(self):
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
)
self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
)

-    def test_auto_no_track_large_files_through_git_add(self):
+    def test_auto_no_track_files_through_git_add(self):
repo = Repository(WORKING_REPO_DIR)

# This content is 5MB (under 10MB)
@@ -1208,12 +1280,18 @@ def test_auto_no_track_large_files_through_git_add(self):
# This content is 20MB (over 10MB)
large_file = [100] * int(4e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f:
f.write(json.dumps(large_file))

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(small_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.git_add(auto_lfs_track=False)

self.assertFalse(
@@ -1222,6 +1300,9 @@ def test_auto_no_track_large_files_through_git_add(self):
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
)
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
)

def test_auto_track_updates_removed_gitattributes(self):
repo = Repository(WORKING_REPO_DIR)