huggingface · adrinjalali · Apr 21, 2022 · Apr 8, 2022 · Apr 8, 2022 · Apr 8, 2022
diff --git a/src/huggingface_hub/repository.py b/src/huggingface_hub/repository.py
@@ -226,6 +226,27 @@ def is_git_ignored(filename: Union[str, Path]) -> bool:
     return is_ignored
 
 
+def is_binary_file(filename: Union[str, Path]) -> bool:
+    """
+    Check if file is a binary file by inspecting its first 1024 characters.
+
+    Args:
+        filename (`str` or `Path`):
+            The filename to check.
+
+    Returns:
+        `bool`: `True` if the file passed is a binary file, `False` otherwise.
+    """
+    try:
+        with open(filename) as f:
+            content = f.read(1024)  # single bounded read: never load the full file
+
+        # A null character, or bytes that fail to decode as text, mean binary
+        return "\x00" in content
+    except UnicodeDecodeError:
+        return True
+
+
 def files_to_be_staged(pattern: str, folder: Union[str, Path]) -> List[str]:
     """
     Returns a list of filenames that are to be staged.
@@ -485,8 +506,8 @@ def __init__(
             skip_lfs_files (`bool`, *optional*, defaults to `False`):
                 whether to skip git-LFS files or not.
             client (`HfApi`, *optional*):
-                Instance of HfApi to use when calling the HF Hub API.
-                A new instance will be created if this is left to `None`.
+                Instance of HfApi to use when calling the HF Hub API. A new
+                instance will be created if this is left to `None`.
         """
 
         os.makedirs(local_dir, exist_ok=True)
@@ -981,6 +1002,49 @@ def lfs_enable_largefiles(self):
         except subprocess.CalledProcessError as exc:
             raise EnvironmentError(exc.stderr)
 
+    def auto_track_binary_files(self, pattern: Optional[str] = ".") -> List[str]:
+        """
+        Start tracking with git-lfs every binary file that matches `pattern`.
+
+        Args:
+            pattern (`str`, *optional*, defaults to "."):
+                Pattern selecting the files that get checked for binary content.
+
+        Returns:
+            `List[str]`: Filenames that this call started tracking with git-lfs
+            because they are binary files.
+        """
+        removed = self.list_deleted_files()
+        newly_tracked = []
+
+        for filename in files_to_be_staged(pattern, folder=self.local_dir):
+            # Deleted files are skipped here and untracked once the loop is done
+            if filename in removed:
+                continue
+
+            full_path = os.path.join(os.getcwd(), self.local_dir, filename)
+
+            # Nothing to do for files already in LFS or excluded by a gitignore
+            if is_tracked_with_lfs(full_path) or is_git_ignored(full_path):
+                continue
+
+            # Warn from 10MB upwards; the file still goes through the check
+            if os.path.getsize(full_path) / (1024 * 1024) >= 10:
+                logger.warning(
+                    "Parsing a large file to check if binary or not. Tracking large "
+                    "files using `repository.auto_track_large_files` is recommended "
+                    "so as to not load the full file in memory."
+                )
+
+            if is_binary_file(full_path):
+                self.lfs_track(filename)
+                newly_tracked.append(filename)
+
+        # Deleted files no longer need an entry in .gitattributes
+        self.lfs_untrack(removed)
+
+        return newly_tracked
+
     def auto_track_large_files(self, pattern: Optional[str] = ".") -> List[str]:
         """
         Automatically track large files (files that weigh more than 10MBs) with
@@ -1090,11 +1154,17 @@ def git_add(
             pattern (`str`, *optional*, defaults to "."):
                 The pattern with which to add files to staging.
             auto_lfs_track (`bool`, *optional*, defaults to `False`):
-                Whether to automatically track large files with git-lfs. Any
-                file over 10MB in size will be automatically tracked.
+                Whether to automatically track large and binary files with
+                git-lfs. Any file over 10MB in size, or in binary format, will
+                be automatically tracked.
         """
         if auto_lfs_track:
+            # Track files according to their size (>=10MB)
             tracked_files = self.auto_track_large_files(pattern)
+
+            # Read the remaining files and track them if they're binary
+            tracked_files.extend(self.auto_track_binary_files(pattern))
+
             if tracked_files:
                 logger.warning(
                     f"Adding files tracked by Git LFS: {tracked_files}. This may take a bit of time if the files are large."

diff --git a/tests/test_repository.py b/tests/test_repository.py
@@ -1128,6 +1128,30 @@ def test_auto_track_large_files(self):
             is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
         )
 
+    def test_auto_track_binary_files(self):
+        repo = Repository(WORKING_REPO_DIR)
+
+        # This content is non-binary
+        non_binary_file = [100] * int(1e6)
+
+        # This content is binary (contains the null character)
+        binary_file = "\x00\x00\x00\x00"
+
+        with open(f"{WORKING_REPO_DIR}/non_binary_file.txt", "w+") as f:
+            f.write(json.dumps(non_binary_file))
+
+        with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
+            f.write(binary_file)
+
+        repo.auto_track_binary_files()
+
+        self.assertFalse(
+            is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "non_binary_file.txt"))
+        )
+        self.assertTrue(
+            is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
+        )
+
     def test_auto_track_large_files_ignored_with_gitignore(self):
         repo = Repository(WORKING_REPO_DIR)
 
@@ -1157,6 +1181,7 @@ def test_auto_track_large_files_ignored_with_gitignore(self):
 
         repo.auto_track_large_files()
 
+        # Large files
         self.assertFalse(
             is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "large_file.txt"))
         )
@@ -1175,6 +1200,54 @@ def test_auto_track_large_files_ignored_with_gitignore(self):
             )
         )
 
+    def test_auto_track_binary_files_ignored_with_gitignore(self):
+        repo = Repository(WORKING_REPO_DIR)
+
+        # Null characters are what make this content binary
+        binary_content = "\x00\x00\x00\x00"
+
+        # A sub-directory with its own .gitignore exercises nested gitignores
+        os.makedirs(f"{WORKING_REPO_DIR}/directory")
+
+        # (relative path, content) pairs, written in this order
+        fixtures = [
+            (".gitignore", "binary_file.txt"),
+            ("directory/.gitignore", "binary_file_3.txt"),
+            ("binary_file.txt", binary_content),
+            ("binary_file_2.txt", binary_content),
+            ("directory/binary_file_3.txt", binary_content),
+            ("directory/binary_file_4.txt", binary_content),
+        ]
+
+        for relative_path, content in fixtures:
+            with open(f"{WORKING_REPO_DIR}/{relative_path}", "w+") as f:
+                f.write(content)
+
+        # Run the tracking under test
+        repo.auto_track_binary_files()
+
+        # (relative path, expected tracking state) pairs, checked in this order
+        expectations = [
+            # Listed in the top-level .gitignore
+            ("binary_file.txt", False),
+            # Not listed in any .gitignore
+            ("binary_file_2.txt", True),
+            # Listed in the nested .gitignore
+            ("directory/binary_file_3.txt", False),
+            # Not listed in any .gitignore
+            ("directory/binary_file_4.txt", True),
+        ]
+
+        for relative_path, should_be_tracked in expectations:
+            tracked = is_tracked_with_lfs(
+                os.path.join(WORKING_REPO_DIR, relative_path)
+            )
+
+            if should_be_tracked:
+                self.assertTrue(tracked)
+            else:
+                self.assertFalse(tracked)
+
     def test_auto_track_large_files_through_git_add(self):
         repo = Repository(WORKING_REPO_DIR)
 
@@ -1199,6 +1272,30 @@ def test_auto_track_large_files_through_git_add(self):
             is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
         )
 
+    def test_auto_track_binary_files_through_git_add(self):
+        repo = Repository(WORKING_REPO_DIR)
+
+        # This content is non-binary
+        non_binary_file = [100] * int(1e6)
+
+        # This content is binary (contains the null character)
+        binary_file = "\x00\x00\x00\x00"
+
+        with open(f"{WORKING_REPO_DIR}/non_binary_file.txt", "w+") as f:
+            f.write(json.dumps(non_binary_file))
+
+        with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
+            f.write(binary_file)
+
+        repo.git_add(auto_lfs_track=True)
+
+        self.assertFalse(
+            is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "non_binary_file.txt"))
+        )
+        self.assertTrue(
+            is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
+        )
+
     def test_auto_no_track_large_files_through_git_add(self):
         repo = Repository(WORKING_REPO_DIR)
 
@@ -1223,6 +1320,30 @@ def test_auto_no_track_large_files_through_git_add(self):
             is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
         )
 
+    def test_auto_no_track_binary_files_through_git_add(self):
+        repo = Repository(WORKING_REPO_DIR)
+
+        # This content is non-binary
+        non_binary_file = [100] * int(1e6)
+
+        # This content is binary (contains the null character)
+        binary_file = "\x00\x00\x00\x00"
+
+        with open(f"{WORKING_REPO_DIR}/non_binary_file.txt", "w+") as f:
+            f.write(json.dumps(non_binary_file))
+
+        with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
+            f.write(binary_file)
+
+        repo.git_add(auto_lfs_track=False)
+
+        self.assertFalse(
+            is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "non_binary_file.txt"))
+        )
+        self.assertFalse(
+            is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
+        )
+
     def test_auto_track_updates_removed_gitattributes(self):
         repo = Repository(WORKING_REPO_DIR)