Auto track binary files #828

Merged (9 commits, Apr 21, 2022)
Changes from 2 commits
20 changes: 8 additions & 12 deletions src/huggingface_hub/hf_api.py
@@ -569,23 +569,19 @@ def _validate_or_retrieve_token(
         function_name: Optional[str] = None,
     ):
         """
-        Args:
         Retrieves and validates stored token or validates passed token.
+        Args:
             token (``str``, `optional`):
-                Hugging Face token. Will default to the locally saved token if
-                not provided.
+                Hugging Face token. Will default to the locally saved token if not provided.
             name (``str``, `optional`):
-                Name of the repository. This is deprecated in favor of repo_id
-                and will be removed in v0.7.
+                Name of the repository. This is deprecated in favor of repo_id and will be removed in v0.7.
             function_name (``str``, `optional`):
-                If _validate_or_retrieve_token is called from a function, name
-                of that function to be passed inside deprecation warning.
+                If _validate_or_retrieve_token is called from a function, name of that function to be passed inside deprecation warning.
         Returns:
             Validated token and the name of the repository.
         Raises:
-            :class:`EnvironmentError`: If the token is not passed and there's no
-            token saved locally. :class:`ValueError`: If organization token or
-            invalid token is passed.
+            :class:`EnvironmentError`: If the token is not passed and there's no token saved locally.
+            :class:`ValueError`: If organization token or invalid token is passed.
         """
         if token is None or token is True:
             token = HfFolder.get_token()

@@ -1851,8 +1847,8 @@ def get_token(cls) -> Optional[str]:
         """
         Get token or None if not existent.

-        Note that a token can be also provided using the
-        `HUGGING_FACE_HUB_TOKEN` environment variable.
+        Note that a token can be also provided using the `HUGGING_FACE_HUB_TOKEN`
+        environment variable.

         Returns:
             `str` or `None`: The token, `None` if it doesn't exist.
35 changes: 22 additions & 13 deletions src/huggingface_hub/repository.py
@@ -239,7 +239,7 @@ def is_binary_file(filename: Union[str, Path]) -> bool:
     """
     try:
         with open(filename) as f:
-            content = f.read(512)
+            content = f.read()

Contributor:

Suggested change:
-            content = f.read()
+            content = f.read(1024)  # or 512 if we want to be consistent with the backend

Member Author:

This was discussed above here: #828 (comment)

Do you disagree with the conclusion?

Contributor:

Didn't see that. Yeah, I don't think we should ever read 11GB of data into memory. This will most certainly crash most people's systems. I'd be happy if we read something like 10MB instead of 1MB to reduce the probability of a false detection, which should address those concerns. If we really do want to read a lot more, we should read in chunks. As Python's docs state:

To read a file’s contents, call f.read(size), which reads some quantity of data and returns it as a string (in text mode) or bytes object (in binary mode). size is an optional numeric argument. When size is omitted or negative, the entire contents of the file will be read and returned; it’s your problem if the file is twice as large as your machine’s memory.

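(For illustration only, not part of the PR or the thread: a chunked null-byte check along the lines the reviewer describes could look like the sketch below; the helper name and chunk size are arbitrary.)

def contains_null_byte(path, chunk_size=1024 * 1024):
    # Read in bounded chunks so memory use stays constant no matter how
    # large the file is; stop as soon as a null byte is found.
    with open(path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                return False
            if b"\x00" in chunk:
                return True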
Member Author:

The solution currently implemented loads a maximum of 10MB in memory when calling git_add: it tracks large files before tracking binary files.

When tracking binary files, it looks at files which are not yet tracked with LFS, eliminating large files.

Instead of the 1MB limit that you propose here, we could instead put a max of 10MB here, which will only be triggered when auto_track_binary_files is called independently of git_add (which is a possibility!).

Contributor:

Yes, but you also want to have this method public, which means people can call it before having tracked large files.

I'm happy with your suggestion of 10MB.

Member Author:

Sounds good, thanks for your review. Addressed in cbfdce5.


     # Check for the presence of the null character in the string
     return "\x00" in content
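The direction agreed on in this thread, reading at most a bounded prefix of the file instead of the whole content, would make is_binary_file look roughly like the sketch below. The exact change made in cbfdce5 is not shown in this view, and the sketch reads bytes rather than text (which sidesteps the UnicodeDecodeError handling implied by the try block above), so treat it as an approximation rather than the merged code:

from pathlib import Path
from typing import Union

def is_binary_file(filename: Union[str, Path]) -> bool:
    # Heuristic: a file is considered binary if its first 10MB contain a
    # null byte. Reading only a bounded prefix keeps memory use predictable
    # even for multi-gigabyte files.
    with open(filename, "rb") as f:
        content = f.read(10 * 1024 * 1024)
    return b"\x00" in content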
@@ -1023,15 +1023,22 @@ def auto_track_binary_files(self, pattern: Optional[str] = ".") -> List[str]:
                 continue

             path_to_file = os.path.join(os.getcwd(), self.local_dir, filename)
-            is_binary = is_binary_file(path_to_file)

-            if (
-                is_binary
-                and not is_tracked_with_lfs(path_to_file)
-                and not is_git_ignored(path_to_file)
-            ):
-                self.lfs_track(filename)
-                files_to_be_tracked_with_lfs.append(filename)
+            if not (is_tracked_with_lfs(path_to_file) or is_git_ignored(path_to_file)):
+                size_in_mb = os.path.getsize(path_to_file) / (1024 * 1024)
+
+                if size_in_mb >= 10:
+                    logger.warning(
+                        "Parsing a large file to check if binary or not. Tracking large "
+                        "files using `repository.auto_track_large_files` is recommended "
+                        "so as to not load the full file in memory."
+                    )
+
+                is_binary = is_binary_file(path_to_file)
+
+                if is_binary:
+                    self.lfs_track(filename)
+                    files_to_be_tracked_with_lfs.append(filename)

         # Cleanup the .gitattributes if files were deleted
         self.lfs_untrack(deleted_files)
@@ -1152,10 +1159,12 @@ def git_add(
                 be automatically tracked.
         """
         if auto_lfs_track:
-            tracked_files = [
-                *self.auto_track_large_files(pattern),
-                *self.auto_track_binary_files(pattern),
-            ]
+            # Track files according to their size (>=10MB)
+            tracked_files = self.auto_track_large_files(pattern)
+
+            # Read the remaining files and track them if they're binary
+            tracked_files.extend(self.auto_track_binary_files(pattern))
+
             if tracked_files:
                 logger.warning(
                     f"Adding files tracked by Git LFS: {tracked_files}. This may take a bit of time if the files are large."
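The net effect of the two hunks above is that a single git_add(auto_lfs_track=True) call first routes files of 10MB or more to Git LFS and only then scans the remaining, not-yet-tracked files for binary content. An illustrative usage sketch (the local path, file names and commit message are hypothetical, and the repository is assumed to already be cloned locally):

from huggingface_hub import Repository

repo = Repository(local_dir="./my-model")

# Large files (>=10MB) are tracked with LFS first, then the remaining
# untracked files are scanned for a null byte and tracked if binary,
# so no huge file is read into memory just for binary detection.
repo.git_add(auto_lfs_track=True)
repo.git_commit("Add model weights and tokenizer")
repo.git_push()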
152 changes: 96 additions & 56 deletions tests/test_repository.py
@@ -1075,18 +1075,12 @@ def test_is_tracked_with_lfs_with_pattern(self):
# This content is 20MB (over 10MB)
large_file = [100] * int(4e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f:
f.write(json.dumps(large_file))

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(small_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

os.makedirs(f"{WORKING_REPO_DIR}/dir", exist_ok=True)

with open(f"{WORKING_REPO_DIR}/dir/large_file.txt", "w+") as f:
@@ -1095,30 +1089,20 @@ def test_is_tracked_with_lfs_with_pattern(self):
with open(f"{WORKING_REPO_DIR}/dir/small_file.txt", "w+") as f:
f.write(json.dumps(small_file))

with open(f"{WORKING_REPO_DIR}/dir/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.auto_track_large_files("dir")
repo.auto_track_binary_files("dir")

self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "large_file.txt"))
)
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
)
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
)
self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "dir/large_file.txt"))
)
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "dir/small_file.txt"))
)
self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "dir/binary_file.txt"))
)

def test_auto_track_large_files(self):
repo = Repository(WORKING_REPO_DIR)
@@ -1129,27 +1113,41 @@ def test_auto_track_large_files(self):
# This content is 20MB (over 10MB)
large_file = [100] * int(4e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f:
f.write(json.dumps(large_file))

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(small_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.auto_track_large_files()
repo.auto_track_binary_files()

self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "large_file.txt"))
)
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
)

def test_auto_track_binary_files(self):
repo = Repository(WORKING_REPO_DIR)

# This content is non-binary
non_binary_file = [100] * int(1e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/non_binary_file.txt", "w+") as f:

Contributor:

We shouldn't write to the working dir; we should work with Python's temp files and folders.

Member Author:

The WORKING_REPO_DIR is a folder that is created at the start of the test, and which is reset at the beginning of every test, see here:

    def setUp(self):
        if os.path.exists(WORKING_REPO_DIR):
            shutil.rmtree(WORKING_REPO_DIR, onerror=set_write_permission_and_retry)
        logger.info(
            f"Does {WORKING_REPO_DIR} exist: {os.path.exists(WORKING_REPO_DIR)}"
        )
        self.REPO_NAME = repo_name()
        self._repo_url = self._api.create_repo(
            repo_id=self.REPO_NAME, token=self._token
        )
        self._api.upload_file(
            path_or_fileobj=BytesIO(b"some initial binary data: \x00\x01"),
            path_in_repo="random_file.txt",
            repo_id=f"{USER}/{self.REPO_NAME}",
            token=self._token,
        )

This isn't the working directory in which the test is run, but a folder nested in the fixtures folder:

    WORKING_REPO_DIR = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "fixtures/working_repo_2"
    )

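(Editorial aside: the reviewer's alternative, Python temporary directories instead of a shared fixtures folder, would look something like the sketch below; the class name is illustrative and this is not what the test suite actually does.)

import tempfile
import unittest

class RepositoryCommonTest(unittest.TestCase):
    def setUp(self):
        # Each test gets a fresh working directory that is removed
        # automatically, instead of resetting a shared fixtures folder.
        self._tmp_dir = tempfile.TemporaryDirectory()
        self.working_dir = self._tmp_dir.name

    def tearDown(self):
        self._tmp_dir.cleanup()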
f.write(json.dumps(non_binary_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.auto_track_binary_files()

self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "non_binary)file.txt"))
)
self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
)
@@ -1160,17 +1158,14 @@ def test_auto_track_large_files_ignored_with_gitignore(self):
# This content is 20MB (over 10MB)
large_file = [100] * int(4e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

# Test nested gitignores
os.makedirs(f"{WORKING_REPO_DIR}/directory")

with open(f"{WORKING_REPO_DIR}/.gitignore", "w+") as f:
f.write("large_file.txt\nbinary_file.txt")
f.write("large_file.txt")

with open(f"{WORKING_REPO_DIR}/directory/.gitignore", "w+") as f:
f.write("large_file_3.txt\nbinary_file_3.txt")
f.write("large_file_3.txt")

with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f:
f.write(json.dumps(large_file))
@@ -1186,20 +1181,6 @@ def test_auto_track_large_files_ignored_with_gitignore(self):

repo.auto_track_large_files()

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/binary_file_2.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/directory/binary_file_3.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/directory/binary_file_4.txt", "w+") as f:
f.write(binary_file)

repo.auto_track_binary_files()

# Large files
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "large_file.txt"))
@@ -1219,6 +1200,35 @@ def test_auto_track_large_files_ignored_with_gitignore(self):
)
)

def test_auto_track_binary_files_ignored_with_gitignore(self):
repo = Repository(WORKING_REPO_DIR)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

# Test nested gitignores
os.makedirs(f"{WORKING_REPO_DIR}/directory")

Contributor:

Same here.

Member Author:

Same answer as above :)

with open(f"{WORKING_REPO_DIR}/.gitignore", "w+") as f:
f.write("binary_file.txt")

with open(f"{WORKING_REPO_DIR}/directory/.gitignore", "w+") as f:
f.write("binary_file_3.txt")

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/binary_file_2.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/directory/binary_file_3.txt", "w+") as f:
f.write(binary_file)

with open(f"{WORKING_REPO_DIR}/directory/binary_file_4.txt", "w+") as f:
f.write(binary_file)

repo.auto_track_binary_files()

# Binary files
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
@@ -1238,7 +1248,7 @@ def test_auto_track_large_files_ignored_with_gitignore(self):
)
)

def test_auto_track_files_through_git_add(self):
def test_auto_track_large_files_through_git_add(self):
repo = Repository(WORKING_REPO_DIR)

# This content is 5MB (under 10MB)
@@ -1247,18 +1257,12 @@ def test_auto_track_files_through_git_add(self):
# This content is 20MB (over 10MB)
large_file = [100] * int(4e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f:
f.write(json.dumps(large_file))

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(small_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.git_add(auto_lfs_track=True)

self.assertTrue(
@@ -1267,11 +1271,32 @@ def test_auto_track_files_through_git_add(self):
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
)

def test_auto_track_binary_files_through_git_add(self):
repo = Repository(WORKING_REPO_DIR)

# This content is non binary
non_binary_file = [100] * int(1e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(non_binary_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.git_add(auto_lfs_track=True)

self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "non_binary_file.txt"))
)
self.assertTrue(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
)

def test_auto_no_track_files_through_git_add(self):
def test_auto_no_track_large_files_through_git_add(self):
repo = Repository(WORKING_REPO_DIR)

# This content is 5MB (under 10MB)
@@ -1280,18 +1305,12 @@ def test_auto_no_track_files_through_git_add(self):
# This content is 20MB (over 10MB)
large_file = [100] * int(4e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f:
f.write(json.dumps(large_file))

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(small_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.git_add(auto_lfs_track=False)

self.assertFalse(
@@ -1300,6 +1319,27 @@ def test_auto_no_track_files_through_git_add(self):
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt"))
)

def test_auto_no_track_binary_files_through_git_add(self):
repo = Repository(WORKING_REPO_DIR)

# This content is non-binary
non_binary_file = [100] * int(1e6)

# This content is binary (contains the null character)
binary_file = "\x00\x00\x00\x00"

with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f:
f.write(json.dumps(non_binary_file))

with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f:
f.write(binary_file)

repo.git_add(auto_lfs_track=False)

self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "non_binary_file.txt"))
)
self.assertFalse(
is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt"))
)