Skip to content

Commit

Permalink
Bugs Fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
GharudxD committed Apr 22, 2023
1 parent 489b040 commit a8b534a
Showing 1 changed file with 5 additions and 4 deletions.
9 changes: 5 additions & 4 deletions nltk/tokenize/destructive.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,10 +192,10 @@ def tokenize(
def custom_tokenize(self, text: str, custom_tokens: List) -> List[str]:

r"""Return a custom tokenized copy of `text`.
In this function we can create custom tokens.
To do that, we pass a list of the custom tokens we need.
>>> from nltk.tokenize import custom_tokenize
>>> text = "This is first token, need to edited also first token and second token will be edited, but not this first tokens will be created"
>>> custom_tokens = ['first token','first token and second token']
Expand All @@ -211,13 +211,14 @@ def custom_tokenize(self, text: str, custom_tokens: List) -> List[str]:
:rtype: List[str]
"""

custom_tokens = sorted(custom_tokens,reverse = True)
# Make a unique SEP token to insert into the text
# so that it will not be split by the tokenizer
def get_sep_token(sep_token: str, text: str) -> str:
    """Return a variant of *sep_token* guaranteed not to occur in *text*.

    Repeatedly wraps the candidate in underscores (``_tok_``, ``__tok__``,
    ...) until the resulting string is no longer a substring of *text*,
    so it can safely be used as a separator marker.
    """
    candidate = sep_token
    # Keep padding with underscores until the candidate is absent from text.
    while candidate in text:
        candidate = f"_{candidate}_"
    return candidate

# Remove SEP tokens to get needed tokens
def remove_sep_token(tokens: List[str],sep_token: str) -> List[str]:
tokens_without_sep = [token.replace(sep_token , " ") for token in tokens]
Expand All @@ -231,7 +232,7 @@ def remove_sep_token(tokens: List[str],sep_token: str) -> List[str]:
temp = i.replace(" ", sep_token)
text = re.sub(f"((^| ){i})([!,. ])",r" {} \3".format(temp),text)

# Get tokens using tokenize function
# Get tokens using tokenize function
raw_tokens = self.tokenize(text)

# Remove SEP token to get the original (needed) tokens
Expand Down

0 comments on commit a8b534a

Please sign in to comment.