chardet · dan-blanchard · May 27, 2022 · May 11, 2022 · May 26, 2022 · May 27, 2022
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -6,7 +6,7 @@ repos:
     args: ["--profile", "black"]
     name: isort (python)
 - repo: https://github.com/psf/black
-  rev: 21.8b0
+  rev: 22.3.0
   hooks:
   - id: black
     args: ["--target-version", "py36"]
diff --git a/chardet/charsetprober.py b/chardet/charsetprober.py
@@ -31,6 +31,10 @@
 
 from .enums import ProbingState
 
+INTERNATIONAL_WORDS_PATTERN = re.compile(  # built once, at module import
+    b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
+)  # ASCII letters*, high bytes+, ASCII letters*, then at most one marker byte
+
 
 class CharSetProber:
 
@@ -70,20 +74,18 @@ def filter_international_words(buf):
         alphabet: english alphabets [a-zA-Z]
         international: international characters [\x80-\xFF]
         marker: everything else [^a-zA-Z\x80-\xFF]
-
         The input buffer can be thought to contain a series of words delimited
         by markers. This function works to filter all words that contain at
         least one international character. All contiguous sequences of markers
         are replaced by a single space ascii character.
-
         This filter applies to all scripts which do not use English characters.
         """
         filtered = bytearray()
 
         # This regex expression filters out only words that have at-least one
         # international character. The word may include one marker character at
         # the end.
-        words = re.findall(b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?", buf)
+        words = INTERNATIONAL_WORDS_PATTERN.findall(buf)
 
         for word in words:
             filtered.extend(word[:-1])
@@ -104,18 +106,16 @@ def remove_xml_tags(buf):
         """
         Returns a copy of ``buf`` that retains only the sequences of English
         alphabet and high byte characters that are not between <> characters.
-
         This filter can be applied to all scripts which contain both English
         characters and extended ASCII characters, but is currently only used by
         ``Latin1Prober``.
         """
         filtered = bytearray()
         in_tag = False
         prev = 0
+        buf = memoryview(buf).cast("c")
 
-        for curr in range(len(buf)):
-            # Slice here to get bytes instead of an int with Python 3
-            buf_char = buf[curr : curr + 1]
+        for curr, buf_char in enumerate(buf):
             # Check if we're coming out of or entering an XML tag
             if buf_char == b">":
                 prev = curr + 1