Merge pull request #8 from Ousret/upgrade-0.3
Upgrade to 0.3
Ousret committed Sep 12, 2019
2 parents 17924e4 + d5473af commit 6009bf8
Showing 13 changed files with 466 additions and 191 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -24,7 +24,7 @@ This project offers you an alternative to **Universal Charset Encoding Detector**,

| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
| ------------- | :-------------: | :------------------: | :------------------: |
-| `Fast` | ❌<br> 🐌🐌 | <br> | ✅ <br>⚡ |
+| `Fast` | ❌<br> | <br> | ✅ <br>⚡ |
| `Universal**` ||||
| `Reliable` **without** distinguishable standards ||||
| `Reliable` **with** distinguishable standards ||||
1 change: 1 addition & 0 deletions charset_normalizer/__init__.py
@@ -3,4 +3,5 @@
from charset_normalizer.unicode import UnicodeRangeIdentify
from charset_normalizer.probe_chaos import ProbeChaos
from charset_normalizer.probe_coherence import ProbeCoherence
+from charset_normalizer.probe_words import ProbeWords
from charset_normalizer.legacy import detect
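Besides the new `ProbeWords` export, the package keeps exposing the chardet-compatible `detect` wrapper imported above. A quick usage sketch, hedged because the wrapper's return shape is not shown in this diff:

```python
# A minimal sketch, assuming the chardet-style wrapper in
# charset_normalizer.legacy mirrors chardet's return shape.
from charset_normalizer import detect

# Illustrative payload: 'déjà vu' encoded as UTF-8.
result = detect('déjà vu'.encode('utf-8'))

# Assumed shape: {'encoding': ..., 'language': ..., 'confidence': ...}
print(result)
```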
12 changes: 10 additions & 2 deletions charset_normalizer/cli/normalizer.py
@@ -1,9 +1,10 @@
import argparse
import sys

-from charset_normalizer import CharsetNormalizerMatches
from prettytable import PrettyTable

+from charset_normalizer import CharsetNormalizerMatches


def query_yes_no(question, default="yes"):
"""Ask a yes/no question via input() and return their answer.
@@ -56,6 +57,8 @@ def cli_detect(argv=None):
                        help='Replace file when trying to normalize it instead of creating a new one.')
    parser.add_argument('--force', action="store_true", default=False, dest='force',
                        help='Replace file without asking if you are sure, use this flag with caution.')
+    parser.add_argument('--threshold', action="store", default=0.2, type=float, dest='threshold',
+                        help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.")

    args = parser.parse_args(argv)

@@ -72,10 +75,15 @@ def cli_detect(argv=None):
        print('Use --force in addition of --replace only.', file=sys.stderr)
        return 1

+    if args.threshold < 0. or args.threshold > 1.:
+        print('--threshold VALUE should be between 0. AND 1.')
+        return 1
+
    for my_file in args.file:

        matches = CharsetNormalizerMatches.from_fp(
-            my_file
+            my_file,
+            threshold=args.threshold
        )

        if len(matches) == 0:
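The new `--threshold` flag lets users raise or lower the maximum mean chaos tolerated per file; the CLI validates it against [0., 1.] and forwards it to `from_fp`. A programmatic sketch of the same flow (the file name is illustrative):

```python
# Sketch mirroring `normalizer --threshold 0.1 my_file.txt`;
# 'my_file.txt' is an illustrative path.
from charset_normalizer import CharsetNormalizerMatches

threshold = 0.1
if threshold < 0. or threshold > 1.:
    raise ValueError('--threshold VALUE should be between 0. AND 1.')

with open('my_file.txt', 'rb') as fp:
    matches = CharsetNormalizerMatches.from_fp(fp, threshold=threshold)

if len(matches) == 0:
    print('No suitable encoding found under this chaos threshold.')
```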
18 changes: 18 additions & 0 deletions charset_normalizer/constant.py
@@ -569,6 +569,24 @@
"Variation Selectors Supplement"
]

UNICODE_SECONDARY_RANGE_KEYWORD = [
'Supplement',
'Extended',
'Extensions',
'Modifier',
'Marks',
'Punctuation',
'Symbols',
'Forms',
'Operators',
'Miscellaneous',
'Drawing',
'Block',
'Shapes',
'Supplemental',
'Tags'
]

BYTE_ORDER_MARK = {
    'utf_8': BOM_UTF8,
    'utf_7': [
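The new `UNICODE_SECONDARY_RANGE_KEYWORD` list names Unicode block keywords that typically indicate supplementary or decorative ranges rather than a text's core script. The consuming code is not part of this diff, so the helper below is only a sketch of how such a list might be applied:

```python
# Hypothetical helper (not in the diff): treat a Unicode range name as
# "secondary" when it contains any of the keywords from the new constant.
UNICODE_SECONDARY_RANGE_KEYWORD = [
    'Supplement', 'Extended', 'Extensions', 'Modifier', 'Marks',
    'Punctuation', 'Symbols', 'Forms', 'Operators', 'Miscellaneous',
    'Drawing', 'Block', 'Shapes', 'Supplemental', 'Tags'
]

def is_secondary_range(range_name):
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)

print(is_secondary_range('Combining Diacritical Marks Supplement'))  # True
print(is_secondary_range('Basic Latin'))                             # False
```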
124 changes: 56 additions & 68 deletions charset_normalizer/normalizer.py
@@ -1,17 +1,16 @@
# coding: utf-8
+import collections
import re
import statistics
from encodings.aliases import aliases
from os.path import basename, splitext
-import collections
+from platform import python_version_tuple

from cached_property import cached_property

-from charset_normalizer.probe_coherence import ProbeCoherence, HashableCounter
-from charset_normalizer.probe_chaos import ProbeChaos
from charset_normalizer.constant import BYTE_ORDER_MARK

-from platform import python_version_tuple
+from charset_normalizer.probe_chaos import ProbeChaos
+from charset_normalizer.probe_coherence import ProbeCoherence, HashableCounter


class CharsetNormalizerMatch:
@@ -93,8 +92,13 @@ def language(self):
        :return: Most used/probable language in text
        :rtype: str
        """
-        languages = ProbeCoherence(self.char_counter).most_likely
-        return languages[0] if len(languages) > 0 else ('English' if len(self.alphabets) == 1 and self.alphabets[0] == 'Basic Latin' else 'Unknown')
+        probe_coherence = ProbeCoherence(self.char_counter)
+        languages = probe_coherence.most_likely
+
+        if len(languages) == 0:
+            return 'English' if len(self.alphabets) == 1 and self.alphabets[0] == 'Basic Latin' else 'Unknown'
+
+        return languages[0]

    @cached_property
    def chaos(self):
@@ -194,7 +198,7 @@ def __len__(self):
        return len(self._matches)

    @staticmethod
-    def normalize(path, steps=10, chunk_size=512, threshold=0.09):
+    def normalize(path, steps=10, chunk_size=512, threshold=0.20):
        """
        :param str path:
        :param int steps:
@@ -226,7 +230,7 @@ def normalize(path, steps=10, chunk_size=512, threshold=0.09):
        return b_

    @staticmethod
-    def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.09):
+    def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20):
        """
        Take a sequence of bytes that could potentially be decoded to str and discard all obvious non supported
        charset encoding.
@@ -244,7 +248,7 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.09):
        supported = sorted(aliases.items()) if py_need_sort else aliases.items()

        tested = set()
-        working = dict()
+        matches = list()

        maximum_length = len(sequences)

@@ -286,70 +290,54 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.09):
            except LookupError:
                continue

-            chaos_measures = list()
-            ranges_encountered_t = dict()
-            decoded_len_t = 0
-
-            successive_chaos_zero = 0
            r_ = range(
                0 if bom_available is False else bom_len,
                maximum_length,
                int(maximum_length / steps)
            )
-            p_ = len(r_)
-
-            for i in r_:
-
-                chunk = sequences[i:i + chunk_size]
-                decoded = str(chunk, encoding=p, errors='ignore')
-
-                probe_chaos = ProbeChaos(decoded, giveup_threshold=threshold)
-                chaos_measure, ranges_encountered = probe_chaos.ratio, probe_chaos.encountered_unicode_range_occurrences
-
-                for k, e in ranges_encountered.items():
-                    if k not in ranges_encountered_t.keys():
-                        ranges_encountered_t[k] = 0
-                    ranges_encountered_t[k] += e
-
-                if bom_available is True:
-                    if chaos_measure > 0.:
-                        chaos_measure /= 2
-                    else:
-                        chaos_measure = -1.
-
-                if chaos_measure > threshold:
-                    if p in working.keys():
-                        del working[p]
-                    break
-                elif chaos_measure == 0.:
-                    successive_chaos_zero += 1
-                    if steps > 2 and successive_chaos_zero > p_ / 2:
-                        break
-                elif chaos_measure > 0. and successive_chaos_zero > 0:
-                    successive_chaos_zero = 0
-
-                chaos_measures.append(chaos_measure)
-
-                if p not in working.keys():
-                    working[p] = dict()
-
-            if p in working.keys():
-                working[p]['ratio'] = statistics.mean(chaos_measures)
-                working[p]['ranges'] = ranges_encountered_t
-                working[p]['chaos'] = sum(chaos_measures)
-                working[p]['len'] = decoded_len_t
-                working[p]['bom'] = bom_available
-                working[p]['bom_len'] = bom_len
-
-            if p == 'ascii' and p in working.keys() and working[p]['ratio'] == 0.:
-                break
-
-        return CharsetNormalizerMatches(
-            [CharsetNormalizerMatch(sequences if working[enc]['bom'] is False else sequences[working[enc]['bom_len']:], enc, working[enc]['ratio'], working[enc]['ranges'], working[enc]['bom']) for enc in
-             (sorted(working.keys()) if py_need_sort else working.keys()) if working[enc]['ratio'] <= threshold])

+            measures = [ProbeChaos(str(sequences[i:i + chunk_size], encoding=p, errors='ignore'), giveup_threshold=threshold) for i in r_]
+            ratios = [el.ratio for el in measures]
+            nb_gave_up = [el.gave_up is True or el.ratio >= threshold for el in measures].count(True)
+
+            chaos_means = statistics.mean(ratios)
+            chaos_median = statistics.median(ratios)
+            chaos_min = min(ratios)
+            chaos_max = max(ratios)
+
+            if (len(r_) >= 4 and nb_gave_up > len(r_) / 4) or chaos_median > threshold:
+                # print(p, 'is too much chaos for decoded input !')
+                continue
+
+            encountered_unicode_range_occurrences = dict()
+
+            for el in measures:
+                for u_name, u_occ in el.encountered_unicode_range_occurrences.items():
+                    if u_name not in encountered_unicode_range_occurrences.keys():
+                        encountered_unicode_range_occurrences[u_name] = 0
+                    encountered_unicode_range_occurrences[u_name] += u_occ
+
+            # print(p, 'U RANGES', encountered_unicode_range_occurrences)
+
+            matches.append(
+                CharsetNormalizerMatch(
+                    sequences if not bom_available else sequences[bom_len:],
+                    p,
+                    chaos_means,
+                    encountered_unicode_range_occurrences,
+                    bom_available
+                )
+            )
+
+            # print(p, nb_gave_up, chaos_means, chaos_median, chaos_min, chaos_max, matches[-1].coherence, matches[-1].language)
+
+            if (p == 'ascii' and chaos_median == 0.) or bom_available is True:
+                return CharsetNormalizerMatches([matches[-1]])
+
+        return CharsetNormalizerMatches(matches)
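The rewritten loop replaces the incremental early-exit bookkeeping with per-chunk summary statistics: each chunk gets its own ProbeChaos ratio, and a candidate encoding survives only if few chunks gave up and the median ratio stays under the threshold. A standalone sketch of that gating rule, with dummy ratios standing in for real ProbeChaos measurements:

```python
import statistics

def keep_candidate(ratios, gave_up_flags, threshold=0.20):
    # Mirror of the gating above: reject when too many chunks gave up,
    # or when the median chaos ratio sits above the threshold.
    nb_gave_up = sum(
        1 for gave_up, ratio in zip(gave_up_flags, ratios)
        if gave_up or ratio >= threshold
    )
    if len(ratios) >= 4 and nb_gave_up > len(ratios) / 4:
        return False
    return statistics.median(ratios) <= threshold

# Dummy measurements: a single noisy chunk does not disqualify an encoding.
print(keep_candidate([0.0, 0.05, 0.0, 0.6], [False, False, False, True]))  # True
print(keep_candidate([0.3, 0.4, 0.25, 0.6], [True, True, False, True]))    # False
```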

    @staticmethod
-    def from_fp(fp, steps=10, chunk_size=512, threshold=0.09):
+    def from_fp(fp, steps=10, chunk_size=512, threshold=0.20):
        """
        :param io.BinaryIO fp:
        :param int steps:
@@ -365,7 +353,7 @@ def from_fp(fp, steps=10, chunk_size=512, threshold=0.09):
        )

    @staticmethod
-    def from_path(path, steps=10, chunk_size=512, threshold=0.09):
+    def from_path(path, steps=10, chunk_size=512, threshold=0.20):
        """
        :param str path:
        :param int steps:
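All four entry points (`normalize`, `from_bytes`, `from_fp` and `from_path`) now default to a more permissive threshold of 0.20 instead of 0.09. A direct call might look like the sketch below; iterating over the result set and the sample payload are assumptions, while `language` and `chaos` are the cached properties visible earlier in this diff:

```python
# Sketch: CharsetNormalizerMatches defines __len__ above; iteration over the
# match collection is assumed here.
from charset_normalizer import CharsetNormalizerMatches

payload = 'Voilà, ceci est un test.'.encode('cp1252')  # illustrative bytes

matches = CharsetNormalizerMatches.from_bytes(payload, threshold=0.20)

for match in matches:
    # `language` and `chaos` are the cached properties shown in this diff.
    print(match.language, match.chaos)
```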
71 changes: 65 additions & 6 deletions charset_normalizer/probe_chaos.py
@@ -1,14 +1,15 @@
# coding: utf-8
import re
+from functools import lru_cache

from dragonmapper.hanzi import MIXED, BOTH, UNKNOWN
from dragonmapper.hanzi import identify as s_identify
from zhon.hanzi import sentence as cjc_sentence_re

+from charset_normalizer.probe_coherence import HashableCounter
+from charset_normalizer.probe_words import ProbeWords
from charset_normalizer.unicode import UnicodeRangeIdentify

-from functools import lru_cache
-

@lru_cache(maxsize=8192)
class ProbeChaos:
@@ -48,14 +49,62 @@ def __init__(self, string, giveup_threshold=0.09):
        self.total_upper_accent_encountered_inner = 0
        self.total_unaccented_letter_encountered = 0

+        self._probe_word = ProbeWords(HashableCounter(self._string.split()))
+
        self.gave_up = False

        if len(self._string) >= 10:
            self._probe()

+    def __add__(self, other):
+        """
+        :param ProbeChaos other:
+        :return:
+        """
+        k_ = ProbeChaos('', self._threshold)
+
+        k_.successive_upper_lower = self.successive_upper_lower + other.successive_upper_lower
+        k_.successive_accent = self.successive_accent + other.successive_accent
+        k_.successive_different_unicode_range = self.successive_different_unicode_range + other.successive_different_unicode_range
+
+        for el in self.encountered_unicode_range:
+            k_.encountered_unicode_range.add(el)
+
+        for el in other.encountered_unicode_range:
+            k_.encountered_unicode_range.add(el)
+
+        k_.encountered_punc_sign = self.encountered_punc_sign + other.encountered_punc_sign
+        k_.unprintable = self.unprintable + other.unprintable
+        k_.encountered_white_space = self.encountered_white_space + other.encountered_white_space
+        k_.not_encountered_white_space = self.not_encountered_white_space + other.not_encountered_white_space
+
+        for u_name, u_occ in self.encountered_unicode_range_occurrences.items():
+            if u_name not in k_.encountered_unicode_range_occurrences.keys():
+                k_.encountered_unicode_range_occurrences[u_name] = 0
+            k_.encountered_unicode_range_occurrences[u_name] += u_occ
+
+        for u_name, u_occ in other.encountered_unicode_range_occurrences.items():
+            if u_name not in k_.encountered_unicode_range_occurrences.keys():
+                k_.encountered_unicode_range_occurrences[u_name] = 0
+            k_.encountered_unicode_range_occurrences[u_name] += u_occ
+
+        k_.not_encountered_white_space_reset = self.not_encountered_white_space_reset + other.not_encountered_white_space_reset
+        k_.total_letter_encountered = self.total_letter_encountered + other.total_letter_encountered
+        k_.total_lower_letter_encountered = self.total_lower_letter_encountered + other.total_lower_letter_encountered
+        k_.total_upper_accent_encountered = self.total_upper_accent_encountered + other.total_upper_accent_encountered
+        k_.total_upper_accent_encountered_inner = self.total_upper_accent_encountered_inner + other.total_upper_accent_encountered_inner
+        k_.total_unaccented_letter_encountered = self.total_unaccented_letter_encountered + other.total_unaccented_letter_encountered
+
+        k_._probe_word = self._probe_word + other._probe_word
+
+        k_._string = self._string + other._string
+
+        return k_
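The new `__add__` makes ProbeChaos instances mergeable: counters, range sets and the underlying strings are combined field by field, and the word probes are merged through ProbeWords' own `__add__`. A short usage sketch (sample strings illustrative):

```python
# Sketch: merging two chunk-level probes into one document-level measure.
from charset_normalizer import ProbeChaos

left = ProbeChaos('Hello, this is the first chunk. ')
right = ProbeChaos('And here is the second chunk of text.')

combined = left + right
print(combined.ratio)  # chaos ratio over the concatenated content
```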

    def _probe(self):

        c__ = False
+        upper_lower_m = False

        for c, i_ in zip(self._string, range(0, len(self._string))):

@@ -133,7 +182,13 @@ def _probe(self):
                continue

            if (is_lower and self.previous_printable_letter.isupper()) or (is_upper and self.previous_printable_letter.islower()):
-                self.successive_upper_lower += 1
+                if not upper_lower_m:
+                    upper_lower_m = True
+                else:
+                    self.successive_upper_lower += 1
+                    upper_lower_m = False
+            else:
+                upper_lower_m = False

            if is_latin:
                self.previous_encountered_unicode_range = u_name
@@ -154,15 +209,19 @@ def _probe(self):

    @staticmethod
    def _unravel_cjk_suspicious_chinese(string, encountered_unicode_range_occurrences):
+        if len(string) <= 10:
+            return UNKNOWN
+
        encountered_unicode_range = encountered_unicode_range_occurrences.keys()

        if 'CJK Unified Ideographs' in encountered_unicode_range and ('Hiragana' not in encountered_unicode_range and 'Katakana' not in encountered_unicode_range):
            i_ = s_identify(string)
            if i_ in [MIXED, BOTH]:
                return encountered_unicode_range_occurrences['CJK Unified Ideographs']
-            elif i_ != UNKNOWN and len(re.findall(cjc_sentence_re, string)) == 0:
-                return encountered_unicode_range_occurrences['CJK Unified Ideographs']
+            elif i_ != UNKNOWN and len(re.findall(cjc_sentence_re, string)) > 0:
+                return -encountered_unicode_range_occurrences['CJK Unified Ideographs']
+            elif i_ != UNKNOWN:
+                return int(encountered_unicode_range_occurrences['CJK Unified Ideographs']*0.3)

        return UNKNOWN
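The CJK heuristic now cuts both ways: text identified as Chinese that also contains CJK sentence punctuation earns a negative adjustment (a bonus), identified but unpunctuated text takes a reduced 30% penalty, and inputs of 10 characters or fewer are skipped outright. A standalone paraphrase, not the library function; plain booleans replace the dragonmapper identification constants, and returning 0 stands in for the UNKNOWN sentinel:

```python
# Paraphrase of the revised CJK adjustment above (illustrative only).
def cjk_adjustment(cjk_count, identified, is_mixed_or_both, has_sentence_punctuation):
    if is_mixed_or_both:
        return cjk_count             # suspicious script mix: full penalty
    if identified and has_sentence_punctuation:
        return -cjk_count            # plausible Chinese prose: bonus
    if identified:
        return int(cjk_count * 0.3)  # identified but unpunctuated: reduced penalty
    return 0                         # stands in for the UNKNOWN sentinel

print(cjk_adjustment(40, True, False, True))   # -40
print(cjk_adjustment(40, True, False, False))  # 12
```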

@@ -178,4 +237,4 @@ def ratio(self):
        r_ = self.total_upper_accent_encountered if self.total_letter_encountered > 0 and self.total_unaccented_letter_encountered / self.total_letter_encountered < 0.5 else 0
        z_ = UnicodeRangeIdentify.unravel_suspicious_ranges(len(self._string), self.encountered_unicode_range_occurrences)
        p_ = self.encountered_punc_sign if self.encountered_punc_sign / len(self._string) > 0.2 else 0
-        return (r_ + p_ + self.successive_upper_lower + self.successive_accent + self.successive_different_unicode_range + self.not_encountered_white_space + self.unprintable + z_ + ProbeChaos._unravel_cjk_suspicious_chinese.__func__(self._string, self.encountered_unicode_range_occurrences)) / len(self._string)  # + len(self.encountered_unicode_range)-1
+        return ((r_ + p_ + self.successive_upper_lower + self.successive_accent + self.successive_different_unicode_range + self.not_encountered_white_space + self.unprintable + z_ + ProbeChaos._unravel_cjk_suspicious_chinese.__func__(self._string, self.encountered_unicode_range_occurrences)) / len(self._string)) + self._probe_word.ratio  # + len(self.encountered_unicode_range)-1
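The ratio keeps its per-character normalisation but now adds the word-level probe's ratio on top, un-normalised. Reduced to its arithmetic shape (illustrative numbers, not library code):

```python
# Shape of ProbeChaos.ratio after this change: character-level chaos points
# are divided by the text length, then the word probe's ratio is added.
def chaos_ratio(char_signal_sum, text_length, probe_word_ratio):
    return char_signal_sum / text_length + probe_word_ratio

# Example: 6 chaos points across 120 characters plus a word ratio of 0.1.
print(chaos_ratio(6, 120, 0.1))  # 0.15
```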
