From 4503b741ed2087760187ec87759066c35edbb8f4 Mon Sep 17 00:00:00 2001 From: Dan Blanchard Date: Fri, 11 Dec 2020 21:52:06 -0500 Subject: [PATCH] Count symbols and digits for total characters in SBCS prober --- chardet/sbcharsetprober.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/chardet/sbcharsetprober.py b/chardet/sbcharsetprober.py index e29ddec2..a49e8160 100644 --- a/chardet/sbcharsetprober.py +++ b/chardet/sbcharsetprober.py @@ -104,21 +104,9 @@ def feed(self, byte_str): language_model = self._model.language_model for char in byte_str: order = char_to_order_map.get(char, CharacterCategory.UNDEFINED) - # XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but - # CharacterCategory.SYMBOL is actually 253, so we use CONTROL - # to make it closer to the original intent. The only difference - # is whether or not we count digits and control characters for - # _total_char purposes. - if order < CharacterCategory.CONTROL: + if order < CharacterCategory.SYMBOL: self._total_char += 1 -<<<<<<< HEAD - if order < self.SAMPLE_SIZE: -======= - # TODO: Follow uchardet's lead and discount confidence for frequent - # control characters. - # See https://github.com/BYVoid/uchardet/commit/55b4f23971db61 if order < CharacterCategory.CONTROL: ->>>>>>> 36504df (Add SingleByteCharSetModel for use by new model training code) self._freq_char += 1 if self._last_order < CharacterCategory.CONTROL: self._total_seqs += 1