From 7ff850789076f113dcb79393d5d8fa2ea1de498b Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Fri, 23 Jul 2021 21:34:05 +0200 Subject: [PATCH 1/3] :sparkle: Adjust the MD to lower the sensitivity around certain CSVs ..having many columns. --- charset_normalizer/md.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py index 4f9c7140..d7f14788 100644 --- a/charset_normalizer/md.py +++ b/charset_normalizer/md.py @@ -56,7 +56,7 @@ def eligible(self, character: str) -> bool: def feed(self, character: str) -> None: self._character_count += 1 - if character != self._last_printable_char and character not in ["<", ">", "=", ":", "/", "&", ";", "{", "}", "[", "]"]: + if character != self._last_printable_char and character not in ["<", ">", "=", ":", "/", "&", ";", "{", "}", "[", "]", ",", "|", '"']: if is_punctuation(character): self._punctuation_count += 1 elif character.isdigit() is False and is_symbol(character): @@ -116,7 +116,7 @@ def eligible(self, character: str) -> bool: return True def feed(self, character: str) -> None: - if character not in {'\n', '\t', '\r'} and character.isprintable() is False: + if character not in {'\n', '\t', '\r', '\t'} and character.isprintable() is False: self._unprintable_count += 1 self._character_count += 1 From 0112a0403832da76b94f25a88a3602eb7b768f85 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Fri, 23 Jul 2021 22:30:13 +0200 Subject: [PATCH 2/3] :wrench: :heavy_check_mark: 0.01 MD ratio diff on single subtest is OK --- tests/test_probe_chaos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_probe_chaos.py b/tests/test_probe_chaos.py index b43ba0ac..96342408 100644 --- a/tests/test_probe_chaos.py +++ b/tests/test_probe_chaos.py @@ -40,7 +40,7 @@ def test_subtle_gibberish(self): self.assertGreater( mess_ratio("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v"), - 0.8 + 0.7 ) self.assertGreater( From 775b3c43237fd9fa7e0fa1e3770f898b13d68385 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Fri, 23 Jul 2021 23:01:06 +0200 Subject: [PATCH 3/3] :wrench: Remove tab duplicate, using vertical tab instead --- charset_normalizer/md.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py index d7f14788..7d948e80 100644 --- a/charset_normalizer/md.py +++ b/charset_normalizer/md.py @@ -116,7 +116,7 @@ def eligible(self, character: str) -> bool: return True def feed(self, character: str) -> None: - if character not in {'\n', '\t', '\r', '\t'} and character.isprintable() is False: + if character not in {'\n', '\t', '\r', '\v'} and character.isprintable() is False: self._unprintable_count += 1 self._character_count += 1