From 9fc69852287cb66f51130116613d7cd352cfac77 Mon Sep 17 00:00:00 2001 From: TAHRI Ahmed R Date: Fri, 23 Jul 2021 14:46:45 -0700 Subject: [PATCH] =?UTF-8?q?=E2=9D=87=EF=B8=8F=20=20Adjust=20the=20MD=20to?= =?UTF-8?q?=20lower=20the=20sensitivity=20around=20certain=20CSVs=20(#69)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * :sparkle: Adjust the MD to lower the sensitivity around certain CSVs ..having many columns. * :wrench: :heavy_check_mark: 0.01 MD ratio diff on single subtest is OK * :wrench: Remove tab duplicate, using vertical tab instead --- charset_normalizer/md.py | 4 ++-- tests/test_probe_chaos.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py index 4f9c7140..7d948e80 100644 --- a/charset_normalizer/md.py +++ b/charset_normalizer/md.py @@ -56,7 +56,7 @@ def eligible(self, character: str) -> bool: def feed(self, character: str) -> None: self._character_count += 1 - if character != self._last_printable_char and character not in ["<", ">", "=", ":", "/", "&", ";", "{", "}", "[", "]"]: + if character != self._last_printable_char and character not in ["<", ">", "=", ":", "/", "&", ";", "{", "}", "[", "]", ",", "|", '"']: if is_punctuation(character): self._punctuation_count += 1 elif character.isdigit() is False and is_symbol(character): @@ -116,7 +116,7 @@ def eligible(self, character: str) -> bool: return True def feed(self, character: str) -> None: - if character not in {'\n', '\t', '\r'} and character.isprintable() is False: + if character not in {'\n', '\t', '\r', '\v'} and character.isprintable() is False: self._unprintable_count += 1 self._character_count += 1 diff --git a/tests/test_probe_chaos.py b/tests/test_probe_chaos.py index b43ba0ac..96342408 100644 --- a/tests/test_probe_chaos.py +++ b/tests/test_probe_chaos.py @@ -40,7 +40,7 @@ def test_subtle_gibberish(self): self.assertGreater( mess_ratio("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v"), - 0.8 + 0.7 ) self.assertGreater(