Code style as refactored by Sourcery #131

Merged
7 commits merged on Oct 28, 2021
7 changes: 3 additions & 4 deletions bin/bc.py
@@ -18,11 +18,10 @@ def calc_equivalence(content: bytes, cp_a: str, cp_b: str):
         return 0.

     character_count = len(str_a)
-    diff_character_count = 0
+    diff_character_count = sum(
+        chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)
+    )

-    for chr_a, chr_b in zip(str_a, str_b):
-        if chr_a != chr_b:
-            diff_character_count += 1

     return 1. - (diff_character_count / character_count)

7 changes: 3 additions & 4 deletions bin/coverage.py
@@ -15,11 +15,10 @@ def calc_equivalence(content: bytes, cp_a: str, cp_b: str):
     str_b = content.decode(cp_b)

     character_count = len(str_a)
-    diff_character_count = 0
+    diff_character_count = sum(
+        chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)
+    )

-    for chr_a, chr_b in zip(str_a, str_b):
-        if chr_a != chr_b:
-            diff_character_count += 1

     return 1. - (diff_character_count / character_count)

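Both bin/bc.py and bin/coverage.py get the same Sourcery rewrite: the hand-rolled mismatch counter becomes a sum() over a generator of booleans, where True counts as 1. A quick standalone check of the equivalence — the function names and sample strings below are illustrative, not part of the PR:

```python
def count_mismatches_loop(str_a: str, str_b: str) -> int:
    # Original style: explicit counter incremented inside a loop.
    count = 0
    for chr_a, chr_b in zip(str_a, str_b):
        if chr_a != chr_b:
            count += 1
    return count


def count_mismatches_sum(str_a: str, str_b: str) -> int:
    # Refactored style: booleans are summed directly (True == 1, False == 0).
    return sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b))


assert count_mismatches_loop("kitten", "sitten") == count_mismatches_sum("kitten", "sitten") == 1
```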
22 changes: 8 additions & 14 deletions charset_normalizer/api.py
@@ -131,7 +131,7 @@ def from_bytes(
     prioritized_encodings = []  # type: List[str]

     specified_encoding = (
-        any_specified_encoding(sequences) if preemptive_behaviour is True else None
+        any_specified_encoding(sequences) if preemptive_behaviour else None
     )  # type: Optional[str]

     if specified_encoding is not None:
@@ -185,7 +185,7 @@ def from_bytes(
             encoding_iana
         )  # type: bool

-        if encoding_iana in {"utf_16", "utf_32"} and bom_or_sig_available is False:
+        if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
             logger.info(
                 "Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
                 encoding_iana,
@@ -241,7 +241,7 @@ def from_bytes(
                 continue

         r_ = range(
-            0 if bom_or_sig_available is False else len(sig_payload),
+            0 if not bom_or_sig_available else len(sig_payload),
             length,
             int(length / steps),
         )
@@ -261,9 +261,7 @@ def from_bytes(

         max_chunk_gave_up = int(len(r_) / 4)  # type: int

-        if max_chunk_gave_up < 2:
-            max_chunk_gave_up = 2
-
+        max_chunk_gave_up = max(max_chunk_gave_up, 2)
         early_stop_count = 0  # type: int

         md_chunks = []  # type: List[str]
@@ -281,9 +279,7 @@ def from_bytes(
             # not the cleanest way to perform that fix but clever enough for now.
             if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:

-                chunk_partial_size_chk = (
-                    16 if chunk_size > 16 else chunk_size
-                )  # type: int
+                chunk_partial_size_chk = min(chunk_size, 16)  # type: int

                 if (
                     decoded_payload
@@ -312,11 +308,9 @@ def from_bytes(
                 ):
                     break

-        if md_ratios:
-            mean_mess_ratio = sum(md_ratios) / len(md_ratios)  # type: float
-        else:
-            mean_mess_ratio = 0.0
-
+        mean_mess_ratio = (
+            sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
+        )  # type: float
         if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
             tested_but_soft_failure.append(encoding_iana)
             logger.warning(
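The api.py edits are all one family of simplification: redundant `is True` / `is False` comparisons are dropped, and small if/else blocks are folded into max(), min(), or a conditional expression. A minimal sketch of the clamping idiom — the helper names and values are illustrative, not the PR's:

```python
def clamp_chunk_budget(chunk_count: int) -> int:
    # Before: if budget < 2: budget = 2 -- after: max() states the floor directly.
    return max(int(chunk_count / 4), 2)


def partial_chunk_size(chunk_size: int) -> int:
    # min() caps a value the same way max() floors one.
    return min(chunk_size, 16)


assert clamp_chunk_budget(4) == 2
assert clamp_chunk_budget(40) == 10
assert partial_chunk_size(64) == 16
assert partial_chunk_size(8) == 8
```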
37 changes: 13 additions & 24 deletions charset_normalizer/cd.py
@@ -119,9 +119,9 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
     target_pure_latin = True  # type: bool

     for character in FREQUENCIES[language]:
-        if target_have_accents is False and is_accentuated(character):
+        if not target_have_accents and is_accentuated(character):
             target_have_accents = True
-        if target_pure_latin is True and is_latin(character) is False:
+        if target_pure_latin and is_latin(character) is False:
             target_pure_latin = False

     return target_have_accents, target_pure_latin
@@ -135,12 +135,7 @@ def alphabet_languages(
     """
     languages = []  # type: List[Tuple[str, float]]

-    source_have_accents = False  # type: bool
-
-    for character in characters:
-        if is_accentuated(character):
-            source_have_accents = True
-            break
+    source_have_accents = any(is_accentuated(character) for character in characters)

     for language, language_characters in FREQUENCIES.items():

@@ -273,8 +268,6 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
     The return type is the same as coherence_ratio.
     """
     per_language_ratios = OrderedDict()  # type: Dict[str, List[float]]
-    merge = []  # type: CoherenceMatches
-
     for result in results:
         for sub_result in result:
             language, ratio = sub_result
@@ -283,17 +276,16 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
                 continue
             per_language_ratios[language].append(ratio)

-    for language in per_language_ratios:
-        merge.append(
-            (
-                language,
-                round(
-                    sum(per_language_ratios[language])
-                    / len(per_language_ratios[language]),
-                    4,
-                ),
-            )
-        )
+    merge = [
+        (
+            language,
+            round(
+                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
+                4,
+            ),
+        )
+        for language in per_language_ratios
+    ]

     return sorted(merge, key=lambda x: x[1], reverse=True)

@@ -308,14 +300,11 @@ def coherence_ratio(
     """

     results = []  # type: List[Tuple[str, float]]
-    lg_inclusion_list = []  # type: List[str]
     ignore_non_latin = False  # type: bool

     sufficient_match_count = 0  # type: int

-    if lg_inclusion is not None:
-        lg_inclusion_list = lg_inclusion.split(",")
-
+    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
     if "Latin Based" in lg_inclusion_list:
         ignore_non_latin = True
         lg_inclusion_list.remove("Latin Based")
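Two rewrites recur in cd.py: a flag-and-break loop becomes a short-circuiting any(), and a build-by-append loop becomes a list comprehension. A standalone sketch under made-up names and data — the accent check is a toy stand-in for is_accentuated, not the module's real logic:

```python
from collections import OrderedDict
from typing import Dict, List, Tuple


def has_accented_character(characters: str) -> bool:
    # any() short-circuits on the first hit, like the old flag-and-break loop.
    return any(character in "éàçñü" for character in characters)


def average_ratios(per_language: Dict[str, List[float]]) -> List[Tuple[str, float]]:
    # Build-by-append loop folded into a comprehension, then sorted by ratio,
    # mirroring the merge_coherence_ratios rewrite.
    merged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in per_language.items()
    ]
    return sorted(merged, key=lambda item: item[1], reverse=True)


assert has_accented_character("café") and not has_accented_character("cafe")
ratios = OrderedDict([("English", [0.9, 0.7]), ("French", [0.95])])
assert average_ratios(ratios) == [("French", 0.95), ("English", 0.8)]
```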
29 changes: 14 additions & 15 deletions charset_normalizer/cli/normalizer.py
@@ -235,20 +235,19 @@ def cli_detect(argv: List[str] = None) -> int:
                 o_.insert(-1, best_guess.encoding)
                 if my_file.closed is False:
                     my_file.close()
-            else:
-                if (
-                    args.force is False
-                    and query_yes_no(
-                        'Are you sure to normalize "{}" by replacing it ?'.format(
-                            my_file.name
-                        ),
-                        "no",
-                    )
-                    is False
-                ):
-                    if my_file.closed is False:
-                        my_file.close()
-                    continue
+            elif (
+                args.force is False
+                and query_yes_no(
+                    'Are you sure to normalize "{}" by replacing it ?'.format(
+                        my_file.name
+                    ),
+                    "no",
+                )
+                is False
+            ):
+                if my_file.closed is False:
+                    my_file.close()
+                continue

             try:
                 x_[0].unicode_path = abspath("./{}".format(".".join(o_)))
@@ -277,7 +276,7 @@ def cli_detect(argv: List[str] = None) -> int:
             print(
                 ", ".join(
                     [
-                        el.encoding if el.encoding else "undefined"
+                        el.encoding or "undefined"
                         for el in x_
                         if el.path == abspath(my_file.name)
                     ]
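The `el.encoding if el.encoding else "undefined"` to `el.encoding or "undefined"` change relies on `or` returning its right operand for any falsy left operand; the two forms agree for every input, and here the attribute is expected to be a non-empty encoding name or None. A tiny illustrative check — the helper name is made up:

```python
def encoding_label(encoding):
    # "x if x else y" and "x or y" pick the same value for any falsy x,
    # including None and "".
    return encoding or "undefined"


assert encoding_label("utf_8") == "utf_8"
assert encoding_label(None) == "undefined"
assert encoding_label("") == "undefined"
```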
3 changes: 2 additions & 1 deletion charset_normalizer/constant.py
@@ -32,7 +32,7 @@
 UTF8_MAXIMAL_ALLOCATION = 1112064  # type: int

 UNICODE_RANGES_COMBINED = {
-    "Control character": range(0, 31 + 1),
+    "Control character": range(31 + 1),
     "Basic Latin": range(32, 127 + 1),
     "Latin-1 Supplement": range(128, 255 + 1),
     "Latin Extended-A": range(256, 383 + 1),
@@ -313,6 +313,7 @@
     "Variation Selectors Supplement": range(917760, 917999 + 1),
 }  # type: Dict[str, range]

+
 UNICODE_SECONDARY_RANGE_KEYWORD = [
     "Supplement",
     "Extended",
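`range(31 + 1)` describes the same sequence as `range(0, 31 + 1)`: a start of 0 is the default, and range objects compare equal when they cover the same values. A quick check with illustrative numbers:

```python
assert range(31 + 1) == range(0, 31 + 1)
assert list(range(3)) == [0, 1, 2]
```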
41 changes: 22 additions & 19 deletions charset_normalizer/md.py
@@ -170,15 +170,16 @@ def eligible(self, character: str) -> bool:

     def feed(self, character: str) -> None:
         self._character_count += 1
-        if self._last_latin_character is not None:
-            if is_accentuated(character) and is_accentuated(self._last_latin_character):
-                if character.isupper() and self._last_latin_character.isupper():
-                    self._successive_count += 1
-                # Worse if its the same char duplicated with different accent.
-                if remove_accent(character) == remove_accent(
-                    self._last_latin_character
-                ):
-                    self._successive_count += 1
+        if (
+            self._last_latin_character is not None
+            and is_accentuated(character)
+            and is_accentuated(self._last_latin_character)
+        ):
+            if character.isupper() and self._last_latin_character.isupper():
+                self._successive_count += 1
+            # Worse if its the same char duplicated with different accent.
+            if remove_accent(character) == remove_accent(self._last_latin_character):
+                self._successive_count += 1
         self._last_latin_character = character

     def reset(self) -> None:  # pragma: no cover
@@ -346,7 +347,7 @@ def eligible(self, character: str) -> bool:
         return True

     def feed(self, character: str) -> None:
-        if character in ["丅", "丄"]:
+        if character in {"丅", "丄"}:
             self._wrong_stop_count += 1
             return
         if is_cjk(character):
@@ -459,9 +460,10 @@ def is_suspiciously_successive_range(

     # Latin characters can be accompanied with a combining diacritical mark
     # eg. Vietnamese.
-    if "Latin" in unicode_range_a or "Latin" in unicode_range_b:
-        if "Combining" in unicode_range_a or "Combining" in unicode_range_b:
-            return False
+    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
+        "Combining" in unicode_range_a or "Combining" in unicode_range_b
+    ):
+        return False

     keywords_range_a, keywords_range_b = unicode_range_a.split(
         " "
@@ -482,11 +484,12 @@
         ),
         unicode_range_b in ("Hiragana", "Katakana"),
     )
-    if range_a_jp_chars or range_b_jp_chars:
-        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
-            return False
-    if range_a_jp_chars and range_b_jp_chars:
-        return False
+    if (range_a_jp_chars or range_b_jp_chars) and (
+        "CJK" in unicode_range_a or "CJK" in unicode_range_b
+    ):
+        return False
+    if range_a_jp_chars and range_b_jp_chars:
+        return False

     if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
         if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
@@ -530,7 +533,7 @@ def mess_ratio(
     else:
         intermediary_mean_mess_ratio_calc = 128

-    for character, index in zip(decoded_sequence + "\n", range(0, length)):
+    for character, index in zip(decoded_sequence + "\n", range(length)):
         for detector in detectors:
             if detector.eligible(character):
                 detector.feed(character)
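The md.py changes mostly flatten nested guards into one condition joined with `and`, which short-circuits exactly as the nesting did, and swap list literals for set literals in membership tests. A small equivalence sketch with hypothetical names, not the detector classes themselves:

```python
from itertools import product


def flag_nested(a: bool, b: bool, c: bool) -> bool:
    # Nested guards, mirroring the original feed() structure.
    if a:
        if b and c:
            return True
    return False


def flag_flat(a: bool, b: bool, c: bool) -> bool:
    # Flattened guard: `and` short-circuits, so b and c are only
    # evaluated when a is truthy, exactly like the nested version.
    return bool(a and b and c)


assert all(
    flag_nested(*combo) == flag_flat(*combo)
    for combo in product([True, False], repeat=3)
)
```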
3 changes: 1 addition & 2 deletions charset_normalizer/models.py
@@ -284,8 +284,7 @@ def __init__(self, results: List[CharsetMatch] = None):
         self._results = sorted(results) if results else []  # type: List[CharsetMatch]

     def __iter__(self) -> Iterator[CharsetMatch]:
-        for result in self._results:
-            yield result
+        yield from self._results

     def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
         """
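`yield from` delegates iteration to the wrapped list, replacing the explicit for/yield loop with identical behaviour. A self-contained sketch with a hypothetical container, not the real CharsetMatches class:

```python
from typing import Iterator, List


class Matches:
    def __init__(self, results: List[str]) -> None:
        self._results = results

    def __iter__(self) -> Iterator[str]:
        # Equivalent to: for result in self._results: yield result
        yield from self._results


assert list(Matches(["utf_8", "latin_1"])) == ["utf_8", "latin_1"]
```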
18 changes: 6 additions & 12 deletions charset_normalizer/utils.py
@@ -122,7 +122,7 @@ def is_emoticon(character: str) -> bool:

 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_separator(character: str) -> bool:
-    if character.isspace() or character in ["|", "+", ",", ";", "<", ">"]:
+    if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
         return True

     character_category = unicodedata.category(character)  # type: str
@@ -138,7 +138,7 @@ def is_case_variable(character: str) -> bool:
 def is_private_use_only(character: str) -> bool:
     character_category = unicodedata.category(character)  # type: str

-    return "Co" == character_category
+    return character_category == "Co"


 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
@@ -193,11 +193,7 @@ def is_thai(character: str) -> bool:

 @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
 def is_unicode_range_secondary(range_name: str) -> bool:
-    for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
-        if keyword in range_name:
-            return True
-
-    return False
+    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


 def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
@@ -211,9 +207,7 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional

     results = findall(
         RE_POSSIBLE_ENCODING_INDICATION,
-        sequence[: seq_len if seq_len <= search_zone else search_zone].decode(
-            "ascii", errors="ignore"
-        ),
+        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
     )  # type: List[str]

     if len(results) == 0:
@@ -278,7 +272,7 @@ def iana_name(cp_name: str, strict: bool = True) -> str:
     cp_name = cp_name.lower().replace("-", "_")

     for encoding_alias, encoding_iana in aliases.items():
-        if cp_name == encoding_alias or cp_name == encoding_iana:
+        if cp_name in [encoding_alias, encoding_iana]:
             return encoding_iana

     if strict:
@@ -314,7 +308,7 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:

     character_match_count = 0  # type: int

-    for i in range(0, 255):
+    for i in range(255):
         to_be_decoded = bytes([i])  # type: bytes
         if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
             character_match_count += 1
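The utils.py edits swap list literals for set literals in membership tests (hash lookup instead of a linear scan) and let min() pick the slice bound instead of a conditional expression. A standalone sketch; the helper names and sample bytes are illustrative, not the library's API:

```python
SEPARATORS = {"|", "+", ",", ";", "<", ">"}  # set literal: O(1) membership test


def looks_like_separator(character: str) -> bool:
    # Same result as the old list-based check, but hashing instead of scanning.
    return character.isspace() or character in SEPARATORS


def clipped_prefix(payload: bytes, search_zone: int = 4096) -> str:
    # min() replaces the "x if x <= limit else limit" slice bound.
    return payload[: min(len(payload), search_zone)].decode("ascii", errors="ignore")


assert looks_like_separator(";") and looks_like_separator(" ")
assert clipped_prefix(b"charset=utf-8") == "charset=utf-8"
```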