
Commit

Code style as refactored by Sourcery (#131)
Co-authored-by: Sourcery AI <>
adbar committed Oct 28, 2021
1 parent b89913a commit 4c69feb
Showing 9 changed files with 72 additions and 95 deletions.

bin/bc.py: 7 changes (3 additions & 4 deletions)

@@ -18,11 +18,10 @@ def calc_equivalence(content: bytes, cp_a: str, cp_b: str):
         return 0.

     character_count = len(str_a)
-    diff_character_count = 0
+    diff_character_count = sum(
+        chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)
+    )

-    for chr_a, chr_b in zip(str_a, str_b):
-        if chr_a != chr_b:
-            diff_character_count += 1

     return 1. - (diff_character_count / character_count)
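
Both helper scripts get the same refactoring: the hand-rolled counter loop becomes a sum() over a generator expression, which works because each chr_a != chr_b comparison is a bool, and bool is a subclass of int in Python. A minimal, self-contained sketch of the pattern (illustrative function name, not part of the repository):

    def count_differing_chars(str_a: str, str_b: str) -> int:
        # Each True comparison contributes 1 to the sum, each False contributes 0.
        return sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b))

    assert count_differing_chars("kitten", "sitten") == 1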

bin/coverage.py: 7 changes (3 additions & 4 deletions)

@@ -15,11 +15,10 @@ def calc_equivalence(content: bytes, cp_a: str, cp_b: str):
     str_b = content.decode(cp_b)

     character_count = len(str_a)
-    diff_character_count = 0
+    diff_character_count = sum(
+        chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)
+    )

-    for chr_a, chr_b in zip(str_a, str_b):
-        if chr_a != chr_b:
-            diff_character_count += 1

     return 1. - (diff_character_count / character_count)

charset_normalizer/api.py: 22 changes (8 additions & 14 deletions)

@@ -131,7 +131,7 @@ def from_bytes(
     prioritized_encodings = []  # type: List[str]

     specified_encoding = (
-        any_specified_encoding(sequences) if preemptive_behaviour is True else None
+        any_specified_encoding(sequences) if preemptive_behaviour else None
     )  # type: Optional[str]

     if specified_encoding is not None:

@@ -185,7 +185,7 @@ def from_bytes(
             encoding_iana
         )  # type: bool

-        if encoding_iana in {"utf_16", "utf_32"} and bom_or_sig_available is False:
+        if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
             logger.info(
                 "Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
                 encoding_iana,

@@ -241,7 +241,7 @@ def from_bytes(
             continue

         r_ = range(
-            0 if bom_or_sig_available is False else len(sig_payload),
+            0 if not bom_or_sig_available else len(sig_payload),
             length,
             int(length / steps),
         )

@@ -261,9 +261,7 @@ def from_bytes(

         max_chunk_gave_up = int(len(r_) / 4)  # type: int

-        if max_chunk_gave_up < 2:
-            max_chunk_gave_up = 2
-
+        max_chunk_gave_up = max(max_chunk_gave_up, 2)
         early_stop_count = 0  # type: int

         md_chunks = []  # type: List[str]

@@ -281,9 +279,7 @@ def from_bytes(
             # not the cleanest way to perform that fix but clever enough for now.
             if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:

-                chunk_partial_size_chk = (
-                    16 if chunk_size > 16 else chunk_size
-                )  # type: int
+                chunk_partial_size_chk = min(chunk_size, 16)  # type: int

                 if (
                     decoded_payload

@@ -312,11 +308,9 @@ def from_bytes(
             ):
                 break

-        if md_ratios:
-            mean_mess_ratio = sum(md_ratios) / len(md_ratios)  # type: float
-        else:
-            mean_mess_ratio = 0.0
-
+        mean_mess_ratio = (
+            sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
+        )  # type: float
         if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
             tested_but_soft_failure.append(encoding_iana)
             logger.warning(
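
The api.py hunks apply a handful of independent simplifications: redundant "is True" / "is False" checks are dropped in favour of plain truthiness, an if-based lower bound becomes max(), an upper bound becomes min(), and a four-line if/else assignment becomes a conditional expression. A rough, self-contained illustration of the equivalences with made-up values (not the library's actual data):

    md_ratios = [0.5, 1.5]
    chunk_size = 64
    max_chunk_gave_up = 1

    # Clamp to a lower bound with max() instead of `if max_chunk_gave_up < 2: ... = 2`.
    max_chunk_gave_up = max(max_chunk_gave_up, 2)

    # Cap to an upper bound with min() instead of `16 if chunk_size > 16 else chunk_size`.
    chunk_partial_size_chk = min(chunk_size, 16)

    # Conditional expression instead of an if/else block; 0.0 covers the empty case.
    mean_mess_ratio = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0

    assert (max_chunk_gave_up, chunk_partial_size_chk, mean_mess_ratio) == (2, 16, 1.0)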

charset_normalizer/cd.py: 37 changes (13 additions & 24 deletions)

@@ -119,9 +119,9 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
     target_pure_latin = True  # type: bool

     for character in FREQUENCIES[language]:
-        if target_have_accents is False and is_accentuated(character):
+        if not target_have_accents and is_accentuated(character):
             target_have_accents = True
-        if target_pure_latin is True and is_latin(character) is False:
+        if target_pure_latin and is_latin(character) is False:
             target_pure_latin = False

     return target_have_accents, target_pure_latin

@@ -135,12 +135,7 @@ def alphabet_languages(
     """
     languages = []  # type: List[Tuple[str, float]]

-    source_have_accents = False  # type: bool
-
-    for character in characters:
-        if is_accentuated(character):
-            source_have_accents = True
-            break
+    source_have_accents = any(is_accentuated(character) for character in characters)

     for language, language_characters in FREQUENCIES.items():

@@ -273,8 +268,6 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
     The return type is the same as coherence_ratio.
     """
     per_language_ratios = OrderedDict()  # type: Dict[str, List[float]]
-    merge = []  # type: CoherenceMatches
-
     for result in results:
         for sub_result in result:
             language, ratio = sub_result

@@ -283,17 +276,16 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
                 continue
             per_language_ratios[language].append(ratio)

-    for language in per_language_ratios:
-        merge.append(
-            (
-                language,
-                round(
-                    sum(per_language_ratios[language])
-                    / len(per_language_ratios[language]),
-                    4,
-                ),
-            )
-        )
+    merge = [
+        (
+            language,
+            round(
+                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
+                4,
+            ),
+        )
+        for language in per_language_ratios
+    ]

     return sorted(merge, key=lambda x: x[1], reverse=True)

@@ -308,14 +300,11 @@ def coherence_ratio(
     """

     results = []  # type: List[Tuple[str, float]]
-    lg_inclusion_list = []  # type: List[str]
     ignore_non_latin = False  # type: bool

     sufficient_match_count = 0  # type: int

-    if lg_inclusion is not None:
-        lg_inclusion_list = lg_inclusion.split(",")
-
+    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
     if "Latin Based" in lg_inclusion_list:
         ignore_non_latin = True
         lg_inclusion_list.remove("Latin Based")
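
cd.py follows the same rules: an early-exit search loop collapses into any(), an append-in-a-loop becomes a list comprehension, and a guarded assignment becomes a conditional expression. Sketched below with toy data (the real code iterates FREQUENCIES and CoherenceMatches):

    per_language_ratios = {"English": [0.8, 0.6], "French": [0.4]}

    # any() replaces the for/if/break scan for an accentuated character.
    source_have_accents = any(ch in "éàü" for ch in "plain ascii text")

    # A list comprehension replaces merge = [] followed by merge.append(...) in a loop.
    merge = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in per_language_ratios.items()
    ]

    # A conditional expression replaces `if lg_inclusion is not None: ...`.
    lg_inclusion = None
    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []

    assert source_have_accents is False
    assert sorted(merge, key=lambda x: x[1], reverse=True)[0][0] == "English"
    assert lg_inclusion_list == []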

charset_normalizer/cli/normalizer.py: 29 changes (14 additions & 15 deletions)

@@ -235,20 +235,19 @@ def cli_detect(argv: List[str] = None) -> int:
                     o_.insert(-1, best_guess.encoding)
                     if my_file.closed is False:
                         my_file.close()
-                else:
-                    if (
-                        args.force is False
-                        and query_yes_no(
-                            'Are you sure to normalize "{}" by replacing it ?'.format(
-                                my_file.name
-                            ),
-                            "no",
-                        )
-                        is False
-                    ):
-                        if my_file.closed is False:
-                            my_file.close()
-                        continue
+                elif (
+                    args.force is False
+                    and query_yes_no(
+                        'Are you sure to normalize "{}" by replacing it ?'.format(
+                            my_file.name
+                        ),
+                        "no",
+                    )
+                    is False
+                ):
+                    if my_file.closed is False:
+                        my_file.close()
+                    continue

                 try:
                     x_[0].unicode_path = abspath("./{}".format(".".join(o_)))

@@ -277,7 +276,7 @@ def cli_detect(argv: List[str] = None) -> int:
            print(
                ", ".join(
                    [
-                        el.encoding if el.encoding else "undefined"
+                        el.encoding or "undefined"
                        for el in x_
                        if el.path == abspath(my_file.name)
                    ]
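
Two things happen in the CLI: an else: that wraps nothing but another if is flattened into elif, removing one indentation level without changing control flow, and el.encoding if el.encoding else "undefined" becomes el.encoding or "undefined", which is equivalent because or returns its right operand whenever the left one is falsy (None or an empty string). A small sketch of both patterns, with hypothetical names:

    encoding = None
    label = encoding or "undefined"
    assert label == "undefined"

    def describe(n: int) -> str:
        # elif replaces `else:` followed by a lone nested `if`.
        if n > 0:
            return "positive"
        elif n == 0:
            return "zero"
        return "negative"

    assert describe(0) == "zero"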

charset_normalizer/constant.py: 3 changes (2 additions & 1 deletion)

@@ -32,7 +32,7 @@
 UTF8_MAXIMAL_ALLOCATION = 1112064  # type: int

 UNICODE_RANGES_COMBINED = {
-    "Control character": range(0, 31 + 1),
+    "Control character": range(31 + 1),
     "Basic Latin": range(32, 127 + 1),
     "Latin-1 Supplement": range(128, 255 + 1),
     "Latin Extended-A": range(256, 383 + 1),

@@ -313,6 +313,7 @@
     "Variation Selectors Supplement": range(917760, 917999 + 1),
 }  # type: Dict[str, range]

+
 UNICODE_SECONDARY_RANGE_KEYWORD = [
     "Supplement",
     "Extended",
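
The substantive edit here is cosmetic: range defaults its start to 0, so range(31 + 1) yields exactly the same sequence as range(0, 31 + 1); the other change is just an added blank line. For instance:

    assert list(range(0, 5)) == list(range(5)) == [0, 1, 2, 3, 4]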

charset_normalizer/md.py: 41 changes (22 additions & 19 deletions)

@@ -170,15 +170,16 @@ def eligible(self, character: str) -> bool:

     def feed(self, character: str) -> None:
         self._character_count += 1
-        if self._last_latin_character is not None:
-            if is_accentuated(character) and is_accentuated(self._last_latin_character):
-                if character.isupper() and self._last_latin_character.isupper():
-                    self._successive_count += 1
-                # Worse if its the same char duplicated with different accent.
-                if remove_accent(character) == remove_accent(
-                    self._last_latin_character
-                ):
-                    self._successive_count += 1
+        if (
+            self._last_latin_character is not None
+            and is_accentuated(character)
+            and is_accentuated(self._last_latin_character)
+        ):
+            if character.isupper() and self._last_latin_character.isupper():
+                self._successive_count += 1
+            # Worse if its the same char duplicated with different accent.
+            if remove_accent(character) == remove_accent(self._last_latin_character):
+                self._successive_count += 1
         self._last_latin_character = character

     def reset(self) -> None:  # pragma: no cover

@@ -346,7 +347,7 @@ def eligible(self, character: str) -> bool:
         return True

     def feed(self, character: str) -> None:
-        if character in ["丅", "丄"]:
+        if character in {"丅", "丄"}:
             self._wrong_stop_count += 1
             return
         if is_cjk(character):

@@ -459,9 +460,10 @@ def is_suspiciously_successive_range(

     # Latin characters can be accompanied with a combining diacritical mark
     # eg. Vietnamese.
-    if "Latin" in unicode_range_a or "Latin" in unicode_range_b:
-        if "Combining" in unicode_range_a or "Combining" in unicode_range_b:
-            return False
+    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
+        "Combining" in unicode_range_a or "Combining" in unicode_range_b
+    ):
+        return False

     keywords_range_a, keywords_range_b = unicode_range_a.split(
         " "

@@ -482,11 +484,12 @@ def is_suspiciously_successive_range(
         ),
         unicode_range_b in ("Hiragana", "Katakana"),
     )
-    if range_a_jp_chars or range_b_jp_chars:
-        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
-            return False
-        if range_a_jp_chars and range_b_jp_chars:
-            return False
+    if (range_a_jp_chars or range_b_jp_chars) and (
+        "CJK" in unicode_range_a or "CJK" in unicode_range_b
+    ):
+        return False
+    if range_a_jp_chars and range_b_jp_chars:
+        return False

     if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
         if "CJK" in unicode_range_a or "CJK" in unicode_range_b:

@@ -530,7 +533,7 @@ def mess_ratio(
     else:
         intermediary_mean_mess_ratio_calc = 128

-    for character, index in zip(decoded_sequence + "\n", range(0, length)):
+    for character, index in zip(decoded_sequence + "\n", range(length)):
         for detector in detectors:
             if detector.eligible(character):
                 detector.feed(character)
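
Two patterns recur in md.py: nested if statements are merged into a single condition joined with "and", and membership tests against short literal lists become set literals. Behaviour is unchanged; a compact sketch with a stand-in helper (not the plugin classes themselves):

    WRONG_STOPS = {"丅", "丄"}  # set literal instead of a list for membership tests

    def is_suspicious_pair(range_a: str, range_b: str) -> bool:
        # One combined condition instead of two nested ifs.
        if ("Latin" in range_a or "Latin" in range_b) and (
            "Combining" in range_a or "Combining" in range_b
        ):
            return False
        return True

    assert "丅" in WRONG_STOPS
    assert is_suspicious_pair("Basic Latin", "Combining Diacritical Marks") is False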

charset_normalizer/models.py: 3 changes (1 addition & 2 deletions)

@@ -284,8 +284,7 @@ def __init__(self, results: List[CharsetMatch] = None):
         self._results = sorted(results) if results else []  # type: List[CharsetMatch]

     def __iter__(self) -> Iterator[CharsetMatch]:
-        for result in self._results:
-            yield result
+        yield from self._results

     def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
         """
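
yield from delegates iteration to the underlying list and is equivalent to the removed for/yield loop when iterating a plain sequence. Illustrated with a stand-in class (not CharsetMatches itself):

    from typing import Iterator, List

    class Matches:
        def __init__(self, results: List[str]) -> None:
            self._results = results

        def __iter__(self) -> Iterator[str]:
            # Same behaviour as: for result in self._results: yield result
            yield from self._results

    assert list(Matches(["utf_8", "latin_1"])) == ["utf_8", "latin_1"]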

charset_normalizer/utils.py: 18 changes (6 additions & 12 deletions)

@@ -122,7 +122,7 @@ def is_emoticon(character: str) -> bool:

 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_separator(character: str) -> bool:
-    if character.isspace() or character in ["|", "+", ",", ";", "<", ">"]:
+    if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
         return True

     character_category = unicodedata.category(character)  # type: str

@@ -138,7 +138,7 @@ def is_case_variable(character: str) -> bool:
 def is_private_use_only(character: str) -> bool:
     character_category = unicodedata.category(character)  # type: str

-    return "Co" == character_category
+    return character_category == "Co"


 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)

@@ -193,11 +193,7 @@ def is_thai(character: str) -> bool:

 @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
 def is_unicode_range_secondary(range_name: str) -> bool:
-    for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
-        if keyword in range_name:
-            return True
-
-    return False
+    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


 def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:

@@ -211,9 +207,7 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional

     results = findall(
         RE_POSSIBLE_ENCODING_INDICATION,
-        sequence[: seq_len if seq_len <= search_zone else search_zone].decode(
-            "ascii", errors="ignore"
-        ),
+        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
     )  # type: List[str]

     if len(results) == 0:

@@ -278,7 +272,7 @@ def iana_name(cp_name: str, strict: bool = True) -> str:
     cp_name = cp_name.lower().replace("-", "_")

     for encoding_alias, encoding_iana in aliases.items():
-        if cp_name == encoding_alias or cp_name == encoding_iana:
+        if cp_name in [encoding_alias, encoding_iana]:
             return encoding_iana

     if strict:

@@ -314,7 +308,7 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:

     character_match_count = 0  # type: int

-    for i in range(0, 255):
+    for i in range(255):
         to_be_decoded = bytes([i])  # type: bytes
         if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
             character_match_count += 1
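
utils.py collects several of the same rules: a for/if/return-True scan becomes any(), a manual bound on a slice becomes min(), the Yoda comparison "Co" == character_category is flipped, and two equality checks chained with "or" become a membership test. A compact, self-contained illustration with made-up inputs:

    SECONDARY_KEYWORDS = ["Supplement", "Extended", "Presentation Forms"]

    def is_secondary(range_name: str) -> bool:
        # any() replaces the explicit loop with an early return.
        return any(keyword in range_name for keyword in SECONDARY_KEYWORDS)

    sequence = b"x" * 10_000
    head = sequence[: min(len(sequence), 4096)]  # min() caps the slice bound

    cp_name, encoding_alias, encoding_iana = "u8", "u8", "utf_8"
    matches = cp_name in [encoding_alias, encoding_iana]  # instead of two == chained with or

    assert is_secondary("Latin Extended-A")
    assert len(head) == 4096
    assert matches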
