Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Re-use decoded buffer for short texts #175

Merged
merged 9 commits into from
Jun 18, 2022
144 changes: 90 additions & 54 deletions charset_normalizer/api.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
from collections.abc import Iterable
from os.path import basename, splitext
from typing import BinaryIO, List, Optional, Set
from typing import BinaryIO, Generator, List, Optional, Set

try:
from os import PathLike
Expand Down Expand Up @@ -34,6 +35,65 @@
)


def cut_sequence_chunks(
    sequences: bytes,
    length: int,
    encoding_iana: str,
    decoded_payload: Optional[str],
    iterator: Iterable,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
) -> Generator[str, None, None]:
    """Yield decoded text chunks taken at the offsets produced by ``iterator``.

    Two strategies are used:

    * Fast path: when an already decoded payload is supplied and the decoder
      is single-byte, the chunks are sliced straight out of
      ``decoded_payload`` and nothing is decoded again. Iteration stops at
      the first empty slice.
    * Byte path: otherwise each chunk is cut from ``sequences`` and decoded
      with ``encoding_iana``. The fast path is not used for multi-byte
      decoders because the offsets address bytes, which do not line up with
      character positions once a character spans several bytes.

    :param sequences: Raw bytes to cut chunks from.
    :param length: Reference length; an offset is skipped when
        ``offset + chunk_size`` exceeds ``length + 8``.
    :param encoding_iana: Codec name passed to ``bytes.decode``.
    :param decoded_payload: Previously decoded text, or ``None``. Also used
        as the reference when re-aligning a badly cut multi-byte chunk.
    :param iterator: Iterable of integer start offsets.
    :param chunk_size: Size of every chunk (bytes on the byte path,
        characters on the fast path).
    :param bom_or_sig_available: Whether a BOM/signature was found.
    :param strip_sig_or_bom: Whether that BOM/signature must be left out.
    :param sig_payload: BOM/signature bytes, prepended to each cut when one
        is available and must not be stripped.
    :param is_multi_byte_decoder: Whether the codec is multi-byte. Such
        codecs decode with ``errors="ignore"``; others decode strictly.
    :raises UnicodeDecodeError: On the byte path with a single-byte decoder
        when a cut cannot be decoded.
    """
    if decoded_payload and is_multi_byte_decoder is False:
        for i in iterator:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
        return

    # Both values are loop-invariant, compute them once.
    keep_sig = bom_or_sig_available and strip_sig_or_bom is False
    decode_errors = "ignore" if is_multi_byte_decoder else "strict"

    for i in iterator:
        chunk_end = i + chunk_size
        if chunk_end > length + 8:
            continue

        cut_sequence = sequences[i:chunk_end]
        if keep_sig:
            cut_sequence = sig_payload + cut_sequence

        chunk = cut_sequence.decode(encoding_iana, errors=decode_errors)

        # multi-byte bad cutting detector and adjustment
        # not the cleanest way to perform that fix but clever enough for now.
        # It needs the decoded payload as a reference, so it is skipped when
        # none was provided.
        if (
            is_multi_byte_decoder
            and decoded_payload
            and i > 0
            and sequences[i] >= 0x80
        ):
            chunk_partial_size_chk = min(chunk_size, 16)

            if chunk[:chunk_partial_size_chk] not in decoded_payload:
                # Walk the start back by up to three bytes, never below 0:
                # a negative index would wrap around to the end of the
                # buffer instead of moving backwards.
                for j in range(i, max(i - 4, -1), -1):
                    cut_sequence = sequences[j:chunk_end]

                    if keep_sig:
                        cut_sequence = sig_payload + cut_sequence

                    chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                    if chunk[:chunk_partial_size_chk] in decoded_payload:
                        break

        yield chunk


def from_bytes(
sequences: bytes,
steps: int = 5,
Expand Down Expand Up @@ -285,63 +345,39 @@ def from_bytes(
md_chunks = [] # type: List[str]
md_ratios = []

for i in r_:
if i + chunk_size > length + 8:
continue

cut_sequence = sequences[i : i + chunk_size]

if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence

try:
chunk = cut_sequence.decode(
encoding_iana,
errors="ignore" if is_multi_byte_decoder else "strict",
) # type: str
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
logger.log(
TRACE,
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
early_stop_count = max_chunk_gave_up
lazy_str_hard_failure = True
break
try:
for chunk in cut_sequence_chunks(
Ousret marked this conversation as resolved.
Show resolved Hide resolved
sequences,
length,
encoding_iana,
decoded_payload,
r_,
chunk_size,
bom_or_sig_available,
strip_sig_or_bom,
sig_payload,
is_multi_byte_decoder,
):
md_chunks.append(chunk)

# multi-byte bad cutting detector and adjustment
# not the cleanest way to perform that fix but clever enough for now.
if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
md_ratios.append(mess_ratio(chunk, threshold))

chunk_partial_size_chk = min(chunk_size, 16) # type: int
if md_ratios[-1] >= threshold:
early_stop_count += 1

if (
decoded_payload
and chunk[:chunk_partial_size_chk] not in decoded_payload
if (early_stop_count >= max_chunk_gave_up) or (
bom_or_sig_available and strip_sig_or_bom is False
):
for j in range(i, i - 4, -1):
cut_sequence = sequences[j : i + chunk_size]

if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence

chunk = cut_sequence.decode(encoding_iana, errors="ignore")

if chunk[:chunk_partial_size_chk] in decoded_payload:
break

md_chunks.append(chunk)

md_ratios.append(mess_ratio(chunk, threshold))

if md_ratios[-1] >= threshold:
early_stop_count += 1

if (early_stop_count >= max_chunk_gave_up) or (
bom_or_sig_available and strip_sig_or_bom is False
):
break
break
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
logger.log(
TRACE,
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
early_stop_count = max_chunk_gave_up
lazy_str_hard_failure = True

# We might want to check the sequence again with the whole content
# Only if initial MD tests passes
Expand Down
204 changes: 204 additions & 0 deletions data/sample-polish.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
"source";"target"
Ousret marked this conversation as resolved.
Show resolved Hide resolved
"REF.-2";"POLISH"
"KW-P00-01";"SYSTEM VIDEODOMOFONOWY MEET"
"KW-P00-02";"URZĄDZENIE"
"KW-P00-03";"OGÓLNE"
"KW-P00-04";"SIEĆ"
"KW-P00-05";"KD"
"KW-P00-06";"ROZP. TWARZY."
"KW-P00-07";"KAMERY IP"
"KW-P00-08";"SIP"
"KW-P00-09";"SIP TRUNK"
"KW-P00-10";"PRZEKIEROWANIA"
"KW-P00-11";"ZAAWANSOWANE"
"KW-P00-12";"KOD PIN"
"KW-P00-13";"WECHAT QR"
"KW-P00-14";"PRZYWRACAĆ"
"KW-P00-16";"WINDA"
"KW-P01-01";"INFORMACJE O URZĄDZENIU"
"KW-P01-02";"PANEL VIDEO FOOBAR KIN"
"KW-P01-03";"FIRMWARE: V02.10"
"KW-P01-04";"URZĄDZENIE: PANEL BLOKOWY-CYFROWY 001-02"
"KW-P01-05";"URZĄDZENIE: PANEL BLOKOWY PRZYCISKI 020-02"
"KW-P01-06";"URZĄDZENIE: PANEL GŁÓWNY 01"
"KW-P01-07";"URZĄDZENIE: PANEL 1W 006-0102-01"
"KW-P01-08";"NUMER SERYJNY:"
"KW-P01-09";"MAC:"
"KW-P01-10";"IP:"
"KW-P01-11";"COPYRIGHT © FOOBAR "
"KW-P01-12";"www.example.com"
"KW-P02-01";"USTAWIENIA GŁÓWNE"
"KW-P02-02";"TYP:"
"KW-P02-03";"PANEL GŁÓWNY"
"KW-P02-04";"CYFROWY P. BLOKOWY"
"KW-P02-05";"P. BLOK. PRZYCISKI"
"KW-P02-06";"PANEL 1NR"
"KW-P02-07";"BLOK:"
"KW-P02-08";"LOKAL:"
"KW-P02-09";"MONIT WYŚWIETLACZA:"
"KW-P02-10";"THIS INTERFACE IS NOT ENABLED"
"KW-P02-11";"NUMER PANELU:"
"KW-P02-12";"NAZWA URZĄDZENIA:"
"KW-P02-13";"(≤16 ZNAKÓW)"
"KW-P02-14";"JĘZYK:"
"KW-P02-15";"ENGLISH"
"KW-P02-16";"中文"
"KW-P02-17";"ESPAÑOL"
"KW-P02-18";"РУССКИЙ"
"KW-P02-19";"DEUTSCH"
"KW-P02-20";"TÜRKÇE"
"KW-P02-21";"POLSKI"
"KW-P02-22";"עברית"
"KW-P02-23";"FRANÇAIS"
"KW-P02-24";"فارسی"
"KW-P02-25";"GŁOŚNOŚĆ PANELU:"
"KW-P02-26";"JASNOŚĆ"
"KW-P02-27";"ROZDZIELCZOŚĆ VIDEO:"
"KW-P02-28";"TRYB PRZEKIEROWANIA SIP:"
"KW-P02-29";"SEKWENCYJNE"
"KW-P02-30";"JEDNOCZESNE"
"KW-P02-31";"PORTIER:"
"KW-P02-32";"PORTIERNIA 1:"
"KW-P02-33";"PORTIERNIA 2:"
"KW-P02-34";"USTAW. DATY I CZASU"
"KW-P02-35";"FORMAT DATY:"
"KW-P02-36";"DATA:"
"KW-P02-37";"CZAS:"
"KW-P02-38";"STREFA CZASOWA:"
"KW-P02-39";"ZAPISZ"
"KW-P02-40";"BŁĘDNE DANE"
"KW-P02-41";"KLAWIATURA ALFANUM.:"
"KW-P02-42";"KOMUNIKAT OTWARCIA DRZWI:"
"KW-P02-43";"WYGASZACZ EKRANU:"
"KW-P02-44";"WSPARCIE:"
"KW-P02-45";"OCZEKIWANIE"
"KW-P02-46";"POŁĄCZENIE"
"KW-P02-47";"WSPARCIE"
"KW-P02-48";"lista"
"KW-P02-49";"DST:"
"KW-P02-57";"TŁO:"
"KW-P02-58";"CIEMNE"
"KW-P02-59";"JASNE"
"KW-P02-60";"IMPORT"
"KW-P02-61";"EKSPORT"
"KW-P02-62";"USUŃ"
"KW-P02-63";"WYBIERZ PRAWIDŁOWY PLIK PNG"
"KW-P02-64";"IMPORTUJ"
"KW-P02-65";"WYSYŁANIE ZAKOŃCZONE"
"KW-P02-66";"BRAK OBRAZU"
"KW-P02-67";"USUNIĘTE"
"KW-P02-68";"BŁĄD USUWANIA"
"KW-P03-01";"USTAWIENIA SIECI"
"KW-P03-02";"IP:"
"KW-P03-03";"MASKA:"
"KW-P03-04";"BRAMA:"
"KW-P03-05";"DNS:"
"KW-P03-06";"SOFTWARE IP:"
"KW-P03-07";"SW. PIN:"
"KW-P03-08";"ZAPISZ"
"KW-P04-01";"USTAWIENIA KONTROLI DOSTĘPU"
"KW-P04-02";"PRZYCISK EGRESS:"
"KW-P04-03";"CZAS ELEKTROZACZEPU:"
"KW-P04-04";"CZAS KONTAKTRONU:"
"KW-P04-05";"REF.1491 4 RELAY:"
"KW-P04-06";"CZAS ELEKTROZACZEPU:"
"KW-P04-07";"CZAS KONTAKTRONU:"
"KW-P04-08";"KARTA ADMINISTRATORA:"
"KW-P04-09";"ROZBRAJANIE KARTĄ:"
"KW-P04-10";"MONITY KART:"
"KW-P04-11";"KOD GOŚCIA:"
"KW-P04-12";"KOD DOSTĘPU:"
"KW-P04-13";"#1"
"KW-P04-14";"#2"
"KW-P04-15";"#3"
"KW-P04-16";"#4"
"KW-P04-17";"ALARM DRZWI"
"KW-P04-18";"GWAŁTOWNY ALARM OTWARCIA"
"KW-P04-19";"WIEGAND:"
"KW-P04-20";"BURST"
"KW-P04-21";"26-BIT"
"KW-P04-22";"FACILITY:"
"KW-P04-24";"ZAPISZ"
"KW-P04-25";"WYŁĄCZONY"
"KW-P04-26";"REF.1490 2 RELAY:"
"KW-P04-27";"KOD QR:"
"KW-P04-28";"WIEGAND:"
"KW-P04-29";"26-BIT"
"KW-P04-30";"34-BIT"
"KW-P04-31";"KOD MIEJSCA:"
"KW-P04-32";"AUTO AKTYWACJA:"
"KW-P04-33";"BŁĘDNE DANE"
"KW-P05-01";"ROZPOZNAWANIE TWARZY"
"KW-P05-02";"ROZPOZNAWANIE TWARZY:"
"KW-P05-04";"MODEL:"
"KW-P05-05";"Wykrycie obecności:"
"KW-P05-06";"WŁĄCZONY"
"KW-P05-07";"WYŁĄCZONY"
"KW-P05-08";"PODOBIEŃSTWO:"
"KW-P05-09";"NISKIE"
"KW-P05-10";"ŚREDNIE"
"KW-P05-11";"WYSOKIE"
"KW-P05-12";"ZAPISZ"
"KW-P06-01";"USTAWIENIA KAMER IP"
"KW-P06-02";"ILOŚĆ KAMER:"
"KW-P06-03";"KAMERA"
"KW-P06-04";"URL:"
"KW-P06-05";"ZAPISZ"
"KW-P07-01";"USTAWIENIA SIP"
"KW-P07-02";"WŁĄCZ SIP:"
"KW-P07-03";"SPRAWDŹ STATUS SIP"
"KW-P07-04";"SIP ZAREJESTROWANY"
"KW-P07-05";"BŁĄD REJESTRACJI SIP"
"KW-P07-06";"SERWER SIP:"
"KW-P07-07";"DOMENA:"
"KW-P07-08";"OUTBOUND:"
"KW-P07-09";"STUN IP:"
"KW-P07-10";"PORT STUN:"
"KW-P07-11";"H.264:"
"KW-P07-12";"UŻYTKOWNIK SIP:"
"KW-P07-13";"HASŁO SIP:"
"KW-P07-14";"CZAS ROZMOWY:"
"KW-P07-15";"CZAS DZWONIENIA:"
"KW-P07-16";"ZAPISZ"
"KW-P08-01";"USTAWIENIA SIP TRUNK"
"KW-P08-02";"WŁĄCZ SIP TRUNK:"
"KW-P08-03";"URL:"
"KW-P08-04";"ZAPISZ"
"KW-P09-01";"USTAWIENIA PRZEKIEROWAŃ"
"KW-P09-02";"IMPORT"
"KW-P09-03";"EKSPORT"
"KW-P09-04";"APARTAMENT"
"KW-P09-05";"NUMER"
"KW-P10-01";"USTAWIENIA ZAAWANSOWANE"
"KW-P10-02";"SZYBKIE WYBIERANIE:"
"KW-P10-03";"URL:"
"KW-P10-04";"ONU:"
"KW-P10-05";"MAPOWANIE POŁĄCZEŃ:"
"KW-P10-06";"BIAŁA LISTA:"
"KW-P10-07";"Lista telefoniczna:"
"KW-P10-08";"IMPORT"
"KW-P10-09";"EKSPORT"
"KW-P10-10";"IMPORTUJ"
"KW-P10-11";"WYSYŁANIE ZAKOŃCZONE"
"KW-P10-12";"UŻYJ WŁAŚCIWEGO PLIKU CSV."
"KW-P10-13";"OK"
"KW-P10-14";"ZAPISZ"
"KW-P11-01";"USTAWIENIA KODU PIN"
"KW-P11-02";"OBECNY PIN:"
"KW-P11-03";"NOWY PIN:"
"KW-P11-04";"POTWIERDŹ PIN:"
"KW-P11-05";"ZAPISZ"
"KW-P12-01";"WECHAT QR"
"KW-P12-02";"WŁĄCZ"
"KW-P12-03";"UUID:"
"KW-P12-04";"HASŁO:"
"KW-P12-05";"SERWER:"
"KW-P12-06";"WŁĄCZ CZYTNIK QR:"
"KW-P12-07";"STATUS:"
"KW-P12-08";"REJESTRACJA POMYŚLNIE"
"KW-P12-09";"REJESTRACJA NIE POWIODŁA SIĘ"
"KW-P12-10";"ZAPISZ"
"KW-P13-01";"PRZYWRACAĆ"
"KW-P13-02";"PRZYWRÓCIĆ USTAWIENIA FABRYCZNE"
"KW-P13-03";"POTWIERDZAĆ PRZYWRÓĆ USTAWIENIA FABRYCZNE?"
"KW-P13-04";"URZĄDZENIE REBOOT"
1 change: 1 addition & 0 deletions tests/test_full_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
('sample-turkish.txt', 'cp1254', 'Turkish'),
('sample-russian-2.txt', 'utf_8', 'Russian'),
('sample-russian.txt', 'mac_cyrillic', 'Russian'),
('sample-polish.txt', 'utf_8', 'Polish'),
]
)
def test_elementary_detection(
Expand Down