Skip to content

Commit

Permalink
Re-use decoded buffer for short texts
Browse files Browse the repository at this point in the history
This avoids issues with detecting string boundaries while improving
performance (the sequence is no longer decoded multiple times).

Fixes #174
  • Loading branch information
nijel committed Mar 24, 2022
1 parent a642fcb commit c201454
Show file tree
Hide file tree
Showing 3 changed files with 293 additions and 53 deletions.
141 changes: 88 additions & 53 deletions charset_normalizer/api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from collections.abc import Iterable
from os.path import basename, splitext
from typing import BinaryIO, Generator, List, Optional, Set

Expand Down Expand Up @@ -34,6 +35,64 @@
)


def cut_sequence_chunks(
    sequences: bytes,
    length: int,
    encoding_iana: str,
    decoded_payload: Optional[str],
    iterator: Iterable,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
) -> Generator[str, None, None]:
    """Yield decoded text chunks sampled at the offsets produced by ``iterator``.

    When ``decoded_payload`` is truthy, chunks are sliced straight out of that
    already-decoded string, so nothing is decoded again. Otherwise each chunk
    is cut from ``sequences`` and decoded with ``encoding_iana``.

    :param sequences: Raw bytes to sample when no decoded payload is usable.
    :param length: Byte length used by the bounds check; a chunk may overrun
        it by up to 8 bytes before its offset is skipped.
        NOTE(review): presumably ``len(sequences)`` -- confirm against caller.
    :param encoding_iana: Codec name passed to ``bytes.decode``.
    :param decoded_payload: Already-decoded text, or ``None``/empty to force
        decoding from ``sequences``.
    :param iterator: Iterable of integer start offsets.
    :param chunk_size: Size of each chunk (characters when slicing
        ``decoded_payload``, bytes when slicing ``sequences``).
    :param bom_or_sig_available: Whether a BOM/signature was detected.
    :param strip_sig_or_bom: Whether the BOM/signature is stripped; when it is
        available and *not* stripped, ``sig_payload`` is prepended to every
        byte slice before decoding.
    :param sig_payload: The BOM/signature bytes to prepend.
    :param is_multi_byte_decoder: Selects the decode error handler:
        ``"ignore"`` for multi-byte decoders, ``"strict"`` otherwise.
    :return: A generator of decoded ``str`` chunks.
    :raises UnicodeDecodeError: From strict decoding of a byte slice that does
        not fit ``encoding_iana``.
    """
    if decoded_payload:
        # Fast path: re-use the decoded buffer. The first empty slice means
        # the offset ran past the end of the text, so sampling stops there.
        for i in iterator:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
        return

    # Both values are loop-invariant, so compute them once.
    prepend_sig = bom_or_sig_available and strip_sig_or_bom is False
    decode_errors = "ignore" if is_multi_byte_decoder else "strict"

    for i in iterator:
        chunk_end = i + chunk_size
        if chunk_end > length + 8:
            continue

        cut_sequence = sequences[i:chunk_end]

        if prepend_sig:
            cut_sequence = sig_payload + cut_sequence

        # NOTE(review): the former "multi-byte bad cutting" re-alignment lived
        # here, but it was gated on a truthy ``decoded_payload``, which is
        # never the case on this path, so it could not run. Its only effect
        # was an IndexError from ``sequences[i]`` when an offset fell past the
        # end of the data. If re-alignment is wanted, the fast path above has
        # to be restricted so multi-byte decoders reach this loop with a
        # decoded payload.
        yield cut_sequence.decode(encoding_iana, errors=decode_errors)


def from_bytes(
sequences: bytes,
steps: int = 5,
Expand Down Expand Up @@ -285,63 +344,39 @@ def from_bytes(
md_chunks = [] # type: List[str]
md_ratios = []

for i in r_:
if i + chunk_size > length + 8:
continue

cut_sequence = sequences[i : i + chunk_size]

if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence

try:
chunk = cut_sequence.decode(
encoding_iana,
errors="ignore" if is_multi_byte_decoder else "strict",
) # type: str
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
logger.log(
TRACE,
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
early_stop_count = max_chunk_gave_up
lazy_str_hard_failure = True
break
try:
for chunk in cut_sequence_chunks(
sequences,
length,
encoding_iana,
decoded_payload,
r_,
chunk_size,
bom_or_sig_available,
strip_sig_or_bom,
sig_payload,
is_multi_byte_decoder,
):
md_chunks.append(chunk)

# multi-byte bad cutting detector and adjustment
# not the cleanest way to perform that fix but clever enough for now.
if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
md_ratios.append(mess_ratio(chunk, threshold))

chunk_partial_size_chk = min(chunk_size, 16) # type: int
if md_ratios[-1] >= threshold:
early_stop_count += 1

if (
decoded_payload
and chunk[:chunk_partial_size_chk] not in decoded_payload
if (early_stop_count >= max_chunk_gave_up) or (
bom_or_sig_available and strip_sig_or_bom is False
):
for j in range(i, i - 4, -1):
cut_sequence = sequences[j : i + chunk_size]

if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence

chunk = cut_sequence.decode(encoding_iana, errors="ignore")

if chunk[:chunk_partial_size_chk] in decoded_payload:
break

md_chunks.append(chunk)

md_ratios.append(mess_ratio(chunk, threshold))

if md_ratios[-1] >= threshold:
early_stop_count += 1

if (early_stop_count >= max_chunk_gave_up) or (
bom_or_sig_available and strip_sig_or_bom is False
):
break
break
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
logger.log(
TRACE,
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
early_stop_count = max_chunk_gave_up
lazy_str_hard_failure = True

# We might want to check the sequence again with the whole content
# Only if initial MD tests passes
Expand Down
204 changes: 204 additions & 0 deletions data/sample-polish.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
"source";"target"
"REF.-2";"POLISH"
"KW-P00-01";"SYSTEM VIDEODOMOFONOWY MEET"
"KW-P00-02";"URZĄDZENIE"
"KW-P00-03";"OGÓLNE"
"KW-P00-04";"SIEĆ"
"KW-P00-05";"KD"
"KW-P00-06";"ROZP. TWARZY."
"KW-P00-07";"KAMERY IP"
"KW-P00-08";"SIP"
"KW-P00-09";"SIP TRUNK"
"KW-P00-10";"PRZEKIEROWANIA"
"KW-P00-11";"ZAAWANSOWANE"
"KW-P00-12";"KOD PIN"
"KW-P00-13";"WECHAT QR"
"KW-P00-14";"PRZYWRACAĆ"
"KW-P00-16";"WINDA"
"KW-P01-01";"INFORMACJE O URZĄDZENIU"
"KW-P01-02";"PANEL VIDEO FOOBAR KIN"
"KW-P01-03";"FIRMWARE: V02.10"
"KW-P01-04";"URZĄDZENIE: PANEL BLOKOWY-CYFROWY 001-02"
"KW-P01-05";"URZĄDZENIE: PANEL BLOKOWY PRZYCISKI 020-02"
"KW-P01-06";"URZĄDZENIE: PANEL GŁÓWNY 01"
"KW-P01-07";"URZĄDZENIE: PANEL 1W 006-0102-01"
"KW-P01-08";"NUMER SERYJNY:"
"KW-P01-09";"MAC:"
"KW-P01-10";"IP:"
"KW-P01-11";"COPYRIGHT © FOOBAR "
"KW-P01-12";"www.example.com"
"KW-P02-01";"USTAWIENIA GŁÓWNE"
"KW-P02-02";"TYP:"
"KW-P02-03";"PANEL GŁÓWNY"
"KW-P02-04";"CYFROWY P. BLOKOWY"
"KW-P02-05";"P. BLOK. PRZYCISKI"
"KW-P02-06";"PANEL 1NR"
"KW-P02-07";"BLOK:"
"KW-P02-08";"LOKAL:"
"KW-P02-09";"MONIT WYŚWIETLACZA:"
"KW-P02-10";"THIS INTERFACE IS NOT ENABLED"
"KW-P02-11";"NUMER PANELU:"
"KW-P02-12";"NAZWA URZĄDZENIA:"
"KW-P02-13";"(≤16 ZNAKÓW)"
"KW-P02-14";"JĘZYK:"
"KW-P02-15";"ENGLISH"
"KW-P02-16";"中文"
"KW-P02-17";"ESPAÑOL"
"KW-P02-18";"РУССКИЙ"
"KW-P02-19";"DEUTSCH"
"KW-P02-20";"TÜRKÇE"
"KW-P02-21";"POLSKI"
"KW-P02-22";"עברית"
"KW-P02-23";"FRANÇAIS"
"KW-P02-24";"فارسی"
"KW-P02-25";"GŁOŚNOŚĆ PANELU:"
"KW-P02-26";"JASNOŚĆ"
"KW-P02-27";"ROZDZIELCZOŚĆ VIDEO:"
"KW-P02-28";"TRYB PRZEKIEROWANIA SIP:"
"KW-P02-29";"SEKWENCYJNE"
"KW-P02-30";"JEDNOCZESNE"
"KW-P02-31";"PORTIER:"
"KW-P02-32";"PORTIERNIA 1:"
"KW-P02-33";"PORTIERNIA 2:"
"KW-P02-34";"USTAW. DATY I CZASU"
"KW-P02-35";"FORMAT DATY:"
"KW-P02-36";"DATA:"
"KW-P02-37";"CZAS:"
"KW-P02-38";"STREFA CZASOWA:"
"KW-P02-39";"ZAPISZ"
"KW-P02-40";"BŁĘDNE DANE"
"KW-P02-41";"KLAWIATURA ALFANUM.:"
"KW-P02-42";"KOMUNIKAT OTWARCIA DRZWI:"
"KW-P02-43";"WYGASZACZ EKRANU:"
"KW-P02-44";"WSPARCIE:"
"KW-P02-45";"OCZEKIWANIE"
"KW-P02-46";"POŁĄCZENIE"
"KW-P02-47";"WSPARCIE"
"KW-P02-48";"lista"
"KW-P02-49";"DST:"
"KW-P02-57";"TŁO:"
"KW-P02-58";"CIEMNE"
"KW-P02-59";"JASNE"
"KW-P02-60";"IMPORT"
"KW-P02-61";"EKSPORT"
"KW-P02-62";"USUŃ"
"KW-P02-63";"WYBIERZ PRAWIDŁOWY PLIK PNG"
"KW-P02-64";"IMPORTUJ"
"KW-P02-65";"WYSYŁANIE ZAKOŃCZONE"
"KW-P02-66";"BRAK OBRAZU"
"KW-P02-67";"USUNIĘTE"
"KW-P02-68";"BŁĄD USUWANIA"
"KW-P03-01";"USTAWIENIA SIECI"
"KW-P03-02";"IP:"
"KW-P03-03";"MASKA:"
"KW-P03-04";"BRAMA:"
"KW-P03-05";"DNS:"
"KW-P03-06";"SOFTWARE IP:"
"KW-P03-07";"SW. PIN:"
"KW-P03-08";"ZAPISZ"
"KW-P04-01";"USTAWIENIA KONTROLI DOSTĘPU"
"KW-P04-02";"PRZYCISK EGRESS:"
"KW-P04-03";"CZAS ELEKTROZACZEPU:"
"KW-P04-04";"CZAS KONTAKTRONU:"
"KW-P04-05";"REF.1491 4 RELAY:"
"KW-P04-06";"CZAS ELEKTROZACZEPU:"
"KW-P04-07";"CZAS KONTAKTRONU:"
"KW-P04-08";"KARTA ADMINISTRATORA:"
"KW-P04-09";"ROZBRAJANIE KARTĄ:"
"KW-P04-10";"MONITY KART:"
"KW-P04-11";"KOD GOŚCIA:"
"KW-P04-12";"KOD DOSTĘPU:"
"KW-P04-13";"#1"
"KW-P04-14";"#2"
"KW-P04-15";"#3"
"KW-P04-16";"#4"
"KW-P04-17";"ALARM DRZWI"
"KW-P04-18";"GWAŁTOWNY ALARM OTWARCIA"
"KW-P04-19";"WIEGAND:"
"KW-P04-20";"BURST"
"KW-P04-21";"26-BIT"
"KW-P04-22";"FACILITY:"
"KW-P04-24";"ZAPISZ"
"KW-P04-25";"WYŁĄCZONY"
"KW-P04-26";"REF.1490 2 RELAY:"
"KW-P04-27";"KOD QR:"
"KW-P04-28";"WIEGAND:"
"KW-P04-29";"26-BIT"
"KW-P04-30";"34-BIT"
"KW-P04-31";"KOD MIEJSCA:"
"KW-P04-32";"AUTO AKTYWACJA:"
"KW-P04-33";"BŁĘDNE DANE"
"KW-P05-01";"ROZPOZNAWANIE TWARZY"
"KW-P05-02";"ROZPOZNAWANIE TWARZY:"
"KW-P05-04";"MODEL:"
"KW-P05-05";"Wykrycie obecności:"
"KW-P05-06";"WŁĄCZONY"
"KW-P05-07";"WYŁĄCZONY"
"KW-P05-08";"PODOBIEŃSTWO:"
"KW-P05-09";"NISKIE"
"KW-P05-10";"ŚREDNIE"
"KW-P05-11";"WYSOKIE"
"KW-P05-12";"ZAPISZ"
"KW-P06-01";"USTAWIENIA KAMER IP"
"KW-P06-02";"ILOŚĆ KAMER:"
"KW-P06-03";"KAMERA"
"KW-P06-04";"URL:"
"KW-P06-05";"ZAPISZ"
"KW-P07-01";"USTAWIENIA SIP"
"KW-P07-02";"WŁĄCZ SIP:"
"KW-P07-03";"SPRAWDŹ STATUS SIP"
"KW-P07-04";"SIP ZAREJESTROWANY"
"KW-P07-05";"BŁĄD REJESTRACJI SIP"
"KW-P07-06";"SERWER SIP:"
"KW-P07-07";"DOMENA:"
"KW-P07-08";"OUTBOUND:"
"KW-P07-09";"STUN IP:"
"KW-P07-10";"PORT STUN:"
"KW-P07-11";"H.264:"
"KW-P07-12";"UŻYTKOWNIK SIP:"
"KW-P07-13";"HASŁO SIP:"
"KW-P07-14";"CZAS ROZMOWY:"
"KW-P07-15";"CZAS DZWONIENIA:"
"KW-P07-16";"ZAPISZ"
"KW-P08-01";"USTAWIENIA SIP TRUNK"
"KW-P08-02";"WŁĄCZ SIP TRUNK:"
"KW-P08-03";"URL:"
"KW-P08-04";"ZAPISZ"
"KW-P09-01";"USTAWIENIA PRZEKIEROWAŃ"
"KW-P09-02";"IMPORT"
"KW-P09-03";"EKSPORT"
"KW-P09-04";"APARTAMENT"
"KW-P09-05";"NUMER"
"KW-P10-01";"USTAWIENIA ZAAWANSOWANE"
"KW-P10-02";"SZYBKIE WYBIERANIE:"
"KW-P10-03";"URL:"
"KW-P10-04";"ONU:"
"KW-P10-05";"MAPOWANIE POŁĄCZEŃ:"
"KW-P10-06";"BIAŁA LISTA:"
"KW-P10-07";"Lista telefoniczna:"
"KW-P10-08";"IMPORT"
"KW-P10-09";"EKSPORT"
"KW-P10-10";"IMPORTUJ"
"KW-P10-11";"WYSYŁANIE ZAKOŃCZONE"
"KW-P10-12";"UŻYJ WŁAŚCIWEGO PLIKU CSV."
"KW-P10-13";"OK"
"KW-P10-14";"ZAPISZ"
"KW-P11-01";"USTAWIENIA KODU PIN"
"KW-P11-02";"OBECNY PIN:"
"KW-P11-03";"NOWY PIN:"
"KW-P11-04";"POTWIERDŹ PIN:"
"KW-P11-05";"ZAPISZ"
"KW-P12-01";"WECHAT QR"
"KW-P12-02";"WŁĄCZ"
"KW-P12-03";"UUID:"
"KW-P12-04";"HASŁO:"
"KW-P12-05";"SERWER:"
"KW-P12-06";"WŁĄCZ CZYTNIK QR:"
"KW-P12-07";"STATUS:"
"KW-P12-08";"REJESTRACJA POMYŚLNIE"
"KW-P12-09";"REJESTRACJA NIE POWIODŁA SIĘ"
"KW-P12-10";"ZAPISZ"
"KW-P13-01";"PRZYWRACAĆ"
"KW-P13-02";"PRZYWRÓCIĆ USTAWIENIA FABRYCZNE"
"KW-P13-03";"POTWIERDZAĆ PRZYWRÓĆ USTAWIENIA FABRYCZNE?"
"KW-P13-04";"URZĄDZENIE REBOOT"
1 change: 1 addition & 0 deletions tests/test_full_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
('sample-turkish.txt', 'cp1254', 'Turkish'),
('sample-russian-2.txt', 'utf_8', 'Russian'),
('sample-russian.txt', 'mac_cyrillic', 'Russian'),
('sample-polish.txt', 'utf_8', 'Polish'),
]
)
def test_elementary_detection(
Expand Down

0 comments on commit c201454

Please sign in to comment.