Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Use whole file for encoding checks with charset_normalizer. #22887

Merged
merged 1 commit into from Dec 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
47 changes: 26 additions & 21 deletions numpy/f2py/crackfortran.py
Expand Up @@ -148,9 +148,9 @@
import platform
import codecs
try:
import chardet
import charset_normalizer
except ImportError:
chardet = None
charset_normalizer = None

from . import __version__

Expand Down Expand Up @@ -309,26 +309,31 @@ def getextension(name):
def openhook(filename, mode):
    """Open `filename` in text mode with an automatically detected encoding.

    When the optional charset_normalizer package is available, the whole
    file is analysed, so non-ASCII characters that first appear late in the
    file (for example inside a comment, gh-22871) are taken into account.
    When the package is not available, or when it cannot propose any
    encoding for the file, only a leading byte-order mark is inspected and
    ASCII is used as the last-resort fallback.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to open.
    mode : str
        Mode forwarded to the builtin ``open``. An ``encoding`` argument is
        always supplied, so this has to be a text mode.

    Returns
    -------
    file object
        The result of ``open(filename, mode, encoding=<detected encoding>)``.
    """
    encoding = None

    if charset_normalizer is not None:
        # Reads in the entire file. Robust detection of encoding.
        # Correctly handles comments or late stage unicode characters
        # gh-22871
        best_match = charset_normalizer.from_path(filename).best()
        # best() returns None when no candidate encoding fits the content;
        # in that case fall through to the BOM check below instead of
        # failing with AttributeError on `.encoding`.
        if best_match is not None:
            encoding = best_match.encoding

    if encoding is None:
        # hint: install charset_normalizer for correct encoding handling
        # No need to read the whole file for trying with startswith;
        # read() returns fewer bytes by itself when the file is shorter.
        with open(filename, 'rb') as fhandle:
            raw = fhandle.read(32)
        # The UTF-32 BOMs must be tested before the UTF-16 ones because
        # the UTF-32-LE BOM begins with the UTF-16-LE BOM.
        if raw.startswith(codecs.BOM_UTF8):
            encoding = 'UTF-8-SIG'
        elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
            encoding = 'UTF-32'
        elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)):
            encoding = 'UTF-16'
        else:
            # Fallback, without charset_normalizer
            encoding = 'ascii'

    return open(filename, mode, encoding=encoding)


Expand Down Expand Up @@ -394,7 +399,7 @@ def readfortrancode(ffile, dowithline=show, istop=1):
except UnicodeDecodeError as msg:
raise Exception(
f'readfortrancode: reading {fin.filename()}#{fin.lineno()}'
f' failed with\n{msg}.\nIt is likely that installing chardet'
f' failed with\n{msg}.\nIt is likely that installing charset_normalizer'
' package will help f2py determine the input file encoding'
' correctly.')
if not l:
Expand Down
4 changes: 4 additions & 0 deletions numpy/f2py/tests/src/crackfortran/unicode_comment.f90
@@ -0,0 +1,4 @@
! Test fixture: the comment inside this subroutine deliberately contains
! non-ASCII characters and must stay untranslated. In English it reads
! roughly "writes the value of x to the screen".
subroutine foo(x)
real(8), intent(in) :: x
! Écrit à l'écran la valeur de x
end subroutine
17 changes: 13 additions & 4 deletions numpy/f2py/tests/test_crackfortran.py
@@ -1,4 +1,6 @@
import importlib
import codecs
import unicodedata
import pytest
import numpy as np
from numpy.f2py.crackfortran import markinnerspaces
Expand Down Expand Up @@ -257,13 +259,20 @@ class TestFortranReader(util.F2PyTest):
def test_input_encoding(self, tmp_path, encoding):
    """Check that a source file written in `encoding` is read correctly.

    The file is written without an explicit byte-order mark: only what the
    codec named by `encoding` emits on its own ends up on disk.

    NOTE(review): `encoding` is presumably supplied by a parametrize
    decorator above this method, which is outside the visible hunk.
    """
    # gh-635
    f_path = tmp_path / f"input_with_{encoding}_encoding.f90"
    with f_path.open('w', encoding=encoding) as ff:
        ff.write("""
subroutine foo()
end subroutine foo
""")
    # The parsed result must expose the subroutine under its own name.
    mod = crackfortran.crackfortran([str(f_path)])
    assert mod[0]['name'] == 'foo'

class TestUnicodeComment(util.F2PyTest):
    """Regression test for a Fortran source with a non-ASCII comment.

    NOTE(review): util.F2PyTest is defined outside this view; it presumably
    builds `sources` and exposes the result as `self.module` - confirm.
    """

    # Fixture whose only non-ASCII characters are inside a comment line.
    sources = [
        util.getpath("tests", "src", "crackfortran", "unicode_comment.f90"),
    ]

    @pytest.mark.skipif(
        importlib.util.find_spec("charset_normalizer") is None,
        reason="test requires charset_normalizer which is not installed",
    )
    def test_encoding_comment(self):
        # Calling the wrapped routine is enough: the test passes as long as
        # the call itself does not raise.
        self.module.foo(3)
2 changes: 2 additions & 0 deletions test_requirements.txt
Expand Up @@ -12,3 +12,5 @@ cffi; python_version < '3.10'
# NOTE: Keep mypy in sync with environment.yml
mypy==0.981; platform_python_implementation != "PyPy"
typing_extensions>=4.2.0
# for optional f2py encoding detection
charset-normalizer