Skip to content

Commit

Permalink
Merge pull request #22887 from charris/backport-22872
Browse files Browse the repository at this point in the history
BUG: Use whole file for encoding checks with ``charset_normalizer``.
  • Loading branch information
charris committed Dec 26, 2022
2 parents 0f3484a + 48f5fe4 commit 6f491e0
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 25 deletions.
47 changes: 26 additions & 21 deletions numpy/f2py/crackfortran.py
Expand Up @@ -148,9 +148,9 @@
import platform
import codecs
try:
import chardet
import charset_normalizer
except ImportError:
chardet = None
charset_normalizer = None

from . import __version__

Expand Down Expand Up @@ -309,26 +309,31 @@ def getextension(name):
def openhook(filename, mode):
    """Ensures that filename is opened with correct encoding parameter.

    This function uses the charset_normalizer package, when available, to
    determine the encoding of the file to be opened. When charset_normalizer
    is not available, or when it cannot settle on any encoding, only UTF
    encodings announced by a byte-order mark are detected; ASCII is used as
    the final fallback.

    Parameters
    ----------
    filename : str
        Path of the file to open.
    mode : str
        Mode string forwarded unchanged to the built-in ``open``.

    Returns
    -------
    file object
        The result of ``open(filename, mode, encoding=<detected encoding>)``.
    """
    encoding = None
    if charset_normalizer is not None:
        # Reads in the entire file. Robust detection of encoding.
        # Correctly handles comments or late stage unicode characters
        # gh-22871
        best_match = charset_normalizer.from_path(filename).best()
        # best() yields None when no candidate encoding survives; fall
        # through to the BOM check instead of failing on `.encoding`.
        if best_match is not None:
            encoding = best_match.encoding

    if encoding is None:
        # hint: install charset_normalizer for correct encoding handling
        # No need to read the whole file for trying with startswith;
        # read() simply returns fewer bytes when the file is shorter.
        with open(filename, 'rb') as fhandle:
            raw = fhandle.read(32)
        if raw.startswith(codecs.BOM_UTF8):
            encoding = 'UTF-8-SIG'
        # UTF-32 must be tested before UTF-16: the UTF-32-LE BOM begins
        # with the UTF-16-LE BOM.
        elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
            encoding = 'UTF-32'
        elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)):
            encoding = 'UTF-16'
        else:
            # Fallback, without charset_normalizer
            encoding = 'ascii'
    return open(filename, mode, encoding=encoding)


Expand Down Expand Up @@ -394,7 +399,7 @@ def readfortrancode(ffile, dowithline=show, istop=1):
except UnicodeDecodeError as msg:
raise Exception(
f'readfortrancode: reading {fin.filename()}#{fin.lineno()}'
f' failed with\n{msg}.\nIt is likely that installing chardet'
f' failed with\n{msg}.\nIt is likely that installing charset_normalizer'
' package will help f2py determine the input file encoding'
' correctly.')
if not l:
Expand Down
4 changes: 4 additions & 0 deletions numpy/f2py/tests/src/crackfortran/unicode_comment.f90
@@ -0,0 +1,4 @@
! Fixture for the f2py encoding-detection test (unicode_comment.f90).
! The comment inside the subroutine contains non-ASCII characters; it
! appears to be the payload under test, so keep it byte-for-byte.
subroutine foo(x)
  real(8), intent(in) :: x
  ! Écrit à l'écran la valeur de x
  ! (English: "Writes the value of x to the screen"; no output statement
  ! is actually present in this subroutine.)
end subroutine
17 changes: 13 additions & 4 deletions numpy/f2py/tests/test_crackfortran.py
@@ -1,4 +1,6 @@
import importlib
import codecs
import unicodedata
import pytest
import numpy as np
from numpy.f2py.crackfortran import markinnerspaces
Expand Down Expand Up @@ -257,13 +259,20 @@ class TestFortranReader(util.F2PyTest):
def test_input_encoding(self, tmp_path, encoding):
# gh-635
f_path = tmp_path / f"input_with_{encoding}_encoding.f90"
# explicit BOM is required for UTF8
bom = {'utf-8': codecs.BOM_UTF8}.get(encoding, b'')
with f_path.open('w', encoding=encoding) as ff:
ff.write(bom.decode(encoding) +
"""
ff.write("""
subroutine foo()
end subroutine foo
""")
mod = crackfortran.crackfortran([str(f_path)])
assert mod[0]['name'] == 'foo'

class TestUnicodeComment(util.F2PyTest):
    # Regression test for gh-22871: builds a Fortran source whose comment
    # contains non-ASCII characters and calls the wrapped routine.
    # `self.module` is presumably the extension built by util.F2PyTest from
    # `sources` -- confirm against util.
    sources = [util.getpath("tests", "src", "crackfortran", "unicode_comment.f90")]

    # Skipped when charset_normalizer cannot be found.
    # NOTE(review): only `import importlib` is visible here; `importlib.util`
    # is a submodule, so this relies on it having been imported elsewhere.
    @pytest.mark.skipif(
        (importlib.util.find_spec("charset_normalizer") is None),
        reason="test requires charset_normalizer which is not installed",
    )
    def test_encoding_comment(self):
        # Passes as long as building the module and calling foo do not raise.
        self.module.foo(3)
2 changes: 2 additions & 0 deletions test_requirements.txt
Expand Up @@ -12,3 +12,5 @@ cffi; python_version < '3.10'
# NOTE: Keep mypy in sync with environment.yml
mypy==0.981; platform_python_implementation != "PyPy"
typing_extensions>=4.2.0
# for optional f2py encoding detection
charset-normalizer

0 comments on commit 6f491e0

Please sign in to comment.