diff --git a/numpy/f2py/crackfortran.py b/numpy/f2py/crackfortran.py index 27e257c4843f..84579fdccd23 100755 --- a/numpy/f2py/crackfortran.py +++ b/numpy/f2py/crackfortran.py @@ -148,9 +148,9 @@ import platform import codecs try: - import chardet + import charset_normalizer except ImportError: - chardet = None + charset_normalizer = None from . import __version__ @@ -309,26 +309,31 @@ def getextension(name): def openhook(filename, mode): """Ensures that filename is opened with correct encoding parameter. - This function uses chardet package, when available, for - determining the encoding of the file to be opened. When chardet is - not available, the function detects only UTF encodings, otherwise, - ASCII encoding is used as fallback. + This function uses charset_normalizer package, when available, for + determining the encoding of the file to be opened. When charset_normalizer + is not available, the function detects only UTF encodings, otherwise, ASCII + encoding is used as fallback. """ - bytes = min(32, os.path.getsize(filename)) - with open(filename, 'rb') as f: - raw = f.read(bytes) - if raw.startswith(codecs.BOM_UTF8): - encoding = 'UTF-8-SIG' - elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)): - encoding = 'UTF-32' - elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)): - encoding = 'UTF-16' + # Reads in the entire file. Robust detection of encoding. + # Correctly handles comments or late stage unicode characters + # gh-22871 + if charset_normalizer is not None: + encoding = charset_normalizer.from_path(filename).best().encoding else: - if chardet is not None: - encoding = chardet.detect(raw)['encoding'] - else: - # hint: install chardet to ensure correct encoding handling - encoding = 'ascii' + # hint: install charset_normalizer for correct encoding handling + # No need to read the whole file for trying with startswith + nbytes = min(32, os.path.getsize(filename)) + with open(filename, 'rb') as fhandle: + raw = fhandle.read(nbytes) + if raw.startswith(codecs.BOM_UTF8): + encoding = 'UTF-8-SIG' + elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)): + encoding = 'UTF-32' + elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)): + encoding = 'UTF-16' + else: + # Fallback, without charset_normalizer + encoding = 'ascii' return open(filename, mode, encoding=encoding) @@ -394,7 +399,7 @@ def readfortrancode(ffile, dowithline=show, istop=1): except UnicodeDecodeError as msg: raise Exception( f'readfortrancode: reading {fin.filename()}#{fin.lineno()}' - f' failed with\n{msg}.\nIt is likely that installing chardet' + f' failed with\n{msg}.\nIt is likely that installing charset_normalizer' ' package will help f2py determine the input file encoding' ' correctly.') if not l: diff --git a/numpy/f2py/tests/src/crackfortran/unicode_comment.f90 b/numpy/f2py/tests/src/crackfortran/unicode_comment.f90 new file mode 100644 index 000000000000..13515ce98c50 --- /dev/null +++ b/numpy/f2py/tests/src/crackfortran/unicode_comment.f90 @@ -0,0 +1,4 @@ +subroutine foo(x) + real(8), intent(in) :: x + ! Écrit à l'écran la valeur de x +end subroutine diff --git a/numpy/f2py/tests/test_crackfortran.py b/numpy/f2py/tests/test_crackfortran.py index dcf8760db356..73ac4e2760cd 100644 --- a/numpy/f2py/tests/test_crackfortran.py +++ b/numpy/f2py/tests/test_crackfortran.py @@ -1,4 +1,6 @@ +import importlib import codecs +import unicodedata import pytest import numpy as np from numpy.f2py.crackfortran import markinnerspaces @@ -257,13 +259,20 @@ class TestFortranReader(util.F2PyTest): def test_input_encoding(self, tmp_path, encoding): # gh-635 f_path = tmp_path / f"input_with_{encoding}_encoding.f90" - # explicit BOM is required for UTF8 - bom = {'utf-8': codecs.BOM_UTF8}.get(encoding, b'') with f_path.open('w', encoding=encoding) as ff: - ff.write(bom.decode(encoding) + - """ + ff.write(""" subroutine foo() end subroutine foo """) mod = crackfortran.crackfortran([str(f_path)]) assert mod[0]['name'] == 'foo' + +class TestUnicodeComment(util.F2PyTest): + sources = [util.getpath("tests", "src", "crackfortran", "unicode_comment.f90")] + + @pytest.mark.skipif( + (importlib.util.find_spec("charset_normalizer") is None), + reason="test requires charset_normalizer which is not installed", + ) + def test_encoding_comment(self): + self.module.foo(3) diff --git a/test_requirements.txt b/test_requirements.txt index 3e7d3fef79dd..67b6a48667b8 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -12,3 +12,5 @@ cffi; python_version < '3.10' # NOTE: Keep mypy in sync with environment.yml mypy==0.981; platform_python_implementation != "PyPy" typing_extensions>=4.2.0 +# for optional f2py encoding detection +charset-normalizer