From 93852765222ba7b408a83de9d755924df627fc81 Mon Sep 17 00:00:00 2001 From: Tzu-ping Chung Date: Sat, 27 Aug 2022 05:59:16 +0800 Subject: [PATCH] Refactor ELF parsing logic to standalone class (#553) --- packaging/_elffile.py | 108 ++++++++++++++++++++++++++++ packaging/_manylinux.py | 153 ++++++++++++---------------------------- packaging/_musllinux.py | 74 +++---------------- tests/test_elffile.py | 100 ++++++++++++++++++++++++++ tests/test_manylinux.py | 81 ++------------------- tests/test_musllinux.py | 69 +----------------- 6 files changed, 266 insertions(+), 319 deletions(-) create mode 100644 packaging/_elffile.py create mode 100644 tests/test_elffile.py diff --git a/packaging/_elffile.py b/packaging/_elffile.py new file mode 100644 index 00000000..9fb5984b --- /dev/null +++ b/packaging/_elffile.py @@ -0,0 +1,108 @@ +""" +ELF file parser. + +This provides a class ``ELFFile`` that parses an ELF executable in a similar +interface to ``ZipFile``. Only the read interface is implemented. + +Based on: https://gist.github.com/lyssdod/f51579ae8d93c8657a5564aefc2ffbca +ELF header: https://refspecs.linuxfoundation.org/elf/gabi4+/ch4.eheader.html +""" + +import enum +import os +import struct +from typing import IO, Optional, Tuple + + +class ELFInvalid(ValueError): + pass + + +class EIClass(enum.IntEnum): + C32 = 1 + C64 = 2 + + +class EIData(enum.IntEnum): + Lsb = 1 + Msb = 2 + + +class EMachine(enum.IntEnum): + I386 = 3 + S390 = 22 + Arm = 40 + X8664 = 62 + AArc64 = 183 + + +class ELFFile: + """ + Representation of an ELF executable. + """ + + def __init__(self, f: IO[bytes]) -> None: + self._f = f + + try: + ident = self._read("16B") + except struct.error: + raise ELFInvalid("unable to parse identification") + magic = bytes(ident[:4]) + if magic != b"\x7fELF": + raise ELFInvalid(f"invalid magic: {magic!r}") + + self.capacity = ident[4] # Format for program header (bitness). + self.encoding = ident[5] # Data structure encoding (endianess). + + try: + # e_fmt: Format for program header. + # p_fmt: Format for section header. + # p_idx: Indexes to find p_type, p_offset, and p_filesz. + e_fmt, self._p_fmt, self._p_idx = { + (1, 1): ("HHIIIIIHHH", ">IIIIIIII", (0, 1, 4)), # 32-bit MSB. + (2, 1): ("HHIQQQIHHH", ">IIQQQQQQ", (0, 2, 5)), # 64-bit MSB. + }[(self.capacity, self.encoding)] + except KeyError: + raise ELFInvalid( + f"unrecognized capacity ({self.capacity}) or " + f"encoding ({self.encoding})" + ) + + try: + ( + _, + self.machine, # Architecture type. + _, + _, + self._e_phoff, # Offset of program header. + _, + self.flags, # Processor-specific flags. + _, + self._e_phentsize, # Size of section. + self._e_phnum, # Number of sections. + ) = self._read(e_fmt) + except struct.error as e: + raise ELFInvalid("unable to parse machine and section information") from e + + def _read(self, fmt: str) -> Tuple[int, ...]: + return struct.unpack(fmt, self._f.read(struct.calcsize(fmt))) + + @property + def interpreter(self) -> Optional[str]: + """ + The path recorded in the ``PT_INTERP`` section header. + """ + for index in range(self._e_phnum): + self._f.seek(self._e_phoff + self._e_phentsize * index) + try: + data = self._read(self._p_fmt) + except struct.error: + continue + if data[self._p_idx[0]] != 3: # Not PT_INTERP. + continue + self._f.seek(data[self._p_idx[1]]) + return os.fsdecode(self._f.read(data[self._p_idx[2]])).strip("\0") + return None diff --git a/packaging/_manylinux.py b/packaging/_manylinux.py index 4c379aa6..2f0cc743 100644 --- a/packaging/_manylinux.py +++ b/packaging/_manylinux.py @@ -1,121 +1,58 @@ import collections +import contextlib import functools import os import re -import struct import sys import warnings -from typing import IO, Dict, Iterator, NamedTuple, Optional, Tuple - - -# Python does not provide platform information at sufficient granularity to -# identify the architecture of the running executable in some cases, so we -# determine it dynamically by reading the information from the running -# process. This only applies on Linux, which uses the ELF format. -class _ELFFileHeader: - # https://en.wikipedia.org/wiki/Executable_and_Linkable_Format#File_header - class _InvalidELFFileHeader(ValueError): - """ - An invalid ELF file header was found. - """ - - ELF_MAGIC_NUMBER = 0x7F454C46 - ELFCLASS32 = 1 - ELFCLASS64 = 2 - ELFDATA2LSB = 1 - ELFDATA2MSB = 2 - EM_386 = 3 - EM_S390 = 22 - EM_ARM = 40 - EM_X86_64 = 62 - EF_ARM_ABIMASK = 0xFF000000 - EF_ARM_ABI_VER5 = 0x05000000 - EF_ARM_ABI_FLOAT_HARD = 0x00000400 - - def __init__(self, file: IO[bytes]) -> None: - def unpack(fmt: str) -> int: - try: - data = file.read(struct.calcsize(fmt)) - result: Tuple[int, ...] = struct.unpack(fmt, data) - except struct.error: - raise _ELFFileHeader._InvalidELFFileHeader() - return result[0] - - self.e_ident_magic = unpack(">I") - if self.e_ident_magic != self.ELF_MAGIC_NUMBER: - raise _ELFFileHeader._InvalidELFFileHeader() - self.e_ident_class = unpack("B") - if self.e_ident_class not in {self.ELFCLASS32, self.ELFCLASS64}: - raise _ELFFileHeader._InvalidELFFileHeader() - self.e_ident_data = unpack("B") - if self.e_ident_data not in {self.ELFDATA2LSB, self.ELFDATA2MSB}: - raise _ELFFileHeader._InvalidELFFileHeader() - self.e_ident_version = unpack("B") - self.e_ident_osabi = unpack("B") - self.e_ident_abiversion = unpack("B") - self.e_ident_pad = file.read(7) - format_h = "H" - format_i = "I" - format_q = "Q" - format_p = format_i if self.e_ident_class == self.ELFCLASS32 else format_q - self.e_type = unpack(format_h) - self.e_machine = unpack(format_h) - self.e_version = unpack(format_i) - self.e_entry = unpack(format_p) - self.e_phoff = unpack(format_p) - self.e_shoff = unpack(format_p) - self.e_flags = unpack(format_i) - self.e_ehsize = unpack(format_h) - self.e_phentsize = unpack(format_h) - self.e_phnum = unpack(format_h) - self.e_shentsize = unpack(format_h) - self.e_shnum = unpack(format_h) - self.e_shstrndx = unpack(format_h) - - -def _get_elf_header() -> Optional[_ELFFileHeader]: +from typing import Dict, Generator, Iterator, NamedTuple, Optional, Tuple + +from ._elffile import EIClass, EIData, ELFFile, EMachine + +EF_ARM_ABIMASK = 0xFF000000 +EF_ARM_ABI_VER5 = 0x05000000 +EF_ARM_ABI_FLOAT_HARD = 0x00000400 + + +@contextlib.contextmanager +def _parse_elf(path: str) -> Generator[Optional[ELFFile], None, None]: try: - with open(sys.executable, "rb") as f: - elf_header = _ELFFileHeader(f) - except (OSError, TypeError, _ELFFileHeader._InvalidELFFileHeader): - return None - return elf_header + with open(path, "rb") as f: + yield ELFFile(f) + except (OSError, TypeError, ValueError): + yield None -def _is_linux_armhf() -> bool: +def _is_linux_armhf(executable: str) -> bool: # hard-float ABI can be detected from the ELF header of the running # process # https://static.docs.arm.com/ihi0044/g/aaelf32.pdf - elf_header = _get_elf_header() - if elf_header is None: - return False - result = elf_header.e_ident_class == elf_header.ELFCLASS32 - result &= elf_header.e_ident_data == elf_header.ELFDATA2LSB - result &= elf_header.e_machine == elf_header.EM_ARM - result &= ( - elf_header.e_flags & elf_header.EF_ARM_ABIMASK - ) == elf_header.EF_ARM_ABI_VER5 - result &= ( - elf_header.e_flags & elf_header.EF_ARM_ABI_FLOAT_HARD - ) == elf_header.EF_ARM_ABI_FLOAT_HARD - return result - - -def _is_linux_i686() -> bool: - elf_header = _get_elf_header() - if elf_header is None: - return False - result = elf_header.e_ident_class == elf_header.ELFCLASS32 - result &= elf_header.e_ident_data == elf_header.ELFDATA2LSB - result &= elf_header.e_machine == elf_header.EM_386 - return result + with _parse_elf(executable) as f: + return ( + f is not None + and f.capacity == EIClass.C32 + and f.encoding == EIData.Lsb + and f.machine == EMachine.Arm + and f.flags & EF_ARM_ABIMASK == EF_ARM_ABI_VER5 + and f.flags & EF_ARM_ABI_FLOAT_HARD == EF_ARM_ABI_FLOAT_HARD + ) + + +def _is_linux_i686(executable: str) -> bool: + with _parse_elf(executable) as f: + return ( + f is not None + and f.capacity == EIClass.C32 + and f.encoding == EIData.Lsb + and f.machine == EMachine.I386 + ) -def _have_compatible_abi(arch: str) -> bool: +def _have_compatible_abi(executable: str, arch: str) -> bool: if arch == "armv7l": - return _is_linux_armhf() + return _is_linux_armhf(executable) if arch == "i686": - return _is_linux_i686() + return _is_linux_i686(executable) return arch in {"x86_64", "aarch64", "ppc64", "ppc64le", "s390x"} @@ -141,10 +78,10 @@ def _glibc_version_string_confstr() -> Optional[str]: # platform module. # https://github.com/python/cpython/blob/fcf1d003bf4f0100c/Lib/platform.py#L175-L183 try: - # os.confstr("CS_GNU_LIBC_VERSION") returns a string like "glibc 2.17". - version_string = os.confstr("CS_GNU_LIBC_VERSION") + # Should be a string like "glibc 2.17". + version_string: str = getattr(os, "confstr")("CS_GNU_LIBC_VERSION") assert version_string is not None - _, version = version_string.split() + _, version = version_string.rsplit() except (AssertionError, AttributeError, OSError, ValueError): # os.confstr() or CS_GNU_LIBC_VERSION not available (or a bad value)... return None @@ -211,8 +148,8 @@ def _parse_glibc_version(version_str: str) -> Tuple[int, int]: m = re.match(r"(?P[0-9]+)\.(?P[0-9]+)", version_str) if not m: warnings.warn( - "Expected glibc version with 2 components major.minor," - " got: %s" % version_str, + f"Expected glibc version with 2 components major.minor," + f" got: {version_str}", RuntimeWarning, ) return -1, -1 @@ -265,7 +202,7 @@ def _is_compatible(name: str, arch: str, version: _GLibCVersion) -> bool: def platform_tags(linux: str, arch: str) -> Iterator[str]: - if not _have_compatible_abi(arch): + if not _have_compatible_abi(sys.executable, arch): return # Oldest glibc to be supported regardless of architecture is (2, 17). too_old_glibc2 = _GLibCVersion(2, 16) diff --git a/packaging/_musllinux.py b/packaging/_musllinux.py index d5d3e044..706ba600 100644 --- a/packaging/_musllinux.py +++ b/packaging/_musllinux.py @@ -4,70 +4,13 @@ linked against musl, and what musl version is used. """ -import contextlib import functools -import operator -import os import re -import struct import subprocess import sys -from typing import IO, Iterator, NamedTuple, Optional, Tuple +from typing import Iterator, NamedTuple, Optional - -def _read_unpacked(f: IO[bytes], fmt: str) -> Tuple[int, ...]: - return struct.unpack(fmt, f.read(struct.calcsize(fmt))) - - -def _parse_ld_musl_from_elf(f: IO[bytes]) -> Optional[str]: - """Detect musl libc location by parsing the Python executable. - - Based on: https://gist.github.com/lyssdod/f51579ae8d93c8657a5564aefc2ffbca - ELF header: https://refspecs.linuxfoundation.org/elf/gabi4+/ch4.eheader.html - """ - f.seek(0) - try: - ident = _read_unpacked(f, "16B") - except struct.error: - return None - if ident[:4] != tuple(b"\x7fELF"): # Invalid magic, not ELF. - return None - f.seek(struct.calcsize("HHI"), 1) # Skip file type, machine, and version. - - try: - # e_fmt: Format for program header. - # p_fmt: Format for section header. - # p_idx: Indexes to find p_type, p_offset, and p_filesz. - e_fmt, p_fmt, p_idx = { - (1, 1): ("IIIIHHH", ">IIIIIIII", (0, 1, 4)), # 32-bit MSB. - (2, 1): ("QQQIHHH", ">IIQQQQQQ", (0, 2, 5)), # 64-bit MSB. - }[(ident[4], ident[5])] - except KeyError: - return None - else: - p_get = operator.itemgetter(*p_idx) - - # Find the interpreter section and return its content. - try: - _, e_phoff, _, _, _, e_phentsize, e_phnum = _read_unpacked(f, e_fmt) - except struct.error: - return None - for i in range(e_phnum + 1): - f.seek(e_phoff + e_phentsize * i) - try: - p_type, p_offset, p_filesz = p_get(_read_unpacked(f, p_fmt)) - except struct.error: - return None - if p_type != 3: # Not PT_INTERP. - continue - f.seek(p_offset) - interpreter = os.fsdecode(f.read(p_filesz)).strip("\0") - if "musl" not in interpreter: - return None - return interpreter - return None +from ._elffile import ELFFile class _MuslVersion(NamedTuple): @@ -97,13 +40,12 @@ def _get_musl_version(executable: str) -> Optional[_MuslVersion]: Version 1.2.2 Dynamic Program Loader """ - with contextlib.ExitStack() as stack: - try: - f = stack.enter_context(open(executable, "rb")) - except OSError: - return None - ld = _parse_ld_musl_from_elf(f) - if not ld: + try: + with open(executable, "rb") as f: + ld = ELFFile(f).interpreter + except (OSError, TypeError, ValueError): + return None + if ld is None or "musl" not in ld: return None proc = subprocess.run([ld], stderr=subprocess.PIPE, universal_newlines=True) return _parse_musl_version(proc.stderr) diff --git a/tests/test_elffile.py b/tests/test_elffile.py new file mode 100644 index 00000000..6b46ddc6 --- /dev/null +++ b/tests/test_elffile.py @@ -0,0 +1,100 @@ +import io +import pathlib +import struct + +import pytest + +from packaging._elffile import EIClass, EIData, ELFFile, ELFInvalid, EMachine + +DIR_MANYLINUX = pathlib.Path(__file__, "..", "manylinux").resolve() +DIR_MUSLLINUX = pathlib.Path(__file__, "..", "musllinux").resolve() +BIN_MUSL_X86_64 = DIR_MUSLLINUX.joinpath("musl-x86_64").read_bytes() + + +@pytest.mark.parametrize( + "name, capacity, encoding, machine", + [ + ("x86_64-x32", EIClass.C32, EIData.Lsb, EMachine.X8664), + ("x86_64-i386", EIClass.C32, EIData.Lsb, EMachine.I386), + ("x86_64-amd64", EIClass.C64, EIData.Lsb, EMachine.X8664), + ("armv7l-armel", EIClass.C32, EIData.Lsb, EMachine.Arm), + ("armv7l-armhf", EIClass.C32, EIData.Lsb, EMachine.Arm), + ("s390x-s390x", EIClass.C64, EIData.Msb, EMachine.S390), + ], +) +def test_elffile_glibc(name, capacity, encoding, machine): + path = DIR_MANYLINUX.joinpath(f"hello-world-{name}") + with path.open("rb") as f: + ef = ELFFile(f) + assert ef.capacity == capacity + assert ef.encoding == encoding + assert ef.machine == machine + assert ef.flags is not None + + +@pytest.mark.parametrize( + "name, capacity, encoding, machine, interpreter", + [ + ( + "aarch64", + EIClass.C64, + EIData.Lsb, + EMachine.AArc64, + "aarch64", + ), + ("i386", EIClass.C32, EIData.Lsb, EMachine.I386, "i386"), + ("x86_64", EIClass.C64, EIData.Lsb, EMachine.X8664, "x86_64"), + ], +) +def test_elffile_musl(name, capacity, encoding, machine, interpreter): + path = DIR_MUSLLINUX.joinpath(f"musl-{name}") + with path.open("rb") as f: + ef = ELFFile(f) + assert ef.capacity == capacity + assert ef.encoding == encoding + assert ef.machine == machine + assert ef.interpreter == f"/lib/ld-musl-{interpreter}.so.1" + + +@pytest.mark.parametrize( + "data", + [ + # Too short for magic. + b"\0", + # Enough for magic, but not ELF. + b"#!/bin/bash" + b"\0" * 16, + # ELF, but unknown byte declaration. + b"\x7fELF\3" + b"\0" * 16, + ], + ids=["no-magic", "wrong-magic", "unknown-format"], +) +def test_elffile_bad_ident(data): + with pytest.raises(ELFInvalid): + ELFFile(io.BytesIO(data)) + + +def test_elffile_no_section(): + """Enough for magic, but not the section definitions.""" + data = BIN_MUSL_X86_64[:25] + with pytest.raises(ELFInvalid): + ELFFile(io.BytesIO(data)) + + +def test_elffile_invalid_section(): + """Enough for section definitions, but not the actual sections.""" + data = BIN_MUSL_X86_64[:58] + assert ELFFile(io.BytesIO(data)).interpreter is None + + +def test_elffle_no_interpreter_section(): + ef = ELFFile(io.BytesIO(BIN_MUSL_X86_64)) + + # Change all sections to *not* PT_INTERP. + data = BIN_MUSL_X86_64 + for i in range(ef._e_phnum + 1): + sb = ef._e_phoff + ef._e_phentsize * i + se = sb + ef._e_phentsize + section = struct.unpack(ef._p_fmt, data[sb:se]) + data = data[:sb] + struct.pack(ef._p_fmt, 0, *section[1:]) + data[se:] + + assert ELFFile(io.BytesIO(data)).interpreter is None diff --git a/tests/test_manylinux.py b/tests/test_manylinux.py index a04db159..dafdfc3d 100644 --- a/tests/test_manylinux.py +++ b/tests/test_manylinux.py @@ -13,15 +13,12 @@ from packaging import _manylinux from packaging._manylinux import ( - _ELFFileHeader, - _get_elf_header, _get_glibc_version, _glibc_version_string, _glibc_version_string_confstr, _glibc_version_string_ctypes, _is_compatible, - _is_linux_armhf, - _is_linux_i686, + _parse_elf, _parse_glibc_version, ) @@ -167,80 +164,10 @@ def test_glibc_version_string_none(monkeypatch): assert not _is_compatible("any", "any", (2, 4)) -def test_is_linux_armhf_not_elf(monkeypatch): - monkeypatch.setattr(_manylinux, "_get_elf_header", lambda: None) - assert not _is_linux_armhf() - - -def test_is_linux_i686_not_elf(monkeypatch): - monkeypatch.setattr(_manylinux, "_get_elf_header", lambda: None) - assert not _is_linux_i686() - - -@pytest.mark.parametrize( - "machine, abi, elf_class, elf_data, elf_machine", - [ - ( - "x86_64", - "x32", - _ELFFileHeader.ELFCLASS32, - _ELFFileHeader.ELFDATA2LSB, - _ELFFileHeader.EM_X86_64, - ), - ( - "x86_64", - "i386", - _ELFFileHeader.ELFCLASS32, - _ELFFileHeader.ELFDATA2LSB, - _ELFFileHeader.EM_386, - ), - ( - "x86_64", - "amd64", - _ELFFileHeader.ELFCLASS64, - _ELFFileHeader.ELFDATA2LSB, - _ELFFileHeader.EM_X86_64, - ), - ( - "armv7l", - "armel", - _ELFFileHeader.ELFCLASS32, - _ELFFileHeader.ELFDATA2LSB, - _ELFFileHeader.EM_ARM, - ), - ( - "armv7l", - "armhf", - _ELFFileHeader.ELFCLASS32, - _ELFFileHeader.ELFDATA2LSB, - _ELFFileHeader.EM_ARM, - ), - ( - "s390x", - "s390x", - _ELFFileHeader.ELFCLASS64, - _ELFFileHeader.ELFDATA2MSB, - _ELFFileHeader.EM_S390, - ), - ], -) -def test_get_elf_header(monkeypatch, machine, abi, elf_class, elf_data, elf_machine): - path = os.path.join( - os.path.dirname(__file__), - "manylinux", - f"hello-world-{machine}-{abi}", - ) - monkeypatch.setattr(sys, "executable", path) - elf_header = _get_elf_header() - assert elf_header.e_ident_class == elf_class - assert elf_header.e_ident_data == elf_data - assert elf_header.e_machine == elf_machine - - @pytest.mark.parametrize( "content", [None, "invalid-magic", "invalid-class", "invalid-data", "too-short"] ) -def test_get_elf_header_bad_executable(monkeypatch, content): +def test_parse_elf_bad_executable(monkeypatch, content): if content: path = os.path.join( os.path.dirname(__file__), @@ -249,5 +176,5 @@ def test_get_elf_header_bad_executable(monkeypatch, content): ) else: path = None - monkeypatch.setattr(sys, "executable", path) - assert _get_elf_header() is None + with _parse_elf(path) as ef: + assert ef is None diff --git a/tests/test_musllinux.py b/tests/test_musllinux.py index 2623bdbc..c2ab8601 100644 --- a/tests/test_musllinux.py +++ b/tests/test_musllinux.py @@ -1,19 +1,12 @@ import collections -import io import pathlib -import struct import subprocess import pretend import pytest from packaging import _musllinux -from packaging._musllinux import ( - _get_musl_version, - _MuslVersion, - _parse_ld_musl_from_elf, - _parse_musl_version, -) +from packaging._musllinux import _get_musl_version, _MuslVersion, _parse_musl_version MUSL_AMD64 = "musl libc (x86_64)\nVersion 1.2.2\n" MUSL_I386 = "musl libc (i386)\nVersion 1.2.1\n" @@ -54,66 +47,6 @@ def test_parse_musl_version(output, version): assert _parse_musl_version(output) == version -@pytest.mark.parametrize( - "executable, location", - [ - (BIN_GLIBC_X86_64, None), - (BIN_MUSL_X86_64, LD_MUSL_X86_64), - (BIN_MUSL_I386, LD_MUSL_I386), - (BIN_MUSL_AARCH64, LD_MUSL_AARCH64), - ], - ids=["glibc", "x86_64", "i386", "aarch64"], -) -def test_parse_ld_musl_from_elf(executable, location): - with executable.open("rb") as f: - assert _parse_ld_musl_from_elf(f) == location - - -@pytest.mark.parametrize( - "data", - [ - # Too short for magic. - b"\0", - # Enough for magic, but not ELF. - b"#!/bin/bash" + b"\0" * 16, - # ELF, but unknown byte declaration. - b"\x7fELF\3" + b"\0" * 16, - ], - ids=["no-magic", "wrong-magic", "unknown-format"], -) -def test_parse_ld_musl_from_elf_invalid(data): - assert _parse_ld_musl_from_elf(io.BytesIO(data)) is None - - -@pytest.mark.parametrize( - "head", - [ - 25, # Enough for magic, but not the section definitions. - 58, # Enough for section definitions, but not the actual sections. - ], -) -def test_parse_ld_musl_from_elf_invalid_section(head): - data = BIN_MUSL_X86_64.read_bytes()[:head] - assert _parse_ld_musl_from_elf(io.BytesIO(data)) is None - - -def test_parse_ld_musl_from_elf_no_interpreter_section(): - with BIN_MUSL_X86_64.open("rb") as f: - data = f.read() - - # Change all sections to *not* PT_INTERP. We are explicitly using LSB rules - # because the binaries are in LSB. - unpacked = struct.unpack("<16BHHIQQQIHHH", data[:58]) - *_, e_phoff, _, _, _, e_phentsize, e_phnum = unpacked - for i in range(e_phnum + 1): - sb = e_phoff + e_phentsize * i - se = sb + 56 - section = struct.unpack("