Skip to content

Commit

Permalink
Refactor ELF parsing logic to standalone class
Browse files Browse the repository at this point in the history
This makes it easier to reuse the same code for manylinux, musllinux,
and test code.
  • Loading branch information
uranusjr committed Jun 3, 2022
1 parent a339dd3 commit 2388094
Show file tree
Hide file tree
Showing 6 changed files with 277 additions and 331 deletions.
108 changes: 108 additions & 0 deletions packaging/_elffile.py
@@ -0,0 +1,108 @@
"""ELF file parser.
This provides a class ``ElfFile`` that parses an ELF executable in a similar
interface to ``ZipFile``. Only the read interface is implemented.
Based on: https://gist.github.com/lyssdod/f51579ae8d93c8657a5564aefc2ffbca
ELF header: https://refspecs.linuxfoundation.org/elf/gabi4+/ch4.eheader.html
"""

import enum
import os
import struct
from typing import IO, Any, ContextManager, Optional, Tuple


class ElfInvalid(ValueError):
pass


class EIClass(enum.IntEnum):
C32 = 1
C64 = 2


class EIData(enum.IntEnum):
Lsb = 1
Msb = 2


class EMachine(enum.IntEnum):
I386 = 3
S390 = 22
Arm = 40
X8664 = 62
AArc64 = 183


class ElfFile(ContextManager["ElfFile"]):
"""Representation of an ELF executable."""

def __init__(self, f: IO[bytes]) -> None:
self._f = f

try:
ident = self._read("16B")
except struct.error:
raise ElfInvalid
if bytes(ident[:4]) != b"\x7fELF": # Invalid magic, not ELF.
raise ElfInvalid

self.capacity = ident[4] # Format for program header (bitness).
self.encoding = ident[5] # Data structure encoding (endianess).

try:
# e_fmt: Format for program header.
# p_fmt: Format for section header.
# p_idx: Indexes to find p_type, p_offset, and p_filesz.
e_fmt, self._p_fmt, self._p_idx = {
(1, 1): ("<HHIIIIIHHH", "<IIIIIIII", (0, 1, 4)), # 32-bit LSB.
(1, 2): (">HHIIIIIHHH", ">IIIIIIII", (0, 1, 4)), # 32-bit MSB.
(2, 1): ("<HHIQQQIHHH", "<IIQQQQQQ", (0, 2, 5)), # 64-bit LSB.
(2, 2): (">HHIQQQIHHH", ">IIQQQQQQ", (0, 2, 5)), # 64-bit MSB.
}[(self.capacity, self.encoding)]
except KeyError:
raise ElfInvalid

try:
(
_,
self.machine, # Architecture type.
_,
_,
self._e_phoff, # Offset of program header.
_,
self.flags, # Processor-specific flags.
_,
self._e_phentsize, # Size of section.
self._e_phnum, # Number of sections.
) = self._read(e_fmt)
except struct.error:
raise ElfInvalid

def __enter__(self) -> "ElfFile":
return self

def __exit__(self, typ: Any, val: Any, tb: Any) -> None:
self.close()

def _read(self, fmt: str) -> Tuple[int, ...]:
return struct.unpack(fmt, self._f.read(struct.calcsize(fmt)))

def close(self) -> None:
self._f.close()

@property
def interpreter(self) -> Optional[str]:
"""Path recorded in the ``PT_INTERP`` section header."""
for index in range(self._e_phnum):
self._f.seek(self._e_phoff + self._e_phentsize * index)
try:
data = self._read(self._p_fmt)
except struct.error:
continue
if data[self._p_idx[0]] != 3: # Not PT_INTERP.
continue
self._f.seek(data[self._p_idx[1]])
return os.fsdecode(self._f.read(data[self._p_idx[2]])).strip("\0")
return None
147 changes: 40 additions & 107 deletions packaging/_manylinux.py
Expand Up @@ -2,120 +2,47 @@
import functools
import os
import re
import struct
import sys
import warnings
from typing import IO, Dict, Iterator, NamedTuple, Optional, Tuple


# Python does not provide platform information at sufficient granularity to
# identify the architecture of the running executable in some cases, so we
# determine it dynamically by reading the information from the running
# process. This only applies on Linux, which uses the ELF format.
class _ELFFileHeader:
# https://en.wikipedia.org/wiki/Executable_and_Linkable_Format#File_header
class _InvalidELFFileHeader(ValueError):
"""
An invalid ELF file header was found.
"""

ELF_MAGIC_NUMBER = 0x7F454C46
ELFCLASS32 = 1
ELFCLASS64 = 2
ELFDATA2LSB = 1
ELFDATA2MSB = 2
EM_386 = 3
EM_S390 = 22
EM_ARM = 40
EM_X86_64 = 62
EF_ARM_ABIMASK = 0xFF000000
EF_ARM_ABI_VER5 = 0x05000000
EF_ARM_ABI_FLOAT_HARD = 0x00000400

def __init__(self, file: IO[bytes]) -> None:
def unpack(fmt: str) -> int:
try:
data = file.read(struct.calcsize(fmt))
result: Tuple[int, ...] = struct.unpack(fmt, data)
except struct.error:
raise _ELFFileHeader._InvalidELFFileHeader()
return result[0]

self.e_ident_magic = unpack(">I")
if self.e_ident_magic != self.ELF_MAGIC_NUMBER:
raise _ELFFileHeader._InvalidELFFileHeader()
self.e_ident_class = unpack("B")
if self.e_ident_class not in {self.ELFCLASS32, self.ELFCLASS64}:
raise _ELFFileHeader._InvalidELFFileHeader()
self.e_ident_data = unpack("B")
if self.e_ident_data not in {self.ELFDATA2LSB, self.ELFDATA2MSB}:
raise _ELFFileHeader._InvalidELFFileHeader()
self.e_ident_version = unpack("B")
self.e_ident_osabi = unpack("B")
self.e_ident_abiversion = unpack("B")
self.e_ident_pad = file.read(7)
format_h = "<H" if self.e_ident_data == self.ELFDATA2LSB else ">H"
format_i = "<I" if self.e_ident_data == self.ELFDATA2LSB else ">I"
format_q = "<Q" if self.e_ident_data == self.ELFDATA2LSB else ">Q"
format_p = format_i if self.e_ident_class == self.ELFCLASS32 else format_q
self.e_type = unpack(format_h)
self.e_machine = unpack(format_h)
self.e_version = unpack(format_i)
self.e_entry = unpack(format_p)
self.e_phoff = unpack(format_p)
self.e_shoff = unpack(format_p)
self.e_flags = unpack(format_i)
self.e_ehsize = unpack(format_h)
self.e_phentsize = unpack(format_h)
self.e_phnum = unpack(format_h)
self.e_shentsize = unpack(format_h)
self.e_shnum = unpack(format_h)
self.e_shstrndx = unpack(format_h)


def _get_elf_header() -> Optional[_ELFFileHeader]:
try:
with open(sys.executable, "rb") as f:
elf_header = _ELFFileHeader(f)
except (OSError, TypeError, _ELFFileHeader._InvalidELFFileHeader):
return None
return elf_header
from typing import Dict, Iterator, NamedTuple, Optional, Tuple

from ._elffile import EIClass, EIData, ElfFile, EMachine

EF_ARM_ABIMASK = 0xFF000000
EF_ARM_ABI_VER5 = 0x05000000
EF_ARM_ABI_FLOAT_HARD = 0x00000400

def _is_linux_armhf() -> bool:

def _is_linux_armhf(f: ElfFile) -> bool:
# hard-float ABI can be detected from the ELF header of the running
# process
# https://static.docs.arm.com/ihi0044/g/aaelf32.pdf
elf_header = _get_elf_header()
if elf_header is None:
if f.capacity != EIClass.C32:
return False
if f.encoding != EIData.Lsb:
return False
if f.machine != EMachine.Arm:
return False
result = elf_header.e_ident_class == elf_header.ELFCLASS32
result &= elf_header.e_ident_data == elf_header.ELFDATA2LSB
result &= elf_header.e_machine == elf_header.EM_ARM
result &= (
elf_header.e_flags & elf_header.EF_ARM_ABIMASK
) == elf_header.EF_ARM_ABI_VER5
result &= (
elf_header.e_flags & elf_header.EF_ARM_ABI_FLOAT_HARD
) == elf_header.EF_ARM_ABI_FLOAT_HARD
return result


def _is_linux_i686() -> bool:
elf_header = _get_elf_header()
if elf_header is None:
if f.flags & EF_ARM_ABIMASK != EF_ARM_ABI_VER5:
return False
result = elf_header.e_ident_class == elf_header.ELFCLASS32
result &= elf_header.e_ident_data == elf_header.ELFDATA2LSB
result &= elf_header.e_machine == elf_header.EM_386
return result
if f.flags & EF_ARM_ABI_FLOAT_HARD != EF_ARM_ABI_FLOAT_HARD:
return False
return True


def _is_linux_i686(f: ElfFile) -> bool:
return (
f.capacity == EIClass.C32
and f.encoding == EIData.Lsb
and f.machine == EMachine.I386
)

def _have_compatible_abi(arch: str) -> bool:

def _have_compatible_abi(f: ElfFile, arch: str) -> bool:
if arch == "armv7l":
return _is_linux_armhf()
return _is_linux_armhf(f)
if arch == "i686":
return _is_linux_i686()
return _is_linux_i686(f)
return arch in {"x86_64", "aarch64", "ppc64", "ppc64le", "s390x"}


Expand All @@ -141,10 +68,10 @@ def _glibc_version_string_confstr() -> Optional[str]:
# platform module.
# https://github.com/python/cpython/blob/fcf1d003bf4f0100c/Lib/platform.py#L175-L183
try:
# os.confstr("CS_GNU_LIBC_VERSION") returns a string like "glibc 2.17".
version_string = os.confstr("CS_GNU_LIBC_VERSION")
# Should be a string like "glibc 2.17".
version_string: str = getattr(os, "confstr")("CS_GNU_LIBC_VERSION")
assert version_string is not None
_, version = version_string.split()
_, version = version_string.rsplit()
except (AssertionError, AttributeError, OSError, ValueError):
# os.confstr() or CS_GNU_LIBC_VERSION not available (or a bad value)...
return None
Expand Down Expand Up @@ -211,8 +138,8 @@ def _parse_glibc_version(version_str: str) -> Tuple[int, int]:
m = re.match(r"(?P<major>[0-9]+)\.(?P<minor>[0-9]+)", version_str)
if not m:
warnings.warn(
"Expected glibc version with 2 components major.minor,"
" got: %s" % version_str,
f"Expected glibc version with 2 components major.minor,"
f" got: {version_str}",
RuntimeWarning,
)
return -1, -1
Expand Down Expand Up @@ -265,7 +192,13 @@ def _is_compatible(name: str, arch: str, version: _GLibCVersion) -> bool:


def platform_tags(linux: str, arch: str) -> Iterator[str]:
if not _have_compatible_abi(arch):
try:
with open(sys.executable, "rb") as f:
with ElfFile(f) as ef:
abi_compatible = _have_compatible_abi(ef, arch)
except (OSError, TypeError, ValueError):
abi_compatible = False
if not abi_compatible:
return
# Oldest glibc to be supported regardless of architecture is (2, 17).
too_old_glibc2 = _GLibCVersion(2, 16)
Expand Down
76 changes: 10 additions & 66 deletions packaging/_musllinux.py
Expand Up @@ -4,70 +4,13 @@
linked against musl, and what musl version is used.
"""

import contextlib
import functools
import operator
import os
import re
import struct
import subprocess
import sys
from typing import IO, Iterator, NamedTuple, Optional, Tuple
from typing import Iterator, NamedTuple, Optional


def _read_unpacked(f: IO[bytes], fmt: str) -> Tuple[int, ...]:
return struct.unpack(fmt, f.read(struct.calcsize(fmt)))


def _parse_ld_musl_from_elf(f: IO[bytes]) -> Optional[str]:
"""Detect musl libc location by parsing the Python executable.
Based on: https://gist.github.com/lyssdod/f51579ae8d93c8657a5564aefc2ffbca
ELF header: https://refspecs.linuxfoundation.org/elf/gabi4+/ch4.eheader.html
"""
f.seek(0)
try:
ident = _read_unpacked(f, "16B")
except struct.error:
return None
if ident[:4] != tuple(b"\x7fELF"): # Invalid magic, not ELF.
return None
f.seek(struct.calcsize("HHI"), 1) # Skip file type, machine, and version.

try:
# e_fmt: Format for program header.
# p_fmt: Format for section header.
# p_idx: Indexes to find p_type, p_offset, and p_filesz.
e_fmt, p_fmt, p_idx = {
(1, 1): ("<IIIIHHH", "<IIIIIIII", (0, 1, 4)), # 32-bit LSB.
(1, 2): (">IIIIHHH", ">IIIIIIII", (0, 1, 4)), # 32-bit MSB.
(2, 1): ("<QQQIHHH", "<IIQQQQQQ", (0, 2, 5)), # 64-bit LSB.
(2, 2): (">QQQIHHH", ">IIQQQQQQ", (0, 2, 5)), # 64-bit MSB.
}[(ident[4], ident[5])]
except KeyError:
return None
else:
p_get = operator.itemgetter(*p_idx)

# Find the interpreter section and return its content.
try:
_, e_phoff, _, _, _, e_phentsize, e_phnum = _read_unpacked(f, e_fmt)
except struct.error:
return None
for i in range(e_phnum + 1):
f.seek(e_phoff + e_phentsize * i)
try:
p_type, p_offset, p_filesz = p_get(_read_unpacked(f, p_fmt))
except struct.error:
return None
if p_type != 3: # Not PT_INTERP.
continue
f.seek(p_offset)
interpreter = os.fsdecode(f.read(p_filesz)).strip("\0")
if "musl" not in interpreter:
return None
return interpreter
return None
from ._elffile import ElfFile


class _MuslVersion(NamedTuple):
Expand Down Expand Up @@ -97,13 +40,14 @@ def _get_musl_version(executable: str) -> Optional[_MuslVersion]:
Version 1.2.2
Dynamic Program Loader
"""
with contextlib.ExitStack() as stack:
try:
f = stack.enter_context(open(executable, "rb"))
except OSError:
return None
ld = _parse_ld_musl_from_elf(f)
if not ld:
try:
with open(executable, "rb") as f:
with ElfFile(f) as ef:
ld = ef.interpreter
except (OSError, TypeError, ValueError):
return None

if ld is None or "musl" not in ld:
return None
proc = subprocess.run([ld], stderr=subprocess.PIPE, universal_newlines=True)
return _parse_musl_version(proc.stderr)
Expand Down

0 comments on commit 2388094

Please sign in to comment.