import argparse
import glob
import os
import re
import struct
import sys
import logging

from conda_build.utils import ensure_list, get_logger

logging.basicConfig(level=logging.INFO)

'''
# Detect security flags via readelf (from https://github.com/hugsy/gef)
# .. spawning out to readelf is not something we intend to do though ..
@lru_cache(32)
def checksec(filename):
    """Check the security property of the ELF binary. The following properties are:
    - Canary
    - NX
    - PIE
    - Fortify
    - Partial/Full RelRO.
    Return a Python dict() with the different keys mentioned above, and the boolean
    associated whether the protection was found."""

    try:
        readelf = which("readelf")
    except IOError:
        err("Missing `readelf`")
        return

    def __check_security_property(opt, filename, pattern):
        cmd = [readelf,]
        cmd += opt.split()
        cmd += [filename,]
        lines = gef_execute_external(cmd, as_list=True)
        for line in lines:
            if re.search(pattern, line):
                return True
        return False

    results = collections.OrderedDict()
    results["Canary"] = __check_security_property("-s", filename, r"__stack_chk_fail") is True
    has_gnu_stack = __check_security_property("-W -l", filename, r"GNU_STACK") is True
    if has_gnu_stack:
        results["NX"] = __check_security_property("-W -l", filename, r"GNU_STACK.*RWE") is False
    else:
        results["NX"] = False
    results["PIE"] = __check_security_property("-h", filename, r"Type:.*EXEC") is False
    results["Fortify"] = __check_security_property("-s", filename, r"_chk@GLIBC") is True
    results["Partial RelRO"] = __check_security_property("-l", filename, r"GNU_RELRO") is True
    results["Full RelRO"] = __check_security_property("-d", filename, r"BIND_NOW") is True
    return results
'''

'''
Eventual goal is to become a full replacement for `ldd` `otool -L` and `ntldd'
For now only works with ELF and Mach-O files and command-line execution is not
supported. To get the list of shared libs use `inspect_linkages(filename)`.
'''

LDD_USAGE = """
Usage: ldd [OPTION]... FILE...
      --help              print this help and exit
      --version           print version information and exit
  -d, --data-relocs       process data relocations
  -r, --function-relocs   process data and function relocations
  -u, --unused            print unused direct dependencies
  -v, --verbose           print all information

For bug reporting instructions, please see:
.
"""  # noqa

OTOOL_USAGE = """
Usage: /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/otool [-arch arch_type] [-fahlLDtdorSTMRIHGvVcXmqQjCP] [-mcpu=arg] [--version] ...
    -f print the fat headers
    -a print the archive header
    -h print the mach header
    -l print the load commands
    -L print shared libraries used
    -D print shared library id name
    -t print the text section (disassemble with -v)
    -p  start dissassemble from routine name
    -s   print contents of section
    -d print the data section
    -o print the Objective-C segment
    -r print the relocation entries
    -S print the table of contents of a library
    -T print the table of contents of a dynamic shared library
    -M print the module table of a dynamic shared library
    -R print the reference table of a dynamic shared library
    -I print the indirect symbol table
    -H print the two-level hints table
    -G print the data in code table
    -v print verbosely (symbolically) when possible
    -V print disassembled operands symbolically
    -c print argument strings of a core file
    -X print no leading addresses or headers
    -m don't use archive(member) syntax
    -B force Thumb disassembly (ARM objects only)
    -q use llvm's disassembler (the default)
    -Q use otool(1)'s disassembler
    -mcpu=arg use `arg' as the cpu for disassembly
    -j print opcode bytes
    -P print the info plist section as strings
    -C print linker optimization hints
    --version print the version of /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/otool
"""  # noqa

##############################################
# Constants used in the Mach-O specification #
##############################################

# Magic numbers as seen when the first four bytes are read big-endian.
MH_MAGIC = 0xfeedface
MH_CIGAM = 0xcefaedfe
MH_MAGIC_64 = 0xfeedfacf
MH_CIGAM_64 = 0xcffaedfe
FAT_MAGIC = 0xcafebabe

# struct-module byte-order prefixes.
BIG_ENDIAN = '>'
LITTLE_ENDIAN = '<'

# Load-command identifiers (with the LC_REQ_DYLD bit masked off).
LC_ID_DYLIB = 0xd
LC_LOAD_DYLIB = 0xc
LC_LOAD_WEAK_DYLIB = 0x18
LC_LOAD_UPWARD_DYLIB = 0x23
LC_REEXPORT_DYLIB = 0x1f
LC_LAZY_LOAD_DYLIB = 0x20
LC_LOAD_DYLIBS = (LC_LOAD_DYLIB,
                  LC_LOAD_WEAK_DYLIB,
                  LC_LOAD_UPWARD_DYLIB,
                  LC_LAZY_LOAD_DYLIB,
                  LC_REEXPORT_DYLIB)
LC_REQ_DYLD = 0x80000000
LC_RPATH = 0x1c | LC_REQ_DYLD

majver = sys.version_info[0]
# Largest practical size; used as the "unbounded" default for views and reads.
maxint = sys.maxsize if majver >= 3 else getattr(sys, 'maxint')


class IncompleteRead(Exception):
    """Raised by ReadCheckWrapper when fewer bytes than requested were read."""


class ReadCheckWrapper:
    """
    Wrap a file object so that an incomplete read raises an exception.

    Every attribute other than ``read`` is forwarded to the wrapped object.
    """

    def __init__(self, file_obj):
        self._file_obj = file_obj

    def read(self, size):
        """Read exactly ``size`` bytes or raise IncompleteRead."""
        buf = self._file_obj.read(size)
        if len(buf) != size:
            raise IncompleteRead('requested number of bytes were not read.')
        return buf

    def __getattr__(self, attr):
        # Only reached when normal lookup fails, i.e. never for `read`.
        return getattr(self._file_obj, attr)


class fileview:
    """
    A proxy for file-like objects that exposes a given view of a file.
    Modified from macholib.

    Positions reported by ``tell`` and accepted by ``seek`` are relative to
    ``start``; any access outside ``[start, start + size]`` raises OSError.
    """

    def __init__(self, fileobj, start=0, size=maxint):
        # A view of a view addresses the underlying file directly, so `start`
        # is always an offset into the real file.
        if isinstance(fileobj, fileview):
            self._fileobj = fileobj._fileobj
        else:
            self._fileobj = fileobj
        self._start = start
        self._end = start + size
        self._pos = 0

    def __repr__(self):
        # The original format string was empty, which made repr() raise
        # "TypeError: not all arguments converted during string formatting".
        return '<fileview [%d, %d] %r>' % (
            self._start, self._end, self._fileobj)

    def tell(self):
        """Return the current position relative to the start of the view."""
        return self._pos

    def _checkwindow(self, seekto, op):
        """Raise OSError if absolute offset ``seekto`` is outside the view."""
        if not (self._start <= seekto <= self._end):
            raise OSError("%s to offset %d is outside window [%d, %d]" % (
                op, seekto, self._start, self._end))

    def seek(self, offset, whence=0):
        """Seek within the view; ``whence`` follows the os.SEEK_* constants."""
        seekto = offset
        if whence == os.SEEK_SET:
            seekto += self._start
        elif whence == os.SEEK_CUR:
            seekto += self._start + self._pos
        elif whence == os.SEEK_END:
            seekto += self._end
        else:
            raise OSError(f"Invalid whence argument to seek: {whence!r}")
        self._checkwindow(seekto, 'seek')
        self._fileobj.seek(seekto)
        self._pos = seekto - self._start

    def write(self, bytes):
        """Write ``bytes`` at the current position; must fit in the view."""
        here = self._start + self._pos
        self._checkwindow(here, 'write')
        self._checkwindow(here + len(bytes), 'write')
        self._fileobj.seek(here, os.SEEK_SET)
        self._fileobj.write(bytes)
        self._pos += len(bytes)

    def read(self, size=maxint):
        """Read up to ``size`` bytes, clamped to the end of the view."""
        assert size >= 0
        here = self._start + self._pos
        self._checkwindow(here, 'read')
        size = min(size, self._end - here)
        # Always reposition: other views may share the underlying file.
        self._fileobj.seek(here, os.SEEK_SET)
        data = self._fileobj.read(size)
        self._pos += len(data)
        return data


class UnixExecutable:
    """Common accessors shared by the per-format code-file classes."""

    def __init__(self, file, initial_rpaths_transitive=None):
        # Copy so that neither the caller's list nor a shared default is
        # aliased.  (The original stored this list in `dt_soname` instead.)
        self.rpaths_transitive = list(initial_rpaths_transitive or [])
        self.rpaths_nontransitive = []
        self.shared_libraries = []
        self.dt_runpath = []
        # Same "no soname" sentinel that elffile uses.
        self.dt_soname = '$EXECUTABLE'

    def get_rpaths_transitive(self):
        return self.rpaths_transitive

    def get_rpaths_nontransitive(self):
        return self.rpaths_nontransitive

    def get_shared_libraries(self):
        return self.shared_libraries

    def is_executable(self):
        return True

    def get_runpaths(self):
        return self.dt_runpath

    def get_soname(self):
        return self.dt_soname


def read_data(file, endian, num=1):
    """
    Read a given number of 32-bits unsigned integers from the given file
    with the given endianness.

    Returns a single int when ``num`` is 1, otherwise a tuple of ints.
    """
    res = struct.unpack(endian + 'L' * num, file.read(num * 4))
    if len(res) == 1:
        return res[0]
    return res


def _read_lc_string(file, where, endian, cmdsize):
    """
    Read the NUL-terminated string carried by the load command at ``where``.

    The first data field of the command is the offset of the string, counted
    from the beginning of the command.  Returns ``(name_offset, string)``.
    """
    name_offset = read_data(file, endian)
    file.seek(where + name_offset, os.SEEK_SET)
    raw = file.read(cmdsize - name_offset)
    # Decode only up to the first NUL so that padding bytes after it can
    # neither break decoding nor (when no NUL is present) raise ValueError.
    return name_offset, raw.split(b'\0', 1)[0].decode()


def replace_lc_load_dylib(file, where, bits, endian, cmd, cmdsize, what, val):
    """
    Load-command callback: overwrite the dylib name ``what`` with ``val``.

    Returns True when the command was a dylib-load command naming ``what``
    (and was rewritten), False otherwise.  The caller must ensure ``val`` is
    not longer than ``what``.
    """
    if (cmd & ~LC_REQ_DYLD) in LC_LOAD_DYLIBS:
        name_offset, load = _read_lc_string(file, where, endian, cmdsize)
        # If the string is what is being replaced, overwrite it.
        if load == what:
            file.seek(where + name_offset, os.SEEK_SET)
            file.write(val.encode() + b'\0')
            return True
    return False


def find_lc_load_dylib(file, where, bits, endian, cmd, cmdsize, what):
    """
    Load-command callback: return the dylib name if it matches regex ``what``.

    Returns None for non-dylib commands and for names that do not match.
    """
    if (cmd & ~LC_REQ_DYLD) in LC_LOAD_DYLIBS:
        _name_offset, load = _read_lc_string(file, where, endian, cmdsize)
        if re.match(what, load):
            return load
    return None


def find_lc_rpath(file, where, bits, endian, cmd, cmdsize):
    """
    Load-command callback: return the path of an LC_RPATH command.

    Returns None for every other command.
    """
    if cmd == LC_RPATH:
        _name_offset, load = _read_lc_string(file, where, endian, cmdsize)
        return load
    return None


def do_macho(file, bits, endian, lc_operation, *args):
    """
    Apply ``lc_operation`` to every load command of one Mach-O image.

    The magic number is assumed to have been read by the caller.  Returns
    ``(filetype, results)`` where ``results`` holds one callback return value
    per load command, in file order.
    """
    # Read Mach-O header (the magic number is assumed read by the caller)
    _cputype, _cpusubtype, filetype, ncmds, _sizeofcmds, _flags \
        = read_data(file, endian, 6)
    # 64-bits header has one more field.
    if bits == 64:
        read_data(file, endian)
    # The header is followed by ncmds commands
    results = []
    for _n in range(ncmds):
        where = file.tell()
        # Read command header
        cmd, cmdsize = read_data(file, endian, 2)
        results.append(lc_operation(file, where, bits, endian, cmd, cmdsize,
                                    *args))
        # Seek to the next command
        file.seek(where + cmdsize, os.SEEK_SET)
    return filetype, results


class offset_size:
    """An (offset, size) pair describing a window into a file."""

    def __init__(self, offset=0, size=maxint):
        self.offset = offset
        self.size = size


def do_file(file, lc_operation, off_sz, arch, results, *args):
    """
    Run ``lc_operation`` over every Mach-O image in ``file`` matching ``arch``.

    Fat binaries are walked recursively.  One ``do_macho`` result is appended
    to ``results`` per processed image; images whose magic does not match
    ``arch`` (and non-Mach-O files) are silently skipped.  ``arch`` only
    selects by word size and byte order, the CPU type is not inspected.
    """
    file = fileview(file, off_sz.offset, off_sz.size)
    # Read magic number
    magic = read_data(file, BIG_ENDIAN)
    if magic == FAT_MAGIC:
        # Fat binaries contain nfat_arch Mach-O binaries
        nfat_arch = read_data(file, BIG_ENDIAN)
        for _n in range(nfat_arch):
            # Read arch header
            _cputype, _cpusubtype, offset, size, _align = \
                read_data(file, BIG_ENDIAN, 5)
            do_file(file, lc_operation, offset_size(offset, size), arch,
                    results, *args)
    elif magic == MH_MAGIC and arch in ('any', 'ppc32', 'm68k'):
        results.append(do_macho(file, 32, BIG_ENDIAN, lc_operation, *args))
    elif magic == MH_CIGAM and arch in ('any', 'i386'):
        results.append(do_macho(file, 32, LITTLE_ENDIAN, lc_operation, *args))
    elif magic == MH_MAGIC_64 and arch in ('any', 'ppc64'):
        results.append(do_macho(file, 64, BIG_ENDIAN, lc_operation, *args))
    elif magic == MH_CIGAM_64 and arch in ('any', 'x86_64', 'arm64'):
        # arm64 images are 64-bit little-endian just like x86_64 ones.
        results.append(do_macho(file, 64, LITTLE_ENDIAN, lc_operation, *args))


def mach_o_change(path, arch, what, value):
    """
    Replace a given name (what) in any LC_LOAD_DYLIB command found in
    the given binary with a new name (value), provided it's shorter.

    The file at ``path`` is modified in place.  Returns the list of
    ``(filetype, per-command booleans)`` tuples produced by ``do_file``.
    """
    # A longer name would overrun the space reserved by the load command.
    assert len(what) >= len(value)

    results = []
    with open(path, 'r+b') as f:
        do_file(f, replace_lc_load_dylib, offset_size(), arch, results,
                what, value)
    return results


def mach_o_find_dylibs(ofile, arch, regex='.*'):
    """
    Finds the executable's view of where any dylibs live
    without resolving any macros (@rpath, @loader_path, @executable_path)
    """
    results = []
    do_file(ofile, find_lc_load_dylib, offset_size(), arch, results, regex)
    return results


def mach_o_find_rpaths(ofile, arch):
    """
    Finds ofile's list of rpaths
    """
    results = []
    do_file(ofile, find_lc_rpath, offset_size(), arch, results)
    return results


def _get_resolved_location(codefile,
                           unresolved,
                           exe_dir,
                           self_dir,
                           LD_LIBRARY_PATH='',
                           default_paths=None,
                           sysroot='',
                           resolved_rpath=None):
    '''
    From `man ld.so`

    When resolving shared object dependencies, the dynamic linker first inspects each dependency
    string to see if it contains a slash (this can occur if a shared object pathname containing
    slashes was specified at link time).  If a slash is found, then the dependency string is
    interpreted as a (relative or absolute) pathname, and the shared object is loaded using that
    pathname.

    If a shared object dependency does not contain a slash, then it is searched for in the
    following order:

    o Using the directories specified in the DT_RPATH dynamic section attribute of the binary
      if present and DT_RUNPATH attribute does not exist.  Use of DT_RPATH is deprecated.

    o Using the environment variable LD_LIBRARY_PATH (unless the executable is being run in
      secure-execution mode; see below).  in which case it is ignored.

    o Using the directories specified in the DT_RUNPATH dynamic section attribute of the
      binary if present. Such directories are searched only to find those objects required
      by DT_NEEDED (direct dependencies) entries and do not apply to those objects' children,
      which must themselves have their own DT_RUNPATH entries. This is unlike DT_RPATH,
      which is applied to searches for all children in the dependency tree.

    o From the cache file /etc/ld.so.cache, which contains a compiled list of candidate
      shared objects previously found in the augmented library path. If, however, the binary
      was linked with the -z nodeflib linker option, shared objects in the default paths are
      skipped. Shared objects installed in hardware capability directories (see below) are
      preferred to other shared objects.

    o In the default path /lib, and then /usr/lib. (On some 64-bit architectures, the default
      paths for 64-bit shared objects are /lib64, and then /usr/lib64.)  If the binary was
      linked with the -z nodeflib linker option, this step is skipped.

    Returns a tuple of resolved location, rpath_used, in_sysroot
    '''
    ld_library_paths = LD_LIBRARY_PATH.split(':') if LD_LIBRARY_PATH else []

    if unresolved.startswith('$RPATH'):
        if resolved_rpath:
            # The caller already knows which rpath applies; use it as-is.
            these_rpaths = [resolved_rpath]
        else:
            these_rpaths = (
                codefile.get_rpaths_transitive() +
                ld_library_paths +
                codefile.get_rpaths_nontransitive() +
                [dp.replace('$SYSROOT', sysroot)
                 for dp in ensure_list(default_paths)])
        for rpath in these_rpaths:
            resolved = unresolved.replace('$RPATH', rpath) \
                                 .replace('$SELFDIR', self_dir) \
                                 .replace('$EXEDIR', exe_dir)
            exists = os.path.exists(resolved)
            exists_sysroot = bool(
                exists and sysroot and resolved.startswith(sysroot))
            if resolved_rpath or exists or exists_sysroot:
                return resolved, rpath, exists_sysroot
        # Return the so name so that it can be warned about as missing.
        return unresolved, None, False

    if any(a in unresolved for a in ('$SELFDIR', '$EXEDIR')):
        resolved = unresolved.replace('$SELFDIR', self_dir) \
                             .replace('$EXEDIR', exe_dir)
        exists = os.path.exists(resolved)
        exists_sysroot = bool(
            exists and sysroot and resolved.startswith(sysroot))
        return resolved, None, exists_sysroot

    # No macros: absolute paths are taken verbatim, relative ones are
    # interpreted relative to the directory of the loading object.
    if unresolved.startswith('/'):
        return unresolved, None, False
    return os.path.join(self_dir, unresolved), None, False


def _get_resolved_relocated_location(codefile, so, src_exedir, src_selfdir,
                                     dst_exedir, dst_selfdir):
    """
    Resolve ``so`` both in its source location and in a destination location.

    Returns ``(src_resolved, dst_resolved, in_sysroot)``.  Libraries found in
    the sysroot are not relocated, so both paths are identical for them.
    """
    src_resolved, rpath, in_sysroot = _get_resolved_location(
        codefile, so, src_exedir, src_selfdir)
    if in_sysroot:
        dst_resolved = src_resolved
    else:
        # Reuse the rpath that matched at the source.  It must be passed as
        # `resolved_rpath` (it used to land in LD_LIBRARY_PATH positionally)
        # and only the path element of the returned tuple is wanted.
        dst_resolved, _, _ = _get_resolved_location(
            codefile, so, dst_exedir, dst_selfdir, resolved_rpath=rpath)
    return src_resolved, dst_resolved, in_sysroot


class machofile(UnixExecutable):
    """Shared-library and rpath information read from a Mach-O file."""

    def __init__(self, file, arch, initial_rpaths_transitive=None):
        self.filename = file.name
        self.shared_libraries = []
        self.dt_runpath = []
        self._dir = os.path.dirname(file.name)
        # Set before any early return so the accessors always work, and
        # copied so the caller's list is never mutated by extend() below.
        self.rpaths_transitive = list(initial_rpaths_transitive or [])
        self.rpaths_nontransitive = []
        results = mach_o_find_dylibs(file, arch)
        if not results:
            return
        _, sos = zip(*results)
        file.seek(0)
        rpath_results = mach_o_find_rpaths(file, arch)
        # Only the first matching image is considered; each entry is the
        # callback result for one load command (None for non-rpath commands).
        rpaths = rpath_results[0][1] if rpath_results else []
        local_rpaths = [self.from_os_varnames(rpath.rstrip('/'))
                        for rpath in rpaths if rpath]
        self.rpaths_transitive.extend(local_rpaths)
        self.rpaths_nontransitive = local_rpaths
        self.shared_libraries.extend(
            [(so, self.from_os_varnames(so)) for so in sos[0] if so])
        file.seek(0)

    def to_os_varnames(self, input_):
        """Don't make these functions - they are methods to match the API for elffiles."""
        return input_.replace('$SELFDIR', '@loader_path')     \
                     .replace('$EXEDIR', '@executable_path')  \
                     .replace('$RPATH', '@rpath')

    def from_os_varnames(self, input_):
        """Don't make these functions - they are methods to match the API for elffiles."""
        return input_.replace('@loader_path', '$SELFDIR')     \
                     .replace('@executable_path', '$EXEDIR')  \
                     .replace('@rpath', '$RPATH')

    def get_resolved_shared_libraries(self, src_exedir, src_selfdir, sysroot=''):
        """Return ``(orig, resolved, rpath, in_sysroot)`` for each dependency."""
        result = []
        for so_orig, so in self.shared_libraries:
            # `sysroot` must go by keyword: the fifth positional parameter
            # of _get_resolved_location is LD_LIBRARY_PATH.
            resolved, rpath, in_sysroot = \
                _get_resolved_location(self, so, src_exedir, src_selfdir,
                                       sysroot=sysroot)
            result.append((so_orig, resolved, rpath, in_sysroot))
        return result

    def get_relocated_shared_libraries(self, src_exedir, src_selfdir,
                                       dst_exedir, dst_selfdir):
        """Return ``(orig, resolved, dst_resolved, in_sysroot)`` per dependency."""
        result = []
        # shared_libraries holds (original name, macro-normalised name) pairs.
        for so_orig, so in self.shared_libraries:
            resolved, dst_resolved, in_sysroot = \
                _get_resolved_relocated_location(self, so, src_exedir, src_selfdir,
                                                 dst_exedir, dst_selfdir)
            result.append((so_orig, resolved, dst_resolved, in_sysroot))
        return result

    def uniqueness_key(self):
        return self.filename


###########################################
# Constants used in the ELF specification #
###########################################

ELF_HDR = 0x7f454c46
E_TYPE_RELOCATABLE = 1
E_TYPE_EXECUTABLE = 2
E_TYPE_SHARED = 3
E_TYPE_CORE = 4
E_MACHINE_UNSPECIFIED = 0x00
E_MACHINE_SPARC = 0x02
E_MACHINE_X86 = 0x03
E_MACHINE_MIPS = 0x08
E_MACHINE_POWERPC = 0x14
E_MACHINE_ARM = 0x28
E_MACHINE_SUPERH = 0x2a
E_MACHINE_IA_64 = 0x32
E_MACHINE_X86_64 = 0x3e
E_MACHINE_AARCH64 = 0xb7
E_MACHINE_RISC_V = 0xf3

# It'd be quicker to use struct.calcsize here and a single
# struct.unpack but it would be ugly and harder to maintain.
PT_NULL = 0
PT_LOAD = 1
PT_DYNAMIC = 2
PT_INTERP = 3
PT_NOTE = 4
PT_SHLIB = 5
PT_PHDR = 6
PT_LOOS = 0x60000000
PT_LOPROC = 0x70000000
PT_HIPROC = 0x7fffffff
PT_GNU_EH_FRAME = (PT_LOOS + 0x474e550)
PT_GNU_STACK = (PT_LOOS + 0x474e551)
PT_GNU_RELRO = (PT_LOOS + 0x474e552)

SHT_PROGBITS = 0x1
SHT_SYMTAB = 0x2
SHT_STRTAB = 0x3
SHT_RELA = 0x4
SHT_HASH = 0x5
SHT_DYNAMIC = 0x6
SHT_NOTE = 0x7
SHT_NOBITS = 0x8
SHT_REL = 0x9
SHT_SHLIB = 0x0A
SHT_DYNSYM = 0x0B
SHT_INIT_ARRAY = 0x0E
SHT_FINI_ARRAY = 0x0F
SHT_PREINIT_ARRAY = 0x10
SHT_GROUP = 0x11
SHT_SYMTAB_SHNDX = 0x12
SHT_NUM = 0x13
SHT_LOOS = 0x60000000

SHF_WRITE = 0x1
SHF_ALLOC = 0x2
SHF_EXECINSTR = 0x4
SHF_MERGE = 0x10
SHF_STRINGS = 0x20
SHF_INFO_LINK = 0x40
SHF_LINK_ORDER = 0x80
SHF_OS_NONCONFORMING = 0x100
SHF_GROUP = 0x200
SHF_TLS = 0x400
SHF_MASKOS = 0x0ff00000
SHF_MASKPROC = 0xf0000000
SHF_ORDERED = 0x4000000
SHF_EXCLUDE = 0x8000000

DT_NULL = 0
DT_NEEDED = 1
DT_PLTRELSZ = 2
DT_PLTGOT = 3
DT_HASH = 4
DT_STRTAB = 5
DT_SYMTAB = 6
DT_RELA = 7
DT_RELASZ = 8
DT_RELAENT = 9
DT_STRSZ = 10
DT_SYMENT = 11
DT_INIT = 12
DT_FINI = 13
DT_SONAME = 14
DT_RPATH = 15
DT_SYMBOLIC = 16
DT_REL = 17
DT_RELSZ = 18
DT_RELENT = 19
DT_PLTREL = 20
DT_DEBUG = 21
DT_TEXTREL = 22
DT_JMPREL = 23
DT_BIND_NOW = 24
DT_INIT_ARRAY = 25
DT_FINI_ARRAY = 26
DT_INIT_ARRAYSZ = 27
DT_FINI_ARRAYSZ = 28
DT_RUNPATH = 29
DT_LOOS = 0x60000000
DT_HIOS = 0x6fffffff
DT_LOPROC = 0x70000000
DT_HIPROC = 0x7fffffff

# Magic numbers (first four bytes read big-endian) that identify Mach-O files.
_MACHO_MAGICS = (FAT_MAGIC, MH_MAGIC, MH_CIGAM, MH_MAGIC_64, MH_CIGAM_64)


class elfheader:
    """
    The ELF file header, read field by field from the current file position.

    If the first four bytes are not the ELF magic, only ``hdr``, ``dt_needed``
    and ``dt_rpath`` are set and nothing further is read.
    """

    def __init__(self, file):
        self.hdr, = struct.unpack(BIG_ENDIAN + 'L', file.read(4))
        self.dt_needed = []
        self.dt_rpath = []
        if self.hdr != ELF_HDR:
            return
        # EI_CLASS: 1 means 32-bit, anything else is treated as 64-bit.
        bitness, = struct.unpack(LITTLE_ENDIAN + 'B', file.read(1))
        bitness = 32 if bitness == 1 else 64
        sz_ptr = bitness // 8
        ptr_type = 'Q' if sz_ptr == 8 else 'L'
        self.bitness = bitness
        self.sz_ptr = sz_ptr
        self.ptr_type = ptr_type
        # EI_DATA: 1 means little-endian, anything else is treated as big.
        endian, = struct.unpack(LITTLE_ENDIAN + 'B', file.read(1))
        endian = LITTLE_ENDIAN if endian == 1 else BIG_ENDIAN
        self.endian = endian
        self.version, = struct.unpack(endian + 'B', file.read(1))
        self.osabi, = struct.unpack(endian + 'B', file.read(1))
        self.abiver, = struct.unpack(endian + 'B', file.read(1))
        # Seven padding bytes complete the 16-byte identification block.
        struct.unpack(endian + 'B' * 7, file.read(7))
        self.type, = struct.unpack(endian + 'H', file.read(2))
        self.machine, = struct.unpack(endian + 'H', file.read(2))
        # Overwrites the one-byte identification version read above.
        self.version, = struct.unpack(endian + 'L', file.read(4))
        self.entry, = struct.unpack(endian + ptr_type, file.read(sz_ptr))
        self.phoff, = struct.unpack(endian + ptr_type, file.read(sz_ptr))
        self.shoff, = struct.unpack(endian + ptr_type, file.read(sz_ptr))
        self.flags, = struct.unpack(endian + 'L', file.read(4))
        self.ehsize, = struct.unpack(endian + 'H', file.read(2))
        self.phentsize, = struct.unpack(endian + 'H', file.read(2))
        self.phnum, = struct.unpack(endian + 'H', file.read(2))
        self.shentsize, = struct.unpack(endian + 'H', file.read(2))
        self.shnum, = struct.unpack(endian + 'H', file.read(2))
        self.shstrndx, = struct.unpack(endian + 'H', file.read(2))
        loc = file.tell()
        if loc != self.ehsize:
            get_logger(__name__).warning(f'file.tell()={loc} != ehsize={self.ehsize}')

    def __str__(self):
        return 'bitness {}, endian {}, version {}, type {}, machine {}, entry {}'.format(  # noqa
            self.bitness, self.endian, self.version, self.type,
            hex(self.machine), hex(self.entry))


class elfsection:
    """One ELF section header, plus (after ``postprocess``) derived data."""

    def __init__(self, eh, file):
        ptr_type = eh.ptr_type
        sz_ptr = eh.sz_ptr
        endian = eh.endian
        # It'd be quicker to use struct.calcsize here and a single
        # struct.unpack but it would be ugly and harder to maintain.
        self.sh_name, = struct.unpack(endian + 'L', file.read(4))
        self.sh_type, = struct.unpack(endian + 'L', file.read(4))
        self.sh_flags, = struct.unpack(endian + ptr_type, file.read(sz_ptr))
        self.sh_addr, = struct.unpack(endian + ptr_type, file.read(sz_ptr))
        self.sh_offset, = struct.unpack(endian + ptr_type, file.read(sz_ptr))
        self.sh_size, = struct.unpack(endian + ptr_type, file.read(sz_ptr))
        self.sh_link, = struct.unpack(endian + 'L', file.read(4))
        self.sh_info, = struct.unpack(endian + 'L', file.read(4))
        self.sh_addralign, = struct.unpack(endian + ptr_type, file.read(sz_ptr))
        self.sh_entsize, = struct.unpack(endian + ptr_type, file.read(sz_ptr))
        # Lower priority == post processed earlier so that those
        # with higher priority can assume already initialized.
        if self.sh_type == SHT_STRTAB:
            self.priority = 0
        else:
            self.priority = 1

    @staticmethod
    def _cstring(table, start):
        """Return the NUL-terminated string starting at ``table[start]``."""
        return table[start:table.index('\0', start)]

    @staticmethod
    def _split_paths(path):
        """Split a colon-separated path list, dropping empty components."""
        return [p.rstrip('/') for p in path.split(':') if p]

    def postprocess(self, elffile, file):
        """
        Load data that needs the whole section table to be available.

        String tables are read into ``self.table``.  For the dynamic section
        the DT_NEEDED, DT_RPATH, DT_RUNPATH and DT_SONAME entries are
        resolved through the string table and stored on ``elffile``.
        """
        ptr_type = elffile.ehdr.ptr_type
        sz_ptr = elffile.ehdr.sz_ptr
        endian = elffile.ehdr.endian
        if self.sh_type == SHT_STRTAB:
            file.seek(self.sh_offset)
            self.table = file.read(self.sh_size).decode()
        elif self.sh_type == SHT_DYNAMIC:
            #
            # Required reading 1:
            # http://blog.qt.io/blog/2011/10/28/rpath-and-runpath/
            #
            # Unless loading object has RUNPATH:
            #   RPATH of the loading object,
            #     then the RPATH of its loader (unless it has a RUNPATH), ...,
            #     until the end of the chain, which is either the executable
            #     or an object loaded by dlopen
            #   Unless executable has RUNPATH:
            #     RPATH of the executable
            # LD_LIBRARY_PATH
            # RUNPATH of the loading object
            # ld.so.cache
            # default dirs
            #
            # Required reading 2:
            # http://www.lumiera.org/documentation/technical/code/linkingStructure.html
            #
            # the $ORIGIN token
            #
            # To support flexible RUNPATH (and RPATH) settings, the GNU ld.so
            # (also the SUN and Irix linkers) allow the usage of some "magic"
            # tokens in the .dynamic section of ELF binaries (both libraries
            # and executables):
            #
            # $ORIGIN
            #
            # the directory containing the executable or library actually
            # triggering the current (innermost) resolution step. Not to be
            # confused with the entity causing the whole linking procedure
            # (an executable to be executed or a dlopen() call)
            #
            # $PLATFORM
            #
            # expands to the architecture/platform tag as provided by the OS
            # kernel
            #
            # $LIB
            #
            # the system libraries directory, which is /lib for the native
            # architecture on FHS compliant GNU/Linux systems.
            #
            dt_strtab_ptr = None
            dt_needed = []
            dt_rpath = []
            dt_runpath = []
            dt_soname = None
            if self.sh_entsize == 0:
                # Some ELF files (e.g., Guile's .go files) include sections
                # without a table of entries in which case sh_entsize will be 0
                num_entries = 0
            else:
                # Integer division: float division loses precision for
                # very large sizes.
                num_entries = self.sh_size // self.sh_entsize
            for m in range(num_entries):
                file.seek(self.sh_offset + (m * self.sh_entsize))
                d_tag, = struct.unpack(endian + ptr_type, file.read(sz_ptr))
                d_val_ptr, = struct.unpack(endian + ptr_type, file.read(sz_ptr))
                if d_tag == DT_NEEDED:
                    dt_needed.append(d_val_ptr)
                elif d_tag == DT_RPATH:
                    dt_rpath.append(d_val_ptr)
                elif d_tag == DT_RUNPATH:
                    dt_runpath.append(d_val_ptr)
                elif d_tag == DT_STRTAB:
                    dt_strtab_ptr = d_val_ptr
                elif d_tag == DT_SONAME:
                    dt_soname = d_val_ptr
            if dt_strtab_ptr:
                strsec, _offset = elffile.find_section_and_offset(dt_strtab_ptr)
                if strsec and strsec.sh_type == SHT_STRTAB:
                    table = strsec.table
                    for n in dt_needed:
                        elffile.dt_needed.append(self._cstring(table, n))
                    for r in dt_rpath:
                        elffile.dt_rpath.extend(
                            self._split_paths(self._cstring(table, r)))
                    for r in dt_runpath:
                        elffile.dt_runpath.extend(
                            self._split_paths(self._cstring(table, r)))
                    if dt_soname is not None:
                        elffile.dt_soname = self._cstring(table, dt_soname)
            # runpath always takes precedence.
            if len(elffile.dt_runpath):
                elffile.dt_rpath = []


class programheader:
    """One ELF program header (field order differs between 32 and 64 bit)."""

    def __init__(self, eh, file):
        ptr_type = eh.ptr_type
        sz_ptr = eh.sz_ptr
        endian = eh.endian
        self.p_type, = struct.unpack(endian + 'L', file.read(4))
        # In 64-bit files p_flags follows p_type ...
        if eh.bitness == 64:
            self.p_flags, = struct.unpack(endian + 'L', file.read(4))
        self.p_offset, = struct.unpack(endian + ptr_type, file.read(sz_ptr))
        self.p_vaddr, = struct.unpack(endian + ptr_type, file.read(sz_ptr))
        self.p_paddr, = struct.unpack(endian + ptr_type, file.read(sz_ptr))
        self.p_filesz, = struct.unpack(endian + ptr_type, file.read(sz_ptr))
        self.p_memsz, = struct.unpack(endian + ptr_type, file.read(sz_ptr))
        # ... whereas in 32-bit files it follows p_memsz.
        if eh.bitness == 32:
            self.p_flags, = struct.unpack(endian + 'L', file.read(4))
        self.p_align, = struct.unpack(endian + ptr_type, file.read(sz_ptr))

    def postprocess(self, elffile, file):
        """Record the interpreter path and PT_LOAD addresses on ``elffile``."""
        if self.p_type == PT_INTERP:
            file.seek(self.p_offset)
            # The stored string is NUL-terminated; drop the terminator.
            elffile.program_interpreter = file.read(self.p_filesz - 1).decode()
        elif self.p_type == PT_LOAD:
            file.seek(self.p_offset)
            if hasattr(elffile, 'ptload_p_vaddr'):
                elffile.ptload_p_vaddr.append(self.p_vaddr)
                elffile.ptload_p_paddr.append(self.p_paddr)
            else:
                elffile.ptload_p_vaddr = [self.p_vaddr]
                elffile.ptload_p_paddr = [self.p_paddr]


class elffile(UnixExecutable):
    """Shared-library, rpath and runpath information read from an ELF file."""

    def __init__(self, file, initial_rpaths_transitive=None):
        initial_rpaths = list(initial_rpaths_transitive or [])
        self.ehdr = elfheader(file)
        self.dt_needed = []
        self.dt_rpath = []
        self.dt_runpath = []
        self.programheaders = []
        self.elfsections = []
        self.program_interpreter = None
        self.dt_soname = '$EXECUTABLE'
        self._dir = os.path.dirname(file.name)

        for n in range(self.ehdr.phnum):
            file.seek(self.ehdr.phoff + (n * self.ehdr.phentsize))
            self.programheaders.append(programheader(self.ehdr, file))
        for n in range(self.ehdr.shnum):
            file.seek(self.ehdr.shoff + (n * self.ehdr.shentsize))
            self.elfsections.append(elfsection(self.ehdr, file))
        # String tables first: the dynamic section needs them loaded.
        self.elfsections.sort(key=lambda x: x.priority)
        for ph in self.programheaders:
            ph.postprocess(self, file)
        for es in self.elfsections:
            es.postprocess(self, file)

        # TODO :: If we have a program_interpreter we need to run it as:
        # TODO :: LD_DEBUG=all self.program_interpreter --inhibit-cache --list file.name
        # TODO :: then process the output line e.g.:
        # TODO :: search path=/usr/lib/tls/x86_64:/usr/lib/tls:/usr/lib/x86_64:/usr/lib (system search path) # noqa
        # TODO :: .. and optionally add a sysroot prefix to each of those. This needs to work
        # TODO :: when run through QEMU also, so in that case,
        # TODO :: we must run os.path.join(sysroot,self.program_interpreter)
        # TODO :: Interesting stuff: https://www.cs.virginia.edu/~dww4s/articles/ld_linux.html
        dt_rpath = [p.rstrip("/") for p in self.dt_rpath]
        dt_runpath = [p.rstrip("/") for p in self.dt_runpath]
        self.rpaths_transitive = [self.from_os_varnames(rpath)
                                  for rpath in (initial_rpaths + dt_rpath)]
        self.rpaths_nontransitive = [self.from_os_varnames(rpath)
                                     for rpath in dt_runpath]
        # Lookup must be avoided when DT_NEEDED contains any '/'s
        self.shared_libraries = [(needed, needed if '/' in needed else '$RPATH/' + needed)
                                 for needed in self.dt_needed]

    def to_os_varnames(self, input):
        """Translate this module's macros back into ld.so tokens."""
        if self.ehdr.sz_ptr == 8:
            libdir = '/lib64'
        else:
            libdir = '/lib'
        return input.replace('$SELFDIR', '$ORIGIN') \
            .replace(libdir, '$LIB')

    def from_os_varnames(self, input):
        """Translate ld.so tokens into this module's macros."""
        if self.ehdr.sz_ptr == 8:
            libdir = '/lib64'
        else:
            libdir = '/lib'
        return input.replace('$ORIGIN', '$SELFDIR') \
            .replace('$LIB', libdir)

    def find_section_and_offset(self, addr):
        'Can be called immediately after the elfsections have been constructed'
        for es in self.elfsections:
            # sections which do not appear in the memory image of the
            # process should be skipped
            if es.sh_addr == 0:
                continue
            if es.sh_addr <= addr < es.sh_addr + es.sh_size:
                return es, addr - es.sh_addr
        return None, None

    def get_resolved_shared_libraries(self, src_exedir, src_selfdir, sysroot=''):
        """Return ``(orig, resolved, rpath, in_sysroot)`` for each dependency."""
        result = []
        default_paths = ['$SYSROOT/lib', '$SYSROOT/usr/lib']
        if self.ehdr.sz_ptr == 8:
            default_paths.extend(['$SYSROOT/lib64', '$SYSROOT/usr/lib64'])
        for so_orig, so in self.shared_libraries:
            resolved, rpath, in_sysroot = \
                _get_resolved_location(self,
                                       so,
                                       src_exedir,
                                       src_selfdir,
                                       LD_LIBRARY_PATH='',
                                       default_paths=default_paths,
                                       sysroot=sysroot)
            result.append((so_orig, resolved, rpath, in_sysroot))
        return result

    def get_dir(self):
        return self._dir

    def uniqueness_key(self):
        return self.dt_soname

    def get_soname(self):
        return self.dt_soname


class inscrutablefile(UnixExecutable):
    """Placeholder for a file whose format was not recognised."""

    def __init__(self, file, initial_rpaths_transitive=None):
        self._dir = None
        # Give the inherited accessors (and inspect_rpaths) something to
        # read instead of raising AttributeError.
        self.rpaths_transitive = []
        self.rpaths_nontransitive = []
        self.shared_libraries = []
        self.dt_runpath = []
        self.dt_soname = None

    def get_rpaths_transitive(self):
        return []

    def get_resolved_shared_libraries(self, *args, **kw):
        return []

    def get_runpaths(self):
        return []

    def get_dir(self):
        return self._dir

    def uniqueness_key(self):
        return 'unknown'


class DLLfile(UnixExecutable):
    """Placeholder for Windows DLLs, which are not inspected."""

    def __init__(self, file, initial_rpaths_transitive=None):
        # Give the inherited accessors (and inspect_rpaths) something to
        # read instead of raising AttributeError.
        self.rpaths_transitive = []
        self.rpaths_nontransitive = []
        self.shared_libraries = []
        self.dt_runpath = []
        self.dt_soname = None

    def get_rpaths_transitive(self):
        return []

    def get_resolved_shared_libraries(self, *args, **kw):
        return []

    def get_runpaths(self):
        return []

    def get_dir(self):
        return None

    def uniqueness_key(self):
        return 'unknown'


class EXEfile(DLLfile):
    """Placeholder for Windows executables, which are not inspected."""

    def __init__(self, file, initial_rpaths_transitive=None):
        # The original called the non-existent `self.super` and had no base
        # class, so it could never be instantiated.
        super().__init__(file, initial_rpaths_transitive)


def codefile(file, arch='any', initial_rpaths_transitive=None):
    """
    Build the code-file object matching the format of the open ``file``.

    ``file`` must be opened in binary mode and expose ``name``.  Unrecognised
    content yields an ``inscrutablefile``.
    """
    initial_rpaths = list(initial_rpaths_transitive or [])
    if file.name.endswith('.dll'):
        return DLLfile(file, initial_rpaths)
    header = file.read(4)
    file.seek(0)
    if len(header) < 4:
        # Too short to carry a magic number.
        return inscrutablefile(file, initial_rpaths)
    magic, = struct.unpack(BIG_ENDIAN + 'L', header)
    if magic in _MACHO_MAGICS:
        return machofile(file, arch, initial_rpaths)
    elif magic == ELF_HDR:
        return elffile(file, initial_rpaths)
    else:
        return inscrutablefile(file, initial_rpaths)


def codefile_class(filename, skip_symlinks=False):
    """
    Return the class that handles ``filename``, or None if it is not code.

    Symlinks yield None when ``skip_symlinks`` is true and are followed
    otherwise.  Directories, missing files, ``.class`` files and files
    shorter than four bytes yield None.
    """
    if os.path.islink(filename):
        if skip_symlinks:
            return None
        else:
            filename = os.path.realpath(filename)
    if os.path.isdir(filename):
        return None
    if filename.endswith(('.dll', '.pyd')):
        return DLLfile
    if filename.endswith('.exe'):
        return EXEfile
    # Java .class files share 0xCAFEBABE with Mach-O FAT_MAGIC.
    if filename.endswith('.class'):
        return None
    if not os.path.exists(filename) or os.path.getsize(filename) < 4:
        return None
    with open(filename, 'rb') as file:
        magic, = struct.unpack(BIG_ENDIAN + 'L', file.read(4))
    if magic in _MACHO_MAGICS:
        return machofile
    elif magic == ELF_HDR:
        return elffile
    return None


def is_codefile(filename, skip_symlinks=True):
    """Return True if ``codefile_class`` recognises ``filename``."""
    klass = codefile_class(filename, skip_symlinks=skip_symlinks)
    return bool(klass)


def codefile_type(filename, skip_symlinks=True):
    "Returns None, or the name of the class codefile_class picks (e.g. 'machofile', 'elffile')"
    klass = codefile_class(filename, skip_symlinks=skip_symlinks)
    if not klass:
        return None
    return klass.__name__


def _trim_sysroot(sysroot):
    """Strip trailing '/' and '\\' characters; falsy values pass through."""
    if sysroot:
        sysroot = sysroot.rstrip('/\\')
    return sysroot


def _get_arch_if_native(arch):
    """Replace the placeholder 'native' with the running machine's arch."""
    if arch == 'native':
        if sys.platform == 'win32':
            arch = 'x86_64' if sys.maxsize > 2**32 else 'i686'
        else:
            arch = os.uname()[4]
    return arch


# TODO :: Consider memoizing instead of repeatedly scanning
# TODO :: libc.so/libSystem.dylib when inspect_linkages(recurse=True)
def _inspect_linkages_this(filename, sysroot='', arch='native'):
    '''
    Inspect the direct dependencies of a single file.

    :param filename: path of the file to inspect
    :param sysroot: sysroot prefix substituted into the default search paths
    :param arch: architecture filter, or 'native' for the running machine
    :return: tuple of (uniqueness key, original names, resolved names); the
             key is None when the file is missing or truncated
    '''

    if not os.path.exists(filename):
        return None, [], []
    sysroot = _trim_sysroot(sysroot)
    arch = _get_arch_if_native(arch)
    with open(filename, 'rb') as f:
        # TODO :: Problems here:
        # TODO :: 1. macOS can modify RPATH for children in each .so
        # TODO :: 2. Linux can identify the program interpreter which can change the default_paths
        try:
            cf = codefile(ReadCheckWrapper(f), arch)
        except IncompleteRead:
            # the file was incomplete, can occur if a package ships a test file
            # which looks like an ELF file but is not.  Orange3 does this.
            get_logger(__name__).warning(f'problems inspecting linkages for {filename}')
            return None, [], []
        dirname = os.path.dirname(filename)
        results = cf.get_resolved_shared_libraries(dirname, dirname, sysroot)
        if not results:
            return cf.uniqueness_key(), [], []
        orig_names, resolved_names, _, _in_sysroot = map(list, zip(*results))
        return cf.uniqueness_key(), orig_names, resolved_names


def inspect_rpaths(filename, resolve_dirnames=True, use_os_varnames=True,
                   sysroot='', arch='native'):
    """
    Return the non-transitive rpaths (runpaths) recorded in ``filename``.

    With ``resolve_dirnames`` the macros are expanded relative to the file's
    directory; otherwise they are returned raw, translated to the platform's
    own tokens when ``use_os_varnames`` is true.

    NOTE(review): a missing file returns the tuple ``([], [])`` while every
    other path returns a single list; kept as-is for compatibility.
    """
    if not os.path.exists(filename):
        return [], []
    sysroot = _trim_sysroot(sysroot)
    arch = _get_arch_if_native(arch)
    with open(filename, 'rb') as f:
        # TODO :: Problems here:
        # TODO :: 1. macOS can modify RPATH for children in each .so
        # TODO :: 2. Linux can identify the program interpreter which can change the initial RPATHs
        # TODO :: Should '/lib', '/usr/lib' not include (or be?!) `sysroot`(s) instead?
        cf = codefile(f, arch, ['/lib', '/usr/lib'])
        if resolve_dirnames:
            dirname = os.path.dirname(filename)
            # `sysroot` must go by keyword: the fifth positional parameter
            # of _get_resolved_location is LD_LIBRARY_PATH.
            return [_get_resolved_location(cf, rpath, dirname, dirname,
                                           sysroot=sysroot)[0]
                    for rpath in cf.rpaths_nontransitive]
        else:
            if use_os_varnames:
                return [cf.to_os_varnames(rpath)
                        for rpath in cf.rpaths_nontransitive]
            else:
                return cf.rpaths_nontransitive


def get_runpaths(filename, arch='native'):
    """Return the runpaths of ``filename`` ([] if it does not exist)."""
    if not os.path.exists(filename):
        return []
    arch = _get_arch_if_native(arch)
    with open(filename, 'rb') as f:
        cf = codefile(f, arch, ['/lib', '/usr/lib'])
        return cf.get_runpaths()


# TODO :: Consider returning a tree structure or a dict when recurse is True?
def inspect_linkages(filename, resolve_filenames=True, recurse=True,
                     sysroot='', arch='native'):
    """Collect the shared libraries that *filename* links against.

    Each visited file is handed to ``_inspect_linkages_this``. Its results are
    recorded unless a file with the same uniqueness key was already recorded.

    :param filename: path of the first file to inspect.
    :param resolve_filenames: when true, every record also carries a
        ``'resolved'`` entry holding the normalised resolved path.
    :param recurse: when true, the resolved libraries of every newly recorded
        file are inspected as well.
    :param sysroot: forwarded unchanged to ``_inspect_linkages_this``.
    :param arch: forwarded unchanged to ``_inspect_linkages_this``.
    :return: dict mapping each original library name to its record dict
        (``{'orig': ...}`` or ``{'orig': ..., 'resolved': ...}``).
    """
    results = {}
    seen_keys = set()
    visited = set()
    # Worklist of files still to inspect; every path is processed at most once.
    pending = {filename}
    while pending:
        current = pending.pop()
        visited.add(current)
        key, orig_names, resolved_names = _inspect_linkages_this(
            current, sysroot=sysroot, arch=arch)
        if key in seen_keys:
            # A file with this uniqueness key has already been recorded.
            continue
        seen_keys.add(key)
        for orig, resolved in zip(orig_names, resolved_names):
            if resolve_filenames:
                results[orig] = {'orig': orig,
                                 'resolved': os.path.normpath(resolved)}
            else:
                results[orig] = {'orig': orig}
        if recurse:
            pending.update(name for name in resolved_names
                           if name not in visited)
    return results


def inspect_linkages_otool(filename, arch='native'):
    """Return the libraries listed by the system ``otool -L`` for *filename*.

    :param filename: path of the file to inspect.
    :param arch: architecture passed to ``otool -arch``; ``'native'`` uses the
        machine field of ``os.uname()``.
    :return: list of the names found on tab-indented lines that are followed
        by ``" (compatibility"``.
    :raises subprocess.CalledProcessError: if ``otool`` exits non-zero.
    """
    from subprocess import check_output
    arch_name = os.uname().machine if arch == 'native' else arch
    cmd = ['/usr/bin/otool', '-arch', arch_name, '-L', filename]
    # Install names are filesystem paths and may hold non-ASCII bytes; a strict
    # ASCII decode would raise UnicodeDecodeError on them.
    output = os.fsdecode(check_output(cmd))
    return re.findall(r'^\t(.*) \(compatibility', output, re.MULTILINE)


# TODO :: Consider allowing QEMU/binfmt_misc to run foreign binaries + passing a sysroot here?
def inspect_linkages_ldd(filename):
    """Return the resolved library paths reported by the system ``ldd``.

    Runs ``/usr/bin/ldd`` on *filename* and extracts the right-hand side of
    every tab-indented ``name => path (0xaddress)`` line, skipping entries
    that start with ``linux-gate.so.1``. The exit status of ``ldd`` is not
    checked and its stderr is captured and discarded.

    :param filename: path of the file to inspect.
    :return: list of resolved paths as strings.
    """
    from subprocess import PIPE, Popen
    process = Popen(['/usr/bin/ldd', filename], stdout=PIPE, stderr=PIPE)
    stdout, _ = process.communicate()
    # Library paths are filesystem bytes and may be non-ASCII; a strict ASCII
    # decode would raise UnicodeDecodeError on them.
    output = os.fsdecode(stdout)
    return re.findall(r'^\t(?!linux-gate\.so\.1.*$)[^ ]+ => (.*) \([0-9a-fx]+\)',
                      output, re.MULTILINE)


def otool(*args):
    """Emulate a small subset of the ``otool`` command line.

    Recognises ``-h``/``--help``, ``-arch ARCH`` and ``-L FILE``.

    :param args: command-line arguments, without the program name.
    :return: 0 after printing the usage text or the (non-recursive) linkages
        of the ``-L`` file; 1 when neither was requested.
    """
    parser = argparse.ArgumentParser(prog='otool', add_help=False)
    parser.add_argument("-h", "--help", action='store_true')
    parser.add_argument("-arch", dest='arch_type', help="arch_type", default='native')
    parser.add_argument("-L", dest='filename', help="print shared libraries used")
    opts = parser.parse_args(args)
    if opts.help:
        print(OTOOL_USAGE)
        return 0
    if opts.filename:
        shared_libs = inspect_linkages(opts.filename, resolve_filenames=False,
                                       recurse=False, arch=opts.arch_type)
        print("Shared libs used (non-recursively) by {} are:\n{}".format(opts.filename,
                                                                         shared_libs))
        return 0
    return 1


def otool_sys(*args):
    """Run the system ``/usr/bin/otool`` with *args* and return its stdout.

    :param args: arguments passed through to ``otool``.
    :return: decoded standard output.
    :raises subprocess.CalledProcessError: if ``otool`` exits non-zero.
    """
    import subprocess
    # The command must be ONE sequence: passing the argument tuple as a second
    # positional parameter would land in Popen's ``bufsize`` slot and raise
    # TypeError.
    output = subprocess.check_output(['/usr/bin/otool', *args])
    return os.fsdecode(output)


def ldd_sys(*args):
    """Placeholder for running the system ``ldd``; always returns ``[]``."""
    result = []
    return result


def ldd(*args):
    """Emulate a small subset of the ``ldd`` command line.

    Recognises ``-h``/``--help`` and a single positional file name.

    :param args: command-line arguments, without the program name.
    :return: 0 after printing the usage text or the (recursive) linkages of
        the file; 1 when the file name is empty.
    :raises SystemExit: via ``argparse`` when the arguments are invalid or no
        file name is given (and help was not requested).
    """
    parser = argparse.ArgumentParser(prog='ldd', add_help=False)
    parser.add_argument("-h", "--help", action='store_true')
    # Optional at parse time so that a bare ``--help`` is accepted; a missing
    # file name is still rejected below with argparse's usual error.
    parser.add_argument("filename", nargs='?')
    opts = parser.parse_args(args)
    if opts.help:
        print(LDD_USAGE)
        return 0
    if opts.filename is None:
        parser.error('the following arguments are required: filename')
    if opts.filename:
        shared_libs = inspect_linkages(opts.filename, resolve_filenames=False,
                                       recurse=True)
        print("Shared libs used (recursively) by {} are:\n{}".format(opts.filename,
                                                                     shared_libs))
        return 0
    return 1


def main(argv):
    """Dispatch to ``ldd`` or ``otool`` based on the first entries of *argv*.

    Of the first two entries, the later one is examined first. An entry whose
    name looks like ``ldd``/``otool`` (optionally with ``.exe``/``.py``)
    selects that tool. Otherwise, if the entry is an existing file, the tool
    is chosen from the class returned by ``codefile_class``.

    :param argv: argument vector, e.g. ``sys.argv``.
    :return: the selected tool's return value, or 1 when nothing matched.
    """
    for idx, progname in enumerate(argv[0:2][::-1]):
        if re.match(r'.*ldd(?:$|\.exe|\.py)', progname):
            return ldd(*argv[2 - idx:])
        if re.match(r'.*otool(?:$|\.exe|\.py)', progname):
            return otool(*argv[2 - idx:])
        if os.path.isfile(progname):
            klass = codefile_class(progname)
            if not klass:
                return 1
            if klass == elffile:
                return ldd(*argv[1 - idx:])
            if klass == machofile:
                return otool('-L', *argv[1 - idx:])
    return 1


def main_maybe_test():
    """Entry point: run the self-test when invoked as ``test``, else ``main``.

    ``test [otool|ldd] [--sysroot=DIR]`` compares ``inspect_linkages`` against
    the system tool (when running on the matching platform) for every library
    matched by ``<sysroot>/usr/lib/*.<ext>``.

    :return: ``None`` after the self-test, otherwise the result of
        ``main(sys.argv)``.
    :raises AssertionError: when the self-test finds a mismatch.
    """
    # Guard the length first: running with no arguments must reach main()
    # instead of failing with IndexError.
    if len(sys.argv) < 2 or sys.argv[1] != 'test':
        return main(sys.argv)

    import functools
    tool = sys.argv[2] if len(sys.argv) > 2 else None
    if tool not in ('otool', 'ldd'):
        tool = 'otool' if sys.platform == 'darwin' else 'ldd'
    test_that = None
    sysroot_matches = [match
                       for match in (re.match('--sysroot=([^ ]+)', arg)
                                     for arg in sys.argv)
                       if match]
    if sysroot_matches:
        # The last --sysroot= on the command line wins.
        sysroot, = sysroot_matches[-1].groups(1)
        sysroot = os.path.expanduser(sysroot)
    else:
        sysroot = ''
    if tool == 'otool':
        test_this = functools.partial(inspect_linkages, sysroot=sysroot,
                                      resolve_filenames=False, recurse=False)
        if sys.platform == 'darwin':
            test_that = functools.partial(inspect_linkages_otool)
        soext = 'dylib'
    else:
        test_this = functools.partial(inspect_linkages, sysroot=sysroot,
                                      resolve_filenames=True, recurse=True)
        if sys.platform.startswith('linux'):
            test_that = functools.partial(inspect_linkages_ldd)
        soext = 'so'
    # Find a load of dylibs or elfs and compare
    # the output against 'otool -L' or 'ldd'
    candidates = glob.glob(sysroot + '/usr/lib/*.' + soext)
    # Sometimes files do not exist:
    # (/usr/lib/libgutenprint.2.dylib -> libgutenprint.2.0.3.dylib)
    # os.path.exists() follows the link relative to the link's own directory,
    # so only genuinely dangling symlinks are dropped.
    candidates = [path for path in candidates if os.path.exists(path)]
    for path in candidates:
        print(f'\nchecking {path}')
        this = test_this(path)
        that = test_that(path) if test_that else this
        print('\n'.join(this))
        # Raised explicitly so the comparison still runs under ``python -O``.
        if set(this) != set(that):
            raise AssertionError(
                "py-ldd result incorrect for {}, this:\n{}\nvs that:\n{}".
                format(path, set(this), set(that)))
    return None


if __name__ == '__main__':
    sys.exit(main_maybe_test())