diff options
| author | 2020-08-03 15:06:44 +0000 | |
|---|---|---|
| committer | 2020-08-03 15:06:44 +0000 | |
| commit | b64793999546ed8adebaeebd9d8345d18db8927d (patch) | |
| tree | 4357c27b561d73b0e089727c6ed659f2ceff5f47 /gnu/llvm/utils/extract_symbols.py | |
| parent | Add support for UTF-8 DISPLAY-HINTs with octet length. For now only (diff) | |
| download | wireguard-openbsd-b64793999546ed8adebaeebd9d8345d18db8927d.tar.xz wireguard-openbsd-b64793999546ed8adebaeebd9d8345d18db8927d.zip | |
Remove LLVM 8.0.1 files.
Diffstat (limited to 'gnu/llvm/utils/extract_symbols.py')
| -rwxr-xr-x | gnu/llvm/utils/extract_symbols.py | 504 |
1 files changed, 0 insertions, 504 deletions
diff --git a/gnu/llvm/utils/extract_symbols.py b/gnu/llvm/utils/extract_symbols.py deleted file mode 100755 index 93ad2e9c375..00000000000 --- a/gnu/llvm/utils/extract_symbols.py +++ /dev/null @@ -1,504 +0,0 @@ -#!/usr/bin/env python - -"""A tool for extracting a list of symbols to export - -When exporting symbols from a dll or exe we either need to mark the symbols in -the source code as __declspec(dllexport) or supply a list of symbols to the -linker. This program automates the latter by inspecting the symbol tables of a -list of link inputs and deciding which of those symbols need to be exported. - -We can't just export all the defined symbols, as there's a limit of 65535 -exported symbols and in clang we go way over that, particularly in a debug -build. Therefore a large part of the work is pruning symbols either which can't -be imported, or which we think are things that have definitions in public header -files (i.e. template instantiations) and we would get defined in the thing -importing these symbols anyway. -""" - -from __future__ import print_function -import sys -import re -import os -import subprocess -import multiprocessing -import argparse - -# Define functions which extract a list of symbols from a library using several -# different tools. We use subprocess.Popen and yield a symbol at a time instead -# of using subprocess.check_output and returning a list as, especially on -# Windows, waiting for the entire output to be ready can take a significant -# amount of time. 
# Patterns used to pick symbols and target information out of tool output.
# They are raw strings (so that '\s', '\S', '\?' etc. reach the regex engine
# without relying on Python passing unknown escapes through) and are compiled
# once, as the symbol loops below run over every line a tool prints.
_DUMPBIN_SYMBOL_RE = re.compile(r"^.+SECT.+External\s+\|\s+(\S+).*$")
_NM_SYMBOL_RE = re.compile(r"^\S+\s+[BDGRSTVW]\s+(\S+)$")
_READOBJ_NAME_RE = re.compile(r'Name: (\S+)')
_READOBJ_SECTION_RE = re.compile(r'Section: (\S+)')
_READOBJ_STORAGECLASS_RE = re.compile(r'StorageClass: (\S+)')
_DUMPBIN_MACHINE_RE = re.compile(r'.+machine \((\S+)\)')
_OBJDUMP_FORMAT_RE = re.compile(r'.+file format (\S+)')
_READOBJ_FORMAT_RE = re.compile(r'Format: (\S+)')


def _spawn_tool(command):
    """Start `command` with a line-buffered text stdout pipe.

    stdin is a pipe that is closed straight away so that the tool can never
    block waiting for input.  Raises OSError if the program does not exist.
    """
    process = subprocess.Popen(command, bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    return process


def _tool_output_lines(command):
    """Yield the lines `command` writes to stdout, one at a time.

    The child is always reaped, even when the consumer stops iterating early
    or an exception propagates through the generator.
    """
    process = _spawn_tool(command)
    try:
        for line in process.stdout:
            yield line
    finally:
        process.stdout.close()
        process.wait()


def dumpbin_get_symbols(lib):
    """Yield the defined external symbols of `lib` using 'dumpbin /symbols'."""
    for line in _tool_output_lines(['dumpbin', '/symbols', lib]):
        # Look for external symbols that are defined in some section
        match = _DUMPBIN_SYMBOL_RE.match(line)
        if match:
            yield match.group(1)


def nm_get_symbols(lib):
    """Yield the defined external symbols of `lib` using 'nm'."""
    for line in _tool_output_lines(['nm', lib]):
        # Look for external symbols that are defined in some section
        match = _NM_SYMBOL_RE.match(line)
        if match:
            yield match.group(1)


def readobj_get_symbols(lib):
    """Yield the defined external symbols of `lib` using 'llvm-readobj'."""
    # When looking through the output of llvm-readobj we expect to see Name,
    # Section, then StorageClass, so record Name and Section when we see them
    # and decide if this is a defined external symbol when we see StorageClass.
    name = None
    section = None
    for line in _tool_output_lines(['llvm-readobj', '-symbols', lib]):
        match = _READOBJ_NAME_RE.search(line)
        if match:
            name = match.group(1)
        match = _READOBJ_SECTION_RE.search(line)
        if match:
            section = match.group(1)
        match = _READOBJ_STORAGECLASS_RE.search(line)
        if not match:
            continue
        # A StorageClass seen before any Name/Section cannot describe a
        # symbol, so skip it rather than failing on an unbound variable.
        if name is None or section is None:
            continue
        if (section not in ('IMAGE_SYM_ABSOLUTE', 'IMAGE_SYM_UNDEFINED')
                and match.group(1) == 'External'):
            yield name


# Define functions which determine if the target is 32-bit Windows (as that's
# where calling convention name decoration happens).

def dumpbin_is_32bit_windows(lib):
    """Return True if 'dumpbin /headers' reports the machine of `lib` as x86."""
    # dumpbin /headers can output a huge amount of data (>100MB in a debug
    # build) so we read only up to the 'machine' line then close the output.
    process = _spawn_tool(['dumpbin', '/headers', lib])
    retval = False
    try:
        for line in process.stdout:
            match = _DUMPBIN_MACHINE_RE.match(line)
            if match:
                retval = (match.group(1) == 'x86')
                break
    finally:
        process.stdout.close()
        process.wait()
    return retval


def objdump_is_32bit_windows(lib):
    """Return True if 'objdump -f' reports the format of `lib` as pe-i386."""
    output = subprocess.check_output(['objdump', '-f', lib],
                                     universal_newlines=True)
    # check_output returns one string; iterating it directly would walk it a
    # character at a time, so the pattern could never match.
    for line in output.splitlines():
        match = _OBJDUMP_FORMAT_RE.match(line)
        if match:
            return match.group(1) == 'pe-i386'
    return False


def readobj_is_32bit_windows(lib):
    """Return True if llvm-readobj reports the format of `lib` as COFF-i386."""
    output = subprocess.check_output(['llvm-readobj', '-file-headers', lib],
                                     universal_newlines=True)
    # As above: split into lines, a str iterates by character.
    for line in output.splitlines():
        match = _READOBJ_FORMAT_RE.match(line)
        if match:
            return match.group(1) == 'COFF-i386'
    return False


# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
# identifier/type mangling we can decide which symbols could possibly be
# required and which we can discard.
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Decide whether a symbol from an MSVC-mangled library should be exported.

    Returns the name to export (with any calling convention decoration
    removed from unmangled names when `calling_convention_decoration` is
    true), or None if the symbol should be discarded.
    """
    # Keep unmangled (i.e. extern "C") names
    if '?' not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names
            match = re.match(r'[_@]([^@]+)', symbol)
            if match:
                return match.group(1)
        return symbol
    # Function template instantiations start with ?$; keep the instantiations
    # of clang::Type::getAs, as some of them are explicit specializations that
    # are defined in clang's lib/AST/Type.cpp; discard the rest as it's assumed
    # that the definition is public
    if re.match(r'\?\?\$getAs@.+@Type@clang@@', symbol):
        return symbol
    if symbol.startswith('??$'):
        return None
    # Deleting destructors start with ?_G or ?_E and can be discarded because
    # link.exe gives you a warning telling you they can't be exported if you
    # don't
    if symbol.startswith(('??_G', '??_E')):
        return None
    # Constructors (?0) and destructors (?1) of templates (?$) are assumed to
    # be defined in headers and not required to be kept
    if symbol.startswith(('??0?$', '??1?$')):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    if re.search(r'\?A(0x\w+)?@', symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is
    # a bit of a mess and imprecise, but that avoids having to completely
    # demangle the symbol name. The outermost namespace is at the end of the
    # identifier mangling, and the identifier mangling is followed by the type
    # mangling, so we look for (llvm|clang)@@ followed by something that looks
    # like a function type mangling. To spot a function type we use (this is
    # derived from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X   (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    if re.search(r'(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$', symbol):
        return symbol
    return None


# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
# demangle the identifier mangling to identify symbols that can be safely
# discarded.
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Decide whether a symbol from an Itanium-mangled library should be exported.

    Returns the name to export (with the leading '_' calling convention
    decoration removed when `calling_convention_decoration` is true), or None
    if the symbol should be discarded.
    """
    # Start by removing any calling convention decoration (which we expect to
    # see on all symbols, even mangled C++ symbols)
    if calling_convention_decoration and symbol.startswith('_'):
        symbol = symbol[1:]
    # Keep unmangled names
    if not symbol.startswith(('_', '.')):
        return symbol
    # Discard manglings that aren't nested names
    match = re.match(r'_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    # Demangle the name. If the name is too complex then we don't need to keep
    # it, but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        return None
    if not names:
        return symbol
    # Constructors and destructors of template classes are assumed to be
    # defined in headers and not required to be kept. The length check guards
    # against a nested name with a single component, which has no enclosing
    # class to inspect.
    if (re.match(r'[CD][123]', names[-1][0]) and len(names) > 1
            and names[-2][1]):
        return None
    # Keep the instantiations of clang::Type::getAs, as some of them are
    # explicit specializations that are defined in clang's lib/AST/Type.cpp;
    # discard any other function template instantiations as it's assumed that
    # the definition is public
    if symbol.startswith('_ZNK5clang4Type5getAs'):
        return symbol
    if names[-1][1]:
        return None
    # Keep llvm:: and clang:: names
    if names[0][0] in ('4llvm', '5clang'):
        return symbol
    # Discard everything else
    return None


# Certain kinds of complex manglings we assume cannot be part of a public
# interface, and we handle them by raising an exception.
class TooComplexName(Exception):
    """Raised for manglings assumed too complex to be in a public interface."""


# Parse an itanium mangled name from the start of a string and return a
# (name, rest of string) pair.
def parse_itanium_name(arg):
    """Split one mangled name component off the front of `arg`.

    Returns (name, rest); name is None when no component can be recognised, in
    which case rest is `arg` unchanged.
    """
    # Check for a normal name: a decimal length followed by that many
    # characters. The returned name keeps its length prefix.
    match = re.match(r'(\d+)(.+)', arg)
    if match:
        n = int(match.group(1))
        name = match.group(1) + match.group(2)[:n]
        rest = match.group(2)[n:]
        return name, rest
    # Check for constructor/destructor names
    match = re.match(r'([CD][123])(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match(r'([^E]+)(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg


# Parse an itanium mangled template argument list from the start of a string
# and throw it away, returning the rest of the string.
# File name suffixes recognised as link inputs (libraries and objects).
_LINK_INPUT_SUFFIXES = ('.lib', '.a', '.obj', '.o')

# Maximum time, in seconds, to wait for the worker pool to process all inputs.
_POOL_TIMEOUT_SECONDS = 3600

# Member functions of template classes instantiated this many times or more
# are assumed to have public definitions. (100 is an arbitrary guess here.)
_TEMPLATE_INSTANCE_LIMIT = 100


def skip_itanium_template(arg):
    """Skip an itanium mangled template argument list at the start of `arg`.

    Returns the text following the closing 'E', or None if the end of the
    string is reached without finding it. Raises TooComplexName when the
    argument list contains an expression.
    """
    # A template argument list starts with I
    assert arg.startswith('I'), arg
    tmp = arg[1:]
    while tmp:
        # Check for names
        match = re.match(r'(\d+)(.+)', tmp)
        if match:
            n = int(match.group(1))
            tmp = match.group(2)[n:]
            continue
        # Check for substitutions
        match = re.match(r'S[A-Z0-9]*_(.+)', tmp)
        if match:
            tmp = match.group(1)
        # Start of a template
        elif tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
        # Start of a nested name
        elif tmp.startswith('N'):
            _, tmp = parse_itanium_nested_name(tmp)
        # Start of an expression: assume that it's too complicated
        elif tmp.startswith(('L', 'X')):
            raise TooComplexName
        # End of the template
        elif tmp.startswith('E'):
            return tmp[1:]
        # Something else: probably a type, skip it
        else:
            tmp = tmp[1:]
    return None


# Parse an itanium mangled nested name and transform it into a list of pairs of
# (name, is_template), returning (list, rest of string).
def parse_itanium_nested_name(arg):
    """Parse the itanium mangled nested name at the start of `arg`.

    Returns (names, rest) where names is a list of (name, is_template) pairs,
    outermost scope first, or (None, None) if the name cannot be demangled.
    """
    # A nested name starts with N
    assert arg.startswith('N'), arg
    ret = []

    # Skip past the N, and possibly a substitution
    match = re.match(r'NS[A-Z0-9]*_(.+)', arg)
    tmp = match.group(1) if match else arg[1:]

    # Skip past CV-qualifiers and ref qualifiers
    match = re.match(r'[rVKRO]*(.+)', tmp)
    if match:
        tmp = match.group(1)

    # Repeatedly parse names from the string until we reach the end of the
    # nested name
    while tmp:
        # An E ends the nested name
        if tmp.startswith('E'):
            return ret, tmp[1:]
        # Parse a name
        name_part, tmp = parse_itanium_name(tmp)
        if not name_part:
            # If we failed then we don't know how to demangle this
            return None, None
        is_template = False
        # If this name is a template record that, then skip the template
        # arguments
        if tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
            is_template = True
        # Add the name to the list
        ret.append((name_part, is_template))

    # If we get here then something went wrong
    return None, None


def extract_symbols(arg):
    """Count the symbols worth keeping in one link input.

    `arg` is a (get_symbols, should_keep_symbol,
    calling_convention_decoration, lib) tuple; everything is packed into one
    argument because the pool mapping function receives exactly one. Returns a
    dict mapping each kept symbol name to the number of times it was seen.
    """
    get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
    symbols = dict()
    for raw_symbol in get_symbols(lib):
        kept = should_keep_symbol(raw_symbol, calling_convention_decoration)
        if kept:
            symbols[kept] = symbols.get(kept, 0) + 1
    return symbols


def _tool_exists(exe):
    """Return True if the program `exe` can be started."""
    try:
        # Close std streams as we don't want any output and we don't want the
        # process to wait for something on stdin.
        p = subprocess.Popen([exe], stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             stdin=subprocess.PIPE,
                             universal_newlines=True)
        p.stdout.close()
        p.stderr.close()
        p.stdin.close()
        p.wait()
    except OSError:
        # Popen raises OSError when the program does not exist
        return False
    return True


def _select_tools(tool_exes):
    """Pick the (get_symbols, is_32bit_windows) functions to use.

    Each program in `tool_exes` is tried in turn; the first available one that
    provides each function wins. Either element is None if no available
    program provides it.
    """
    tools = {'dumpbin': (dumpbin_get_symbols, dumpbin_is_32bit_windows),
             'nm': (nm_get_symbols, None),
             'objdump': (None, objdump_is_32bit_windows),
             'llvm-readobj': (readobj_get_symbols, readobj_is_32bit_windows)}
    get_symbols = None
    is_32bit_windows = None
    for exe in tool_exes:
        if not _tool_exists(exe):
            continue
        # Keep going until we have a tool to use for both get_symbols and
        # is_32bit_windows
        if not get_symbols:
            get_symbols = tools[exe][0]
        if not is_32bit_windows:
            is_32bit_windows = tools[exe][1]
        if get_symbols and is_32bit_windows:
            break
    return get_symbols, is_32bit_windows


def _resolve_lib(lib):
    """Turn a command line argument into a link input file name.

    When invoked by cmake the arguments are the cmake target names of the
    libraries, so we need to add .lib/.a to the end and maybe lib to the start
    to get the filename. Objects are allowed too. Returns None if no matching
    file can be found.
    """
    if lib.endswith(_LINK_INPUT_SUFFIXES):
        return lib
    for s in _LINK_INPUT_SUFFIXES:
        if os.path.exists(lib + s):
            return lib + s
        if os.path.exists('lib' + lib + s):
            return 'lib' + lib + s
    return None


def _template_member_key(symbol, mangling):
    """Return the function+class key if `symbol` is a member function of a
    template class, otherwise None."""
    if mangling == 'microsoft':
        # Member functions of templates start with
        # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
        # As manglings go from the innermost scope to the outermost scope
        # this means:
        #  * When we have a function member of a subclass of a template
        #    class then <fn_name> will actually contain the mangling of
        #    both the subclass and the function member. This is fine.
        #  * When we have a function member of a template subclass of a
        #    (possibly template) class then it's the innermost template
        #    subclass that becomes <class_name>. This should be OK so long
        #    as we don't have multiple classes with a template subclass of
        #    the same name.
        match = re.search(r"^\?(\??\w+\@\?\$\w+)\@", symbol)
        return match.group(1) if match else None
    # Find member functions of templates by demangling the name and checking
    # if the second-to-last name in the list is a template.
    match = re.match(r'_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        # Manglings that are too complex should already have been filtered
        # out, but if we happen to somehow see one here just leave it as-is.
        return None
    # A single-component name has no enclosing class, so it can't be a member
    # of a template class (and names[-2] would be out of range).
    if names and len(names) > 1 and names[-2][1]:
        return ''.join([x for x, _ in names])
    return None


def main():
    """Command line entry point: print the symbols to export from the inputs."""
    tool_exes = ['dumpbin', 'nm', 'objdump', 'llvm-readobj']
    parser = argparse.ArgumentParser(
        description='Extract symbols to export from libraries')
    parser.add_argument('--mangling', choices=['itanium', 'microsoft'],
                        required=True, help='expected symbol mangling scheme')
    parser.add_argument('--tools', choices=tool_exes, nargs='*',
                        help='tools to use to extract symbols and determine the'
                        ' target')
    parser.add_argument('libs', metavar='lib', type=str, nargs='+',
                        help='libraries to extract symbols from')
    parser.add_argument('-o', metavar='file', type=str, help='output to file')
    args = parser.parse_args()

    # Determine the function to use to get the list of symbols from the inputs,
    # and the function to use to determine if the target is 32-bit windows.
    # If we have a tools argument then use that for the list of tools to check.
    if args.tools:
        tool_exes = args.tools
    get_symbols, is_32bit_windows = _select_tools(tool_exes)
    if not get_symbols:
        print("Couldn't find a program to read symbols with", file=sys.stderr)
        sys.exit(1)
    if not is_32bit_windows:
        print("Couldn't find a program to determining the target", file=sys.stderr)
        sys.exit(1)

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == 'microsoft':
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    libs = []
    for arg in args.libs:
        lib = _resolve_lib(arg)
        if lib is None:
            print("Don't know what to do with argument "+arg, file=sys.stderr)
            sys.exit(1)
        libs.append(lib)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = is_32bit_windows(libs[0])

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    pool = multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [(get_symbols, should_keep_symbol,
                 calling_convention_decoration, x) for x in libs]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(_POOL_TIMEOUT_SECONDS)
        # All results are in, so this just reaps the idle workers
        pool.join()
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbols = dict()
    for this_lib_symbols in libs_symbols:
        for k, v in this_lib_symbols.items():
            symbols[k] = symbols.get(k, 0) + v

    # Count instances of member functions of template classes, and map the
    # symbol name to the function+class. We do this under the assumption that if
    # a member function of a template class is instantiated many times it's
    # probably declared in a public header file. Symbols that aren't such
    # members all map to "", whose count stays at zero.
    template_function_count = {"": 0}
    template_function_mapping = dict()
    for k in symbols:
        name = _template_member_key(k, args.mangling)
        if name:
            template_function_count[name] = \
                template_function_count.get(name, 0) + 1
            template_function_mapping[k] = name
        else:
            template_function_mapping[k] = ""

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Aren't instances of member functions of templates which have been
    #    instantiated _TEMPLATE_INSTANCE_LIMIT times or more, which are
    #    assumed to have public definitions.
    outfile = open(args.o, 'w') if args.o else sys.stdout
    try:
        for k, v in symbols.items():
            template_count = template_function_count[template_function_mapping[k]]
            if v == 1 and template_count < _TEMPLATE_INSTANCE_LIMIT:
                print(k, file=outfile)
    finally:
        # Make sure the output file is flushed and closed, but leave stdout
        # open for the interpreter.
        if outfile is not sys.stdout:
            outfile.close()


if __name__ == '__main__':
    main()
