tools/strip_asm.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163

#!/usr/bin/env python3

"""
strip_asm.py - Clean up ASM output for the specified file
"""

import os
import re
import sys
from argparse import ArgumentParser


def find_used_labels(asm):
    """
    Collect the local labels that appear as the target of a jump.

    Every line of `asm` is matched from its start against: optional
    whitespace, a mnemonic beginning with "j", whitespace, and a label
    written with a ".L" prefix. The returned set holds each such label,
    prefix included (for example ".LBB0_1").
    """
    jump_target = re.compile(r"\s*j[a-z]+\s+\.L([a-zA-Z0-9][a-zA-Z0-9_]*)")
    hits = (jump_target.match(text) for text in asm.splitlines())
    return {".L%s" % hit.group(1) for hit in hits if hit}


def normalize_labels(asm):
    """
    Give every local label a leading dot, so that "L1" becomes ".L1".

    A label declaration is a line that starts with "L<name>:" or
    ".L<name>:". Each declared label that lacks the dot is rewritten
    wherever it appears as a token preceded by whitespace (or the start of
    the text) and followed by ":", whitespace, or the end of the text.

    Returns `asm` unchanged when it declares no labels or when every
    declared label already carries the dot.
    """
    label_decl = re.compile("^[.]{0,1}L([a-zA-Z0-9][a-zA-Z0-9_]*)(?=:)")
    decls = set()
    for line in asm.splitlines():
        m = label_decl.match(line)
        if m:
            decls.add(m.group(0))

    # Decide per label rather than from one arbitrary set element, so the
    # result does not depend on set iteration order. Sorting keeps the
    # substitution order reproducible as well.
    dotless = sorted(d for d in decls if not d.startswith("."))
    if not dotless:
        return asm

    for ld in dotless:
        # "$" in the lookahead also covers a reference that ends the input
        # without a trailing newline.
        pattern = r"(^|\s+)" + re.escape(ld) + r"(?=:|\s|$)"
        asm = re.sub(pattern, r"\g<1>." + ld, asm)
    return asm


def transform_labels(asm):
    """
    Normalize local labels, then drop the declarations no jump refers to.

    The text is first passed through normalize_labels. A line that starts
    with a ".L<name>:" declaration is kept only if that label is in the set
    returned by find_used_labels; every other line is kept as is. Each kept
    line is emitted followed by a newline.
    """
    asm = normalize_labels(asm)
    referenced = find_used_labels(asm)
    decl_re = re.compile(r"^\.L([a-zA-Z0-9][a-zA-Z0-9_]*)(?=:)")

    def is_kept(text):
        found = decl_re.match(text)
        return found is None or found.group(0) in referenced

    return "".join(text + "\n" for text in asm.splitlines() if is_kept(text))


def is_identifier(tk):
    """
    Return True when `tk` has the shape of an identifier.

    That means it is non-empty, its first character is a letter or "_", and
    every following character is alphanumeric or "_".
    """
    if not tk:
        return False
    head, tail = tk[0], tk[1:]
    if not (head.isalpha() or head == "_"):
        return False
    return all(ch.isalnum() or ch == "_" for ch in tail)


def process_identifiers(line):
    """
    process_identifiers - rewrite the identifiers in `line` so that names
    are consistent across platforms; specifically across ELF and MachO.
    MachO inserts an additional underscore at the beginning of names, and
    this function removes it.

    The line is split into runs of [a-zA-Z0-9_] and the text between them.
    A run that is an identifier loses its first character when it starts
    with "__Z", or when it starts with "_" followed by a letter other than
    "Z". Everything else is copied through unchanged.
    """

    def unprefixed(tk):
        # Anything that is not an identifier is passed through untouched.
        if not is_identifier(tk):
            return tk
        if tk.startswith("__Z"):
            return tk[1:]
        if (
            tk.startswith("_")
            and len(tk) > 1
            and tk[1].isalpha()
            and tk[1] != "Z"
        ):
            return tk[1:]
        return tk

    pieces = re.split(r"([a-zA-Z0-9_]+)", line)
    return "".join(unprefixed(piece) for piece in pieces)


def process_asm(asm):
    """
    Strip the ASM of unwanted directives and lines.

    Labels are first run through transform_labels. Each remaining line has
    "@GOTPCREL" removed and is dropped if it matches one of the discard
    patterns (unless a keep pattern matches it; none are defined yet).
    Surviving lines are passed through process_identifiers, and a blank
    line is inserted before every function label except at the very start
    of the output.
    """
    # TODO: Add more things we want to remove
    discard_regexes = [
        re.compile(r"\s+\..*$"),  # directive
        re.compile(r"\s*#(NO_APP|APP)$"),  # inline ASM
        re.compile(r"\s*#.*$"),  # comment line
        re.compile(
            r"\s*\.globa?l\s*([.a-zA-Z_][a-zA-Z0-9$_.]*)"
        ),  # global directive
        re.compile(
            r"\s*\.(string|asciz|ascii|[1248]?byte|short|word|long|quad|value|zero)"
        ),
    ]
    keep_regexes: list[re.Pattern] = []
    fn_label_def = re.compile("^[a-zA-Z_][a-zA-Z0-9_.]*:")

    chunks = []
    for raw in transform_labels(asm).splitlines():
        # Remove Mach-O attribute
        text = raw.replace("@GOTPCREL", "")
        discarded = any(rx.match(text) is not None for rx in discard_regexes)
        rescued = any(rx.match(text) is not None for rx in keep_regexes)
        if discarded and not rescued:
            continue
        # Separate functions with a blank line, but never lead with one.
        if chunks and fn_label_def.match(text):
            chunks.append("\n")
        chunks.append(process_identifiers(text) + "\n")
    return "".join(chunks)


def main():
    """
    Command-line entry point: read an assembly file, strip it, write it out.

    Expects two positional arguments, the input path and the output path.
    Any additional, unrecognized arguments are ignored. If the input path
    is not an existing file, an error message is printed and the process
    exits with status 1.
    """
    parser = ArgumentParser(description="generate a stripped assembly file")
    parser.add_argument(
        "input",
        metavar="input",
        type=str,
        nargs=1,
        help="An input assembly file",
    )
    parser.add_argument(
        "out", metavar="output", type=str, nargs=1, help="The output file"
    )
    # parse_known_args accepts extra arguments; they are not used here.
    args, _ = parser.parse_known_args()
    # nargs=1 yields one-element lists, hence the [0].
    input_path = args.input[0]
    output_path = args.out[0]
    if not os.path.isfile(input_path):
        print("ERROR: input file '%s' does not exist" % input_path)
        sys.exit(1)

    with open(input_path, "r") as f:
        contents = f.read()
    new_contents = process_asm(contents)
    with open(output_path, "w") as f:
        f.write(new_contents)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
# kate: indent-mode python; remove-trailing-spaces modified;