scripts/map_html_anchors.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210

#!/usr/bin/python3
#
# Copyright 2020-2023 The Khronos Group Inc.
# SPDX-License-Identifier: Apache-2.0

# map_html_anchors - map each id= element in a spec HTML file onto the
# top-level (chapter) id= element it belongs to. Used to rewrite spec
# xrefs for Antora. Writes a Python and/or JavaScript file containing a
# dictionary mapping each discovered ID to the top-level ID it belongs
# to, and the title text associated with that ID (if any).
#
# This script is very specific to HTML generated by asciidoctor and
# following conventions of the Vulkan style guide.

# Usage: map_html_anchors.py [-pyfile xrefMap.py] [-jsfile xrefMap.js] file.html

import argparse
import re
import sys
from lxml import etree

def contains_any_of(words, wordlist):
    """Returns True if any element of 'words' is contained in 'wordlist'

       - words - iterable of words to check against wordlist
       - wordlist - container of words, tested with the 'in' operator

       Returns False if 'words' is empty."""

    return any(word in wordlist for word in words)

# Matches an optional 'Table ' prefix followed by one or more dot-terminated
# numbers at the start of a title (e.g. '12.3. ' or 'Table 4. '), together
# with any spaces that follow. Used to strip numbering from titles.
sectNumberPat = re.compile(r'^(Table |)([0-9]+\.)+ *')

def _div_title(idelem, anchor_id):
    """Return the title text for a <div id=> element, or '' if none is found.

       - idelem - the <div> Element
       - anchor_id - value of the element's id attribute"""

    classes = idelem.get('class')
    if classes is None:
        # <div id=> without a class may have a corresponding <h# id=> with the
        # same id - in this case, the div will be thrown away when the
        # following element is encountered.
        return ''

    divclass = classes.split()

    if contains_any_of(('admonitionblock', 'paragraph', 'sidebarblock'), divclass):
        # <div> classes with no title elements (paragraphs or NOTEs)
        return ''

    if 'listingblock' in divclass:
        # <div id= class="listingblock"> has title == id (used for API includes)
        return anchor_id

    if contains_any_of(('dlist', 'openblock'), divclass):
        # <div> classes with titles in the text of the first
        # <dt class="hdlist1"> element of the div
        #
        # "dlist" are mostly glossary elements
        # "openblock" are mostly SPIR-V keywords
        dtelem = idelem.find('.//dt[@class="hdlist1"]')
        if dtelem is None:
            # No dtelem text found, this probably means a label on an
            # API open block
            return ''
        return ''.join(dtelem.itertext())

    if contains_any_of(('ulist', 'imageblock'), divclass):
        # <div> classes with titles in the first
        # <div class="title"> element of the div
        titleelem = idelem.find('.//div[@class="title"]')
        if titleelem is None:
            # No <div class="title"> text found
            return ''
        return ''.join(titleelem.itertext())

    print(f'Cannot find title for <div id="{anchor_id}" class="{classes}"> - unrecognized class', file=sys.stderr)
    return ''

def add_id(chapelem, idelem, id_map, chapter_id):
    """Add a ID -> [ chapter ID, title ] mapping to id_map.

       - chapelem - Element for the chapter containing this ID (not used
         by this function)
       - idelem - Element for the ID itself
       - id_map - dictionary containing the map; updated in place
       - chapter_id - chapter ID of chapelem

       If the ID is already present in id_map, a message is written to
       stderr and the existing entry is replaced.
       Tags with no known title convention (including <a>) get an empty
       title."""

    # The actual ID
    anchor_id = idelem.get('id')
    tag = idelem.tag

    # Default for <a id=> (which has no corresponding title element) and
    # for any tag not handled below, so id_title is always defined.
    id_title = ''

    if tag in ('h2', 'h3', 'h4', 'h5', 'h6'):
        # <h# id=> has ((#.)* *title) in the text of its element
        id_title = ''.join(idelem.itertext())
    elif tag == 'table':
        # <table id=> may be followed by <caption class="title">
        # with 'Table ##. caption' text
        capelem = idelem.find('.//caption[@class="title"]')
        if capelem is not None:
            id_title = ''.join(capelem.itertext())
        else:
            id_title = 'NO TABLE CAPTION FOUND'
    elif tag == 'div':
        id_title = _div_title(idelem, anchor_id)

    if anchor_id in id_map:
        val = id_map[anchor_id]
        print(f'Replacing key {anchor_id} -> ({val[0]}, {val[1]}) with ({chapter_id}, {id_title})', file=sys.stderr)

    # Strip whitespace and leading table or section numbers, if present
    id_title = sectNumberPat.sub('', id_title.strip())

    # Map the xref to the chapter it came from and its title
    id_map[anchor_id] = [ chapter_id, id_title ]

def _quote(text):
    """Return text as a single-quoted string literal which is valid in
       both Python and JavaScript source."""

    # Backslashes must be escaped first so that the escapes added for
    # quotes and line breaks are not themselves doubled.
    escaped = (str(text)
               .replace('\\', '\\\\')
               .replace("'", "\\'")
               .replace('\n', '\\n')
               .replace('\r', '\\r'))
    return f"'{escaped}'"

def generate_map(id_map, filename, scripttype):
    """Encode the ID map into the specified scripttype ('python' or
       'javascript') in the specified file.

       - id_map - dictionary mapping ID -> [ chapter ID, title ]
       - filename - path of the file to write (overwritten if it exists)
       - scripttype - 'javascript' writes 'exports.xrefMap = {...}';
         any other value writes 'xrefMap = {...}'

       IDs and titles are escaped so that quotes, backslashes, or line
       breaks in them cannot produce an invalid script."""

    # Python and JS are extremely similar when the output is just a
    # dictionary of lists of strings.
    if scripttype == 'javascript':
        header = 'exports.xrefMap = {'
    else:
        header = 'xrefMap = {'

    # Write UTF-8 explicitly, rather than depending on the locale, since
    # titles may contain non-ASCII characters.
    with open(filename, 'w', encoding='utf-8') as fp:
        print(header, file=fp)

        # Sort keys so they can be compared between runs
        for key in sorted(id_map):
            entry = id_map[key]
            print(f"    {_quote(key)} : [ {_quote(entry[0])}, {_quote(entry[1])} ],", file=fp)

        print('}', file=fp)
# Script entry point: parse the command line, read the HTML spec file,
# build the ID map chapter by chapter, and write the requested output
# file(s). If neither -pyfile nor -jsfile is given, no file is written and
# only the diagnostics on stderr are produced.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('-jsfile', action='store',
                        default=None,
                        help='Specify name of JavaScript file to generate')
    parser.add_argument('-pyfile', action='store',
                        default=None,
                        help='Specify name of Python file to generate')
    parser.add_argument('files', metavar='filename', nargs=1,
                        help='HTML spec file to map IDs from')
    args = parser.parse_args()

    # Tags whose id elements are anchors (we are not concerned about other
    # tags such as <svg>).
    idtags = ('a', 'div', 'h2', 'h3', 'h4', 'h5', 'h6', 'table')

    # Tags whose id elements we know we do not care about. These are
    # skipped silently; any other tag with an id is reported on stderr.
    rejected_tags = (
        'svg',
        'circle',
        'clippath',
        'defs',
        'ellipse',
        'g',
        'grid',
        'lineargradient',
        'marker',
        'metadata',
        'namedview',
        'path',
        'path-effect',
        'rect',
        'stop',
        'text',
        'tspan',
    )

    html_parser = etree.HTMLParser()

    # There is exactly one HTML filename
    filename = args.files[0]
    tree = etree.parse(filename, html_parser)

    # Dictionary mapping an ID (anchor) to [chapter ID, ID title],
    # where 'chapter ID' is the ID of the chapter it appears in
    id_map = {}

    # Find each <div class="sect1"> element, which corresponds to a
    # chapter.
    for chapelem in tree.findall('.//div[@class="sect1"]'):
        # The chapter ID is the id of the single <h2 id=> in the chapter
        h2_elems = chapelem.findall('.//h2[@id]')
        if len(h2_elems) != 1:
            raise UserWarning(f'Error! <div> must have exactly 1 <h2> element, has {len(h2_elems)}')
        chapter_id = h2_elems[0].get('id')

        for idelem in chapelem.findall('.//*[@id]'):
            if idelem.tag in idtags:
                add_id(chapelem, idelem, id_map, chapter_id)
            elif idelem.tag not in rejected_tags:
                print(f'    Rejecting unknown tag with ID <{idelem.tag} id="{idelem.get("id")}"', file=sys.stderr)

    if args.pyfile is not None:
        generate_map(id_map, args.pyfile, 'python')
    if args.jsfile is not None:
        generate_map(id_map, args.jsfile, 'javascript')