diff options
Diffstat (limited to 'codegen/vulkan/scripts/check_html_xrefs.py')
-rwxr-xr-x | codegen/vulkan/scripts/check_html_xrefs.py | 93 |
1 files changed, 93 insertions, 0 deletions
diff --git a/codegen/vulkan/scripts/check_html_xrefs.py b/codegen/vulkan/scripts/check_html_xrefs.py new file mode 100755 index 00000000..0081e6c0 --- /dev/null +++ b/codegen/vulkan/scripts/check_html_xrefs.py @@ -0,0 +1,93 @@ +#!/usr/bin/python3 +# +# Copyright 2020-2021 The Khronos Group Inc. +# +# SPDX-License-Identifier: Apache-2.0 + +# check_html_xrefs - simple-minded check for internal xrefs in spec HTML +# that don't exist. + +# Usage: check_html_xrefs file +# Just reports bad xrefs, not where they occur + +import argparse +import re +from lxml import etree + +SECTNAME = re.compile(r'sect(?P<level>\d+)') + +def find_parent_ids(elem, href): + """Find section titles in parents, which are the 'id' elements of '<hN' + children of '<div class="sectM"' tags, and N = M + 1. This may be + specific to the Vulkan spec, though - hierarchy could be different in + other asciidoctor documents. Returns a list of [ anchor, title ]. + + elem - this node + href - href link text of elem""" + + # Find parent <div> with class="sect#" + parent = elem.getparent() + while parent is not None: + if parent.tag == 'div': + cssclass = parent.get('class') + matches = SECTNAME.match(cssclass) + if matches is not None: + level = int(matches.group('level')) + # Look for corresponding header tag in this div + helem = parent.find('./h{}'.format(level+1)) + if helem is not None: + return [ helem.get('id'), ''.join(helem.itertext()) ] + parent = parent.getparent() + return [ '** NO PARENT NODE IDENTIFIED **', '' ] + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument('files', metavar='filename', nargs='*', + help='Path to registry XML') + args = parser.parse_args() + + for filename in args.files: + parser = etree.HTMLParser() + tree = etree.parse(filename, parser) + + # Find all 'id' elements + id_elems = tree.findall('.//*[@id]') + ids = set() + for elem in id_elems: + id = elem.get('id') + if id in ids: + True + # print('Duplicate ID attribute:', id) + else: + ids.add(id) + + # Find all internal 'href' attributes and see if they're valid + # Keep an [element, href] list for tracking parents + # Also keep a count of each href + ref_elems = tree.findall('.//a[@href]') + refs = [] + count = {} + for elem in ref_elems: + href = elem.get('href') + # If not a local href, skip it + if href[0] == '#': + # If there's a corresponding id, skip it + href = href[1:] + if href not in ids: + if href in count: + refs.append((elem, href)) + True + count[href] = count[href] + 1 + else: + refs.append((elem, href)) + count[href] = 1 + else: + True + # print('Skipping external href:', ref) + + # Check for hrefs not found in ids + print('Bad links in {}:'.format(filename)) + for (elem, href) in refs: + parents = find_parent_ids(elem, href) + print('{:<40} in {:<28} ({})'.format(href, parents[0], parents[1])) |