1 files changed, 93 insertions, 0 deletions
diff --git a/codegen/vulkan/scripts/check_html_xrefs.py b/codegen/vulkan/scripts/check_html_xrefs.py
new file mode 100755
index 00000000..0081e6c0
--- /dev/null
+++ b/codegen/vulkan/scripts/check_html_xrefs.py
@@ -0,0 +1,93 @@
+#!/usr/bin/python3
+#
+# Copyright 2020-2021 The Khronos Group Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# check_html_xrefs - simple-minded check for internal xrefs in spec HTML
+# that don't exist.
+
+# Usage: check_html_xrefs file [file ...]
+# Reports each bad xref and the section containing it, not its exact location
+
+import argparse
+import re
+from lxml import etree
+
+SECTNAME = re.compile(r'sect(?P<level>\d+)')
+
+def find_parent_ids(elem, href):
+    """Find section titles in parents, which are the 'id' elements of '<hN'
+       children of '<div class="sectM"' tags, and N = M + 1. This may be
+       specific to the Vulkan spec, though - hierarchy could be different in
+       other asciidoctor documents. Returns a list of [ anchor, title ].
+
+       elem - this node
+       href - href link text of elem"""
+
+    # Find parent <div> with class="sect#"
+    parent = elem.getparent()
+    while parent is not None:
+        if parent.tag == 'div':
+            cssclass = parent.get('class')
+            matches = SECTNAME.match(cssclass)
+            if matches is not None:
+                level = int(matches.group('level'))
+                # Look for corresponding header tag in this div
+                helem = parent.find('./h{}'.format(level+1))
+                if helem is not None:
+                    return [ helem.get('id'), ''.join(helem.itertext()) ]
+        parent = parent.getparent()
+    return [ '** NO PARENT NODE IDENTIFIED **', '' ]
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('files', metavar='filename', nargs='*',
+                        help='Path to registry XML')
+    args = parser.parse_args()
+
+    for filename in args.files:
+        parser = etree.HTMLParser()
+        tree = etree.parse(filename, parser)
+
+        # Find all 'id' elements
+        id_elems = tree.findall('.//*[@id]')
+        ids = set()
+        for elem in id_elems:
+            id = elem.get('id')
+            if id in ids:
+                True
+                # print('Duplicate ID attribute:', id)
+            else:
+                ids.add(id)
+
+        # Find all internal 'href' attributes and see if they're valid
+        # Keep an [element, href] list for tracking parents
+        # Also keep a count of each href
+        ref_elems = tree.findall('.//a[@href]')
+        refs = []
+        count = {}
+        for elem in ref_elems:
+            href = elem.get('href')
+            # If not a local href, skip it
+            if href[0] == '#':
+                # If there's a corresponding id, skip it
+                href = href[1:]
+                if href not in ids:
+                    if href in count:
+                        refs.append((elem, href))
+                        True
+                        count[href] = count[href] + 1
+                    else:
+                        refs.append((elem, href))
+                        count[href] = 1
+            else:
+                True
+                # print('Skipping external href:', ref)
+
+        # Check for hrefs not found in ids
+        print('Bad links in {}:'.format(filename))
+        for (elem, href) in refs:
+            parents = find_parent_ids(elem, href)
+            print('{:<40} in {:<28} ({})'.format(href, parents[0], parents[1]))