etc/scripts/docenizers/docenizer-arm.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239

#! /usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import json
import os
import re
import sys
import tarfile
import urllib
from urllib import request, parse

try:
    from bs4 import BeautifulSoup
except ImportError:
    raise ImportError("Please install BeautifulSoup (apt-get install python3-bs4 or pip install beautifulsoup4 should do it)")

parser = argparse.ArgumentParser(description='Docenizes XML version of the official ARM documents')
parser.add_argument('-i', '--inputfolder', type=str,
                    help='Folder where the input files reside as .xml. Default is ./asm-docs-arm/',
                    default='asm-docs-arm')
parser.add_argument('-o', '--outputpath', type=str, help='Final path of the .ts file. Default is ./asm-docs-arm32.ts',
                    default='./asm-docs-arm32.ts')
parser.add_argument('-d', '--downloadfolder', type=str,
                    help='Folder where the archive will be downloaded and extracted', default='asm-docs-arm')
parser.add_argument('-c', '--configfile', type=str, help='Json configuration file with contants', default='arm32.json', required=True)

# The maximum number of paragraphs from the description to copy.
MAX_DESC_PARAS = 5
STRIP_SUFFIX = re.compile(r'\s*(\(.*\))?\s*(--.*)?')

#arm32
FLDMX_RE = re.compile(r'^(FLDM)(\*)(X)')
FLDMX_SET = set(['DB', 'IA'])

#aarch64
CONDITION_RE = re.compile(r'^([A-Z][A-Z0-9]*\.?)(cond|<cc>)()')
CONDITION_SET = set(['EQ', 'NE', 'CS', 'CC', 'MI', 'PL', 'VS', 'VC', 'HI', 'LS', 'GE', 'LT', 'GT', 'LE', 'AL'])
FRINT_RE = re.compile(r'^(FRINT)(<r>)()')
FRINT_SET = set(['N', 'A', 'M', 'P', 'A', 'I', 'X'])

EXPAND_RE = [(FLDMX_RE, FLDMX_SET), (CONDITION_RE, CONDITION_SET), (FRINT_RE, FRINT_SET)]

# Some instructions are so broken we just take their names from the filename
UNPARSEABLE_INSTR_NAMES = []
# Some files contain instructions which cannot be parsed and which compilers are unlikely to emit
IGNORED_FILE_NAMES = [ ]
# Some instructions are defined in multiple files. We ignore a specific set of the
# duplicates here.
IGNORED_DUPLICATES = []


class Config:
    class Archive:
        url : str
        name : str
        subdir : str
        def __init__(self, *, url, name, subdir):
            self.url = str(url)
            self.name = str(name)
            self.subdir = str(subdir)

    archive : Archive
    documentation : str
    def __init__(self, *, archive, documentation):
        self.archive = Config.Archive(**archive)
        self.documentation = str(documentation)


class Instruction(object):
    def __init__(self, name, names, tooltip, body):
        self.name = name
        self.names = names
        self.tooltip = tooltip.rstrip(': ,')
        self.body = body

    def __str__(self):
        return "{} = {}\n{}".format(self.names, self.tooltip, self.body)


def get_url_for_instruction(instr):
    return config.documentation


def download_asm_doc_archive(downloadfolder):
    if not os.path.exists(downloadfolder):
        print("Creating {} as download folder".format(downloadfolder))
        os.makedirs(downloadfolder)
    elif not os.path.isdir(downloadfolder):
        print("Error: download folder {} is not a directory".format(downloadfolder))
        sys.exit(1)
    archive_name = os.path.join(downloadfolder, config.archive.name)
    print("Downloading archive...")
    urllib.request.urlretrieve(config.archive.url, archive_name)


def extract_asm_doc_archive(downloadfolder, inputfolder):
    print("Extracting file...")
    if os.path.isdir(os.path.join(inputfolder, config.archive.subdir)):
        for root, dirs, files in os.walk(os.path.join(inputfolder, config.archive.subdir)):
            for file in files:
                if os.path.splitext(file)[1] == ".xml":
                    os.remove(os.path.join(root, file))
    tar = tarfile.open(os.path.join(downloadfolder, config.archive.name))
    tar.extractall(path=inputfolder)

def get_description_paragraphs(document_soup, part):
    if part is None:
        return None
    for image in part.find_all('image'):
        image.decompose()
    for table in part.find_all('table'):
        table.decompose()
    paragraphs = part.find_all('para')[:5]
    description_paragraphs = []
    for paragraph in paragraphs:
        paragraph = paragraph.wrap(document_soup.new_tag('p'))
        paragraph.para.unwrap()
        description_paragraphs.append(paragraph)
    return description_paragraphs

instrclasses = set()

def parse(filename, f):
    doc = BeautifulSoup(f, 'html.parser')
    if doc.instructionsection is None:
        print(filename + ": Failed to find instructionsection")
        return None
    instructionsection = doc.instructionsection
    names = set()

    for name in STRIP_SUFFIX.sub('',instructionsection['title']).split(','):
        name = name.strip()
        names.add(name)
        for RE, SET in EXPAND_RE:
            match = RE.match(name)
            if match:
                for elt in SET:
                    names.add(match.group(1) + elt + match.group(3))

    body = get_description_paragraphs(doc, instructionsection.desc.authored)
    if body is None:
        body = get_description_paragraphs(doc, instructionsection.desc.description)
    if body is None:
        return None

    return Instruction(
        filename,
        names,
        body[0].text.strip(),
        ''.join(map(lambda x: str(x), body)).strip())

def parse_xml(directory):
    print("Parsing instructions...")
    instructions = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".xml") and file != "onebigfile.xml":
                with open(os.path.join(root, file), encoding='utf-8') as f2:
                    name = os.path.splitext(file)[0]
                    if name in IGNORED_DUPLICATES or name in IGNORED_FILE_NAMES:
                        continue
                    instruction = parse(name, f2)
                    if not instruction:
                        continue
                    instructions.append(instruction)
    return instructions


def self_test(instructions, directory):
    # For each generated instruction, check that there is a path to a file in
    # the documentation.
    directory = os.path.join(directory, config.archive.subdir)
    ok = True
    for inst in instructions:
        if not os.path.isfile(os.path.join(directory, inst.name + ".xml")):
            print("Warning: {} has not file associated".format(inst.name))
            ok = False
    return ok


def docenizer():
    global config
    args = parser.parse_args()
    print("Called with: {}".format(args))

    with open(args.configfile) as f:
        config = Config(**json.load(f))
    print("Use configs: {}".format(json.dumps(config, default=lambda o: o.__dict__)))
    # If we don't have the html folder already...
    if not os.path.isdir(os.path.join(args.inputfolder, config.archive.subdir)):
        # We don't, try with the compressed file
        if not os.path.isfile(os.path.join(args.downloadfolder, config.archive.name)):
            # We can't find that either. Download it
            try:
                download_asm_doc_archive(args.downloadfolder)
                extract_asm_doc_archive(args.downloadfolder, args.inputfolder)
            except IOError as e:
                print("Error when downloading archive:")
                print(e)
                sys.exit(1)
        else:
            # We have a file already downloaded
            extract_asm_doc_archive(args.downloadfolder, args.inputfolder)
    instructions = parse_xml(os.path.join(args.inputfolder, config.archive.subdir))
    print(instrclasses)
    instructions.sort(key=lambda b: b.name)
    self_test(instructions, args.inputfolder)
    all_inst = set()
    for inst in instructions:
        if not all_inst.isdisjoint(inst.names):
            print("Overlap in instruction names: {} for {}".format(
                inst.names.intersection(all_inst), inst.name))
        all_inst = all_inst.union(inst.names)
    if not self_test(instructions, args.inputfolder):
        print("Tests do not pass. Not writing output file. Aborting.")
        sys.exit(3)
    print("Writing {} instructions".format(len(instructions)))
    with open(args.outputpath, 'w') as f:
        f.write("""
import {AssemblyInstructionInfo} from '../base.js';

export function getAsmOpcode(opcode: string | undefined): AssemblyInstructionInfo | undefined {
    if (!opcode) return;
    switch (opcode) {
""".lstrip())
        for inst in instructions:
            for name in sorted(inst.names):
                f.write('        case "{}":\n'.format(name))
            f.write('            return {}'.format(json.dumps({
                "tooltip": inst.tooltip,
                "html": inst.body,
                "url": get_url_for_instruction(inst)
                }, indent=16, separators=(',', ': ')))[:-1] + '            };\n\n')
        f.write("""
    }
}
""")
if __name__ == '__main__':
    docenizer()