aboutsummaryrefslogtreecommitdiff
path: root/scripts/make-byte-frequency-table
blob: 37eeca7b7d56710c2a1769752e2401fd770db706 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python

# This does simple normalized frequency analysis on UTF-8 encoded text. The
# result of the analysis is translated to a ranked list, where every byte is
# assigned a rank. This list is written to src/freqs.rs.
#
# Currently, the frequencies are generated from the following corpuses:
#
#   * The CIA world fact book
#   * The source code of rustc
#   * Septuaginta

from __future__ import absolute_import, division, print_function

import argparse
from collections import Counter
import sys

preamble = '''
// NOTE: The following code was generated by "scripts/frequencies.py", do not
// edit directly
'''.lstrip()


def eprint(*args, **kwargs):
    kwargs['file'] = sys.stderr
    print(*args, **kwargs)


def main():
    p = argparse.ArgumentParser()
    p.add_argument('corpus', metavar='FILE', nargs='+')
    args = p.parse_args()

    # Get frequency counts of each byte.
    freqs = Counter()
    for i in range(0, 256):
        freqs[i] = 0

    eprint('reading entire corpus into memory')
    corpus = []
    for fpath in args.corpus:
        corpus.append(open(fpath, 'rb').read())

    eprint('computing byte frequencies')
    for c in corpus:
        for byte in c:
            freqs[byte] += 1.0 / float(len(c))

    eprint('writing Rust code')
    # Get the rank of each byte. A lower rank => lower relative frequency.
    rank = [0] * 256
    for i, (byte, _) in enumerate(freqs.most_common()):
        # print(byte)
        rank[byte] = 255 - i

    # Forcefully set the highest rank possible for bytes that start multi-byte
    # UTF-8 sequences. The idea here is that a continuation byte will be more
    # discerning in a homogenous haystack.
    for byte in range(0xC0, 0xFF + 1):
        rank[byte] = 255

    # Now write Rust.
    olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
    for byte in range(256):
        olines.append('    %3d, // %r' % (rank[byte], chr(byte)))
    olines.append('];')

    print(preamble)
    print('\n'.join(olines))


if __name__ == '__main__':
    main()