summaryrefslogtreecommitdiff
path: root/src/__support/hash.h
blob: d1218fdc25927f676af8d6bc15359ba674db6539 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
//===-- Portable string hash function ---------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC___SUPPORT_HASH_H
#define LLVM_LIBC_SRC___SUPPORT_HASH_H

#include "src/__support/CPP/bit.h"           // rotl
#include "src/__support/CPP/limits.h"        // numeric_limits
#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/__support/uint128.h"           // UInt128
#include <stdint.h>                          // For uint64_t

namespace LIBC_NAMESPACE {
namespace internal {

// Folded multiplication.
// This function multiplies two 64-bit integers and xor the high and
// low 64-bit parts of the result.
LIBC_INLINE uint64_t folded_multiply(uint64_t x, uint64_t y) {
  UInt128 p = static_cast<UInt128>(x) * static_cast<UInt128>(y);
  uint64_t low = static_cast<uint64_t>(p);
  uint64_t high = static_cast<uint64_t>(p >> 64);
  return low ^ high;
}

// Read as little endian.
// Shift-and-or implementation does not give a satisfactory code on aarch64.
// Therefore, we use a union to read the value.
template <typename T> LIBC_INLINE T read_little_endian(const void *ptr) {
  const uint8_t *bytes = static_cast<const uint8_t *>(ptr);
  union {
    T value;
    uint8_t buffer[sizeof(T)];
  } data;
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
  // Compiler should able to optimize this as a load followed by a byte swap.
  // On aarch64 (-mbig-endian), this compiles to the following for int:
  //      ldr     w0, [x0]
  //      rev     w0, w0
  //      ret
  for (size_t i = 0; i < sizeof(T); ++i) {
    data.buffer[i] = bytes[sizeof(T) - i - 1];
  }
#else
  for (size_t i = 0; i < sizeof(T); ++i) {
    data.buffer[i] = bytes[i];
  }
#endif
  return data.value;
}

// Specialized read functions for small values. size must be <= 8.
LIBC_INLINE void read_small_values(const void *ptr, size_t size, uint64_t &low,
                                   uint64_t &high) {
  const uint8_t *bytes = static_cast<const uint8_t *>(ptr);
  if (size >= 2) {
    if (size >= 4) {
      low = static_cast<uint64_t>(read_little_endian<uint32_t>(&bytes[0]));
      high =
          static_cast<uint64_t>(read_little_endian<uint32_t>(&bytes[size - 4]));
    } else {
      low = static_cast<uint64_t>(read_little_endian<uint16_t>(&bytes[0]));
      high = static_cast<uint64_t>(bytes[size - 1]);
    }
  } else {
    if (size > 0) {
      low = static_cast<uint64_t>(bytes[0]);
      high = static_cast<uint64_t>(bytes[0]);
    } else {
      low = 0;
      high = 0;
    }
  }
}

// This constant comes from Kunth's prng (it empirically works well).
LIBC_INLINE_VAR constexpr uint64_t MULTIPLE = 6364136223846793005;
// Rotation amount for mixing.
LIBC_INLINE_VAR constexpr uint64_t ROTATE = 23;

// Randomly generated values. For now, we use the same values as in aHash as
// they are widely tested.
// https://github.com/tkaitchuck/aHash/blob/9f6a2ad8b721fd28da8dc1d0b7996677b374357c/src/random_state.rs#L38
LIBC_INLINE_VAR constexpr uint64_t RANDOMNESS[2][4] = {
    {0x243f6a8885a308d3, 0x13198a2e03707344, 0xa4093822299f31d0,
     0x082efa98ec4e6c89},
    {0x452821e638d01377, 0xbe5466cf34e90c6c, 0xc0ac29b7c97c50dd,
     0x3f84d5b5b5470917},
};

// This is a portable string hasher. It is not cryptographically secure.
// The quality of the hash is good enough to pass all tests in SMHasher.
// The implementation is derived from the generic routine of aHash.
class HashState {
  uint64_t buffer;
  uint64_t pad;
  uint64_t extra_keys[2];
  LIBC_INLINE void update(uint64_t low, uint64_t high) {
    uint64_t combined =
        folded_multiply(low ^ extra_keys[0], high ^ extra_keys[1]);
    buffer = (buffer + pad) ^ combined;
    buffer = cpp::rotl(buffer, ROTATE);
  }
  LIBC_INLINE static uint64_t mix(uint64_t seed) {
    HashState mixer{RANDOMNESS[0][0], RANDOMNESS[0][1], RANDOMNESS[0][2],
                    RANDOMNESS[0][3]};
    mixer.update(seed, 0);
    return mixer.finish();
  }

public:
  LIBC_INLINE constexpr HashState(uint64_t a, uint64_t b, uint64_t c,
                                  uint64_t d)
      : buffer(a), pad(b), extra_keys{c, d} {}
  LIBC_INLINE HashState(uint64_t seed) {
    // Mix one more round of the seed to make it stronger.
    uint64_t mixed = mix(seed);
    buffer = RANDOMNESS[1][0] ^ mixed;
    pad = RANDOMNESS[1][1] ^ mixed;
    extra_keys[0] = RANDOMNESS[1][2] ^ mixed;
    extra_keys[1] = RANDOMNESS[1][3] ^ mixed;
  }
  LIBC_INLINE void update(const void *ptr, size_t size) {
    uint8_t const *bytes = static_cast<const uint8_t *>(ptr);
    buffer = (buffer + size) * MULTIPLE;
    uint64_t low, high;
    if (size > 8) {
      if (size > 16) {
        // update tail
        low = read_little_endian<uint64_t>(&bytes[size - 16]);
        high = read_little_endian<uint64_t>(&bytes[size - 8]);
        update(low, high);
        while (size > 16) {
          low = read_little_endian<uint64_t>(&bytes[0]);
          high = read_little_endian<uint64_t>(&bytes[8]);
          update(low, high);
          bytes += 16;
          size -= 16;
        }
      } else {
        low = read_little_endian<uint64_t>(&bytes[0]);
        high = read_little_endian<uint64_t>(&bytes[size - 8]);
        update(low, high);
      }
    } else {
      read_small_values(ptr, size, low, high);
      update(low, high);
    }
  }
  LIBC_INLINE uint64_t finish() {
    int rot = buffer & 63;
    uint64_t folded = folded_multiply(buffer, pad);
    return cpp::rotl(folded, rot);
  }
};

} // namespace internal
} // namespace LIBC_NAMESPACE

#endif // LLVM_LIBC_SRC___SUPPORT_HASH_H