linker/arch/arm_neon/linker_gnu_hash_neon.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199

/*
 * Copyright (C) 2019 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

// A Neon vectorized implementation of the GNU symbol hash function.

// This function generally accesses beyond the bounds of the name string. Specifically, it reads
// each aligned 8-byte chunk containing a byte of the string, including the final NUL byte. This
// should be acceptable for use with MTE, which uses 16-byte granules. Typically, the function is
// used to hash strings in an ELF file's string table, where MTE is presumably unaware of the
// bounds of each symbol, but the linker also hashes the symbol name passed to dlsym.

#include "linker_gnu_hash_neon.h"

#include <arm_neon.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct __attribute__((aligned(8))) GnuHashInitEntry {
  uint64_t ignore_mask;
  uint32_t accum;
};

constexpr uint32_t kStep0 = 1;
constexpr uint32_t kStep1 = kStep0 * 33;
constexpr uint32_t kStep2 = kStep1 * 33;
constexpr uint32_t kStep3 = kStep2 * 33;
constexpr uint32_t kStep4 = kStep3 * 33;
constexpr uint32_t kStep5 = kStep4 * 33;
constexpr uint32_t kStep6 = kStep5 * 33;
constexpr uint32_t kStep7 = kStep6 * 33;
constexpr uint32_t kStep8 = kStep7 * 33;
constexpr uint32_t kStep9 = kStep8 * 33;
constexpr uint32_t kStep10 = kStep9 * 33;
constexpr uint32_t kStep11 = kStep10 * 33;

// Step by -1 through -7:  33 * 0x3e0f83e1 == 1 (mod 2**32)
constexpr uint32_t kStepN1 = kStep0 * 0x3e0f83e1;
constexpr uint32_t kStepN2 = kStepN1 * 0x3e0f83e1;
constexpr uint32_t kStepN3 = kStepN2 * 0x3e0f83e1;
constexpr uint32_t kStepN4 = kStepN3 * 0x3e0f83e1;
constexpr uint32_t kStepN5 = kStepN4 * 0x3e0f83e1;
constexpr uint32_t kStepN6 = kStepN5 * 0x3e0f83e1;
constexpr uint32_t kStepN7 = kStepN6 * 0x3e0f83e1;

// Calculate the GNU hash and string length of the symbol name.
//
// The hash calculation is an optimized version of this function:
//
//    uint32_t calculate_gnu_hash(const uint8_t* name) {
//      uint32_t h = 5381;
//      for (; *name != '\0'; ++name) {
//        h *= 33;
//        h += *name;
//      }
//      return h;
//    }
//
std::pair<uint32_t, uint32_t> calculate_gnu_hash_neon(const char* name) {

  // The input string may be misaligned by 0-7 bytes (K). This function loads the first aligned
  // 8-byte chunk, then counteracts the misalignment:
  //  - The initial K bytes are set to 0xff in the working chunk vector.
  //  - The accumulator is initialized to 5381 * modinv(33)**K.
  //  - The accumulator also cancels out each initial 0xff byte.
  // If we could set bytes to NUL instead, then the accumulator wouldn't need to cancel out the
  // 0xff values, but this would break the NUL check.

  static const struct GnuHashInitEntry kInitTable[] = {
    { // (addr&7) == 0
      0ull,
      5381u*kStep0,
    }, { // (addr&7) == 1
      0xffull,
      5381u*kStepN1 - 0xffu*kStepN1,
    }, { // (addr&7) == 2
      0xffffull,
      5381u*kStepN2 - 0xffu*kStepN1 - 0xffu*kStepN2,
    }, { // (addr&7) == 3
      0xffffffull,
      5381u*kStepN3 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3,
    }, { // (addr&7) == 4
      0xffffffffull,
      5381u*kStepN4 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3 - 0xffu*kStepN4,
    }, { // (addr&7) == 5
      0xffffffffffull,
      5381u*kStepN5 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3 - 0xffu*kStepN4 - 0xffu*kStepN5,
    }, { // (addr&7) == 6
      0xffffffffffffull,
      5381u*kStepN6 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3 - 0xffu*kStepN4 - 0xffu*kStepN5 - 0xffu*kStepN6,
    }, { // (addr&7) == 7
      0xffffffffffffffull,
      5381u*kStepN7 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3 - 0xffu*kStepN4 - 0xffu*kStepN5 - 0xffu*kStepN6 - 0xffu*kStepN7,
    },
  };

  uint8_t offset = reinterpret_cast<uintptr_t>(name) & 7;
  const uint64_t* chunk_ptr = reinterpret_cast<const uint64_t*>(reinterpret_cast<uintptr_t>(name) & ~7);
  const struct GnuHashInitEntry* entry = &kInitTable[offset];

  uint8x8_t chunk = vld1_u8(reinterpret_cast<const uint8_t*>(chunk_ptr));
  chunk |= vld1_u8(reinterpret_cast<const uint8_t*>(&entry->ignore_mask));

  uint32x4_t accum_lo = { 0 };
  uint32x4_t accum_hi = { entry->accum, 0, 0, 0 };
  const uint16x4_t kInclineVec = { kStep3, kStep2, kStep1, kStep0 };
  const uint32x4_t kStep8Vec = vdupq_n_u32(kStep8);
  uint8x8_t is_nul;
  uint16x8_t expand;

  while (1) {
    // Exit the loop if any of the 8 bytes is NUL.
    is_nul = vceq_u8(chunk, (uint8x8_t){ 0 });
    expand = vmovl_u8(chunk);
    uint64x1_t is_nul_64 = vreinterpret_u64_u8(is_nul);
    if (vget_lane_u64(is_nul_64, 0)) break;

    // Multiply both accumulators by 33**8.
    accum_lo = vmulq_u32(accum_lo, kStep8Vec);
    accum_hi = vmulq_u32(accum_hi, kStep8Vec);

    // Multiply each 4-piece subchunk by (33**3, 33**2, 33*1, 1), then accumulate the result. The lo
    // accumulator will be behind by 33**4 until the very end of the computation.
    accum_lo = vmlal_u16(accum_lo, vget_low_u16(expand), kInclineVec);
    accum_hi = vmlal_u16(accum_hi, vget_high_u16(expand), kInclineVec);

    // Load the next chunk.
    chunk = vld1_u8(reinterpret_cast<const uint8_t*>(++chunk_ptr));
  }

  // Reverse the is-NUL vector so we can use clz to count the number of remaining bytes.
  is_nul = vrev64_u8(is_nul);
  const uint64_t is_nul_u64 = vget_lane_u64(vreinterpret_u64_u8(is_nul), 0);
  const uint32_t num_valid_bits = __builtin_clzll(is_nul_u64);

  const uint32_t name_len = reinterpret_cast<const char*>(chunk_ptr) - name + (num_valid_bits >> 3);

  static const uint32_t kFinalStepTable[] = {
    kStep4, kStep0,   // 0 remaining bytes
    kStep5, kStep1,   // 1 remaining byte
    kStep6, kStep2,   // 2 remaining bytes
    kStep7, kStep3,   // 3 remaining bytes
    kStep8, kStep4,   // 4 remaining bytes
    kStep9, kStep5,   // 5 remaining bytes
    kStep10, kStep6,  // 6 remaining bytes
    kStep11, kStep7,  // 7 remaining bytes
  };

  // Advance the lo/hi accumulators appropriately for the number of remaining bytes. Multiply 33**4
  // into the lo accumulator to catch it up with the hi accumulator.
  const uint32_t* final_step = &kFinalStepTable[num_valid_bits >> 2];
  accum_lo = vmulq_u32(accum_lo, vdupq_n_u32(final_step[0]));
  accum_lo = vmlaq_u32(accum_lo, accum_hi, vdupq_n_u32(final_step[1]));

  static const uint32_t kFinalInclineTable[] = {
    0,      kStep6, kStep5, kStep4, kStep3, kStep2, kStep1, kStep0,
    0,      0,      0,      0,      0,      0,      0,      0,
  };

  // Prepare a vector to multiply powers of 33 into each of the remaining bytes.
  const uint32_t* const incline = &kFinalInclineTable[8 - (num_valid_bits >> 3)];
  const uint32x4_t incline_lo = vld1q_u32(incline);
  const uint32x4_t incline_hi = vld1q_u32(incline + 4);

  // Multiply 33 into each of the remaining 4-piece vectors, then accumulate everything into
  // accum_lo. Combine everything into a single 32-bit result.
  accum_lo = vmlaq_u32(accum_lo, vmovl_u16(vget_low_u16(expand)), incline_lo);
  accum_lo = vmlaq_u32(accum_lo, vmovl_u16(vget_high_u16(expand)), incline_hi);

  uint32x2_t sum = vadd_u32(vget_low_u32(accum_lo), vget_high_u32(accum_lo));
  const uint32_t hash = sum[0] + sum[1];

  return { hash, name_len };
}