Diffstat (limited to 'pw_tokenizer/rust/pw_tokenizer_core.rs')
-rw-r--r-- pw_tokenizer/rust/pw_tokenizer_core.rs | 107
1 file changed, 107 insertions, 0 deletions
diff --git a/pw_tokenizer/rust/pw_tokenizer_core.rs b/pw_tokenizer/rust/pw_tokenizer_core.rs
new file mode 100644
index 000000000..c8e550413
--- /dev/null
+++ b/pw_tokenizer/rust/pw_tokenizer_core.rs
@@ -0,0 +1,107 @@
+// Copyright 2023 The Pigweed Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+//! # pw_tokenizer_core
+//!
+//! This crate provides the core functionality of calculating a token hash
+//! for a string or byte sequence. It is intended to provide a minimal core
+//! for both the main `pw_tokenizer` and `pw_tokenizer_macro` crates. Users
+//! should prefer depending on `pw_tokenizer` instead of this crate.
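+//!
+//! For example, `hash_string` is simply `hash_bytes` applied to the
+//! string's UTF-8 bytes:
+//!
+//! ```
+//! use pw_tokenizer_core::{hash_bytes, hash_string};
+//!
+//! assert_eq!(hash_string("Pigweed"), hash_bytes("Pigweed".as_bytes()));
+//! ```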
+
+use core::num::Wrapping;
+
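+/// Coefficient used by the hashing algorithm (a variant of the x65599 hash).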
+pub const HASH_CONSTANT: Wrapping<u32> = Wrapping(65599u32);
+
+/// Calculate the hash for a sequence of bytes.
+///
+/// ```
+/// use pw_tokenizer_core::hash_bytes;
+///
+/// let hash = hash_bytes(&[0x34, 0xd8, 0x3a, 0xbb, 0xf1, 0x0e, 0x07]);
+/// assert_eq!(hash, 0x9e624642);
+/// ```
+pub fn hash_bytes(bytes: &[u8]) -> u32 {
+    hash_bytes_fixed(bytes, bytes.len())
+}
+
+/// Calculate the hash for a sequence of bytes, examining at most `len` bytes.
+///
+/// ```
+/// use pw_tokenizer_core::hash_bytes_fixed;
+///
+/// let hash = hash_bytes_fixed(&[0x34, 0xd8, 0x3a, 0xbb, 0xf1, 0x0e, 0x07], 4);
+/// assert_eq!(hash, 0x92c5d2ac);
+/// ```
+pub fn hash_bytes_fixed(bytes: &[u8], len: usize) -> u32 {
+    // The length is hashed as if it were the first byte.
+    let mut hash = Wrapping(bytes.len() as u32);
+    let mut coefficient = HASH_CONSTANT;
+
+    // Fixed-size hashes are seeded with the total length of the slice
+    // but hash over at most `len` of its bytes.
+    let bytes = if bytes.len() > len {
+        &bytes[0..len]
+    } else {
+        bytes
+    };
+
+    // Hash all of the bytes in the slice as u32s. Wrapping arithmetic
+    // is used intentionally as part of the hashing algorithm.
+    for b in bytes {
+        hash += coefficient * Wrapping(*b as u32);
+        coefficient *= HASH_CONSTANT;
+    }
+
+    hash.0
+}
+
+/// Calculate the hash for a string.
+///
+/// ```
+/// use pw_tokenizer_core::hash_string;
+///
+/// let hash = hash_string("I 💖 Pigweed");
+/// assert_eq!(hash, 0xe318d1b3);
+/// ```
+pub fn hash_string(s: &str) -> u32 {
+    hash_bytes(s.as_bytes())
+}
+
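+/// Magic value used to identify tokenizer entries in a token database.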
+pub const TOKENIZER_ENTRY_MAGIC: u32 = 0xBAA98DEE;
+
+#[cfg(test)]
+mod tests {
+    use super::hash_bytes_fixed;
+
+    struct TestCase {
+        string: &'static [u8],
+        hash_length: usize,
+        hash: u32,
+    }
+
+    // Generated file defines `test_cases()`.
+    include!("pw_tokenizer_core_test_cases.rs");
+
+    #[test]
+    fn hash() {
+        for test in test_cases() {
+            let hash = hash_bytes_fixed(test.string, test.hash_length);
+            assert_eq!(
+                hash, test.hash,
+                "{:08x} != {:08x} string: {:x?} hash_length: {}",
+                hash, test.hash, test.string, test.hash_length
+            );
+        }
+    }
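+
+    // A minimal extra check sketching the behavior documented in
+    // `hash_bytes_fixed`: the hash is seeded with the full slice length,
+    // so hashing a prefix of a longer slice is not the same as hashing a
+    // truncated copy of that slice.
+    #[test]
+    fn fixed_hash_seeds_with_full_slice_length() {
+        let bytes = [0x34, 0xd8, 0x3a, 0xbb];
+        // The same two bytes are hashed, but the seeds differ (4 vs. 2).
+        assert_ne!(
+            hash_bytes_fixed(&bytes, 2),
+            hash_bytes_fixed(&bytes[0..2], 2)
+        );
+    }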
+}