1 files changed, 277 insertions, 0 deletions
diff --git a/pw_tokenizer/rust/pw_tokenizer/lib.rs b/pw_tokenizer/rust/pw_tokenizer/lib.rs
new file mode 100644
index 000000000..7909f124b
--- /dev/null
+++ b/pw_tokenizer/rust/pw_tokenizer/lib.rs
@@ -0,0 +1,277 @@
+// Copyright 2023 The Pigweed Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+//! `pw_tokenizer` - Efficient string handling and printf style encoding.
+//!
+//! Logging is critical, but developers are often forced to choose between
+//! additional logging or saving crucial flash space. The `pw_tokenizer` crate
+//! helps address this by replacing printf-style strings with binary tokens
+//! during compilation. This enables extensive logging with substantially less
+//! memory usage.
+//!
+//! For a more in-depth explanation of the system's design and motivations,
+//! see [Pigweed's pw_tokenizer module documentation](https://pigweed.dev/pw_tokenizer/).
+//!
+//! # Example
+//!
+//! ```
+//! use pw_tokenizer::tokenize_to_buffer;
+//!
+//! # fn doctest() -> pw_status::Result<()> {
+//! let mut buffer = [0u8; 1024];
+//! let len = tokenize_to_buffer!(&mut buffer, "The answer is %d", 42)?;
+//!
+//! // 4 bytes used to encode the token and one to encode the value 42.  This
+//! // is a **3.5x** reduction in size compared to the raw string!
+//! assert_eq!(len, 5);
+//! # Ok(())
+//! # }
+//! # doctest().unwrap();
+//! ```
+
+#![no_std]
+#![deny(missing_docs)]
+
+#[doc(hidden)]
+pub mod internal;
+
+// Macro expansions reach every item they need through this one module, via
+// `use $crate::__private as __pw_tokenizer_crate;`, so proc macro generated
+// code still resolves when a user imports `pw_tokenizer` under another name.
+#[doc(hidden)]
+pub mod __private {
+    pub use crate::*;
+    pub use pw_status::Result;
+    pub use pw_stream::Cursor;
+    pub use pw_stream::Seek;
+    pub use pw_stream::WriteInteger;
+    pub use pw_stream::WriteVarint;
+    pub use pw_tokenizer_macro::_token;
+    pub use pw_tokenizer_macro::_tokenize_to_buffer;
+}
+
+/// Return the [`u32`] token for the specified string and add it to the token
+/// database.
+///
+/// This is where the magic happens in `pw_tokenizer`!   ... and by magic
+/// we mean hiding information in a special linker section that ends up in the
+/// final ELF binary but does not get flashed to the device.
+///
+/// Two things are accomplished here:
+/// 1) The string is hashed into its stable `u32` token.  This is the value that
+///    is returned from the macro.
+/// 2) A [token database entry](https://pigweed.dev/pw_tokenizer/design.html#binary-database-format)
+///    is generated, assigned to a unique static symbol, and placed in a linker
+///    section named `pw_tokenizer.entries.<TOKEN_HASH>`.  A
+///    [linker script](https://pigweed.googlesource.com/pigweed/pigweed/+/refs/heads/main/pw_tokenizer/pw_tokenizer_linker_sections.ld)
+///    aggregates these into the final binary's `.pw_tokenizer.entries` section.
+///
+/// # Example
+/// ```
+/// use pw_tokenizer::token;
+///
+/// let token = token!("hello, \"world\"");
+/// assert_eq!(token, 3537412730);
+/// ```
+///
+/// Currently there is no support for encoding tokens to specific domains
+/// or with "fixed lengths" per [`pw_tokenizer_core::hash_bytes_fixed`].
+#[macro_export]
+macro_rules! token {
+    // Accept an optional trailing comma after the string literal.
+    ($string:literal $(,)?) => {{
+        $crate::__private::_token!($string)
+    }};
+}
+
+/// Tokenize a format string and arguments to an [`AsMut<u8>`] buffer and add
+/// the format string's token to the token database.
+///
+/// See [`token`] for an explanation on how strings are tokenized and entries
+/// are added to the token database.
+///
+/// Returns a [`pw_status::Result<usize>`] holding the number of bytes written.
+///
+/// # Errors
+/// - [`pw_status::Error::OutOfRange`] - Buffer too small for tokenized data.
+/// - [`pw_status::Error::InvalidArgument`] - Invalid buffer was provided.
+///
+/// # Example
+///
+/// ```
+/// use pw_tokenizer::tokenize_to_buffer;
+///
+/// # fn doctest() -> pw_status::Result<()> {
+/// let mut buffer = [0u8; 1024];
+/// let len = tokenize_to_buffer!(&mut buffer, "The answer is %d", 42)?;
+///
+/// // 4 bytes used to encode the token and one to encode the value 42.
+/// assert_eq!(len, 5);
+/// # Ok(())
+/// # }
+/// # doctest().unwrap();
+/// ```
+#[macro_export]
+macro_rules! tokenize_to_buffer {
+    // Format string only; an optional trailing comma is accepted.
+    ($buffer:expr, $format_string:literal $(,)?) => {{
+        use $crate::__private as __pw_tokenizer_crate;
+        __pw_tokenizer_crate::_tokenize_to_buffer!($buffer, $format_string)
+    }};
+    // Format string plus arguments; a trailing comma is accepted here too.
+    ($buffer:expr, $format_string:literal, $($args:expr),* $(,)?) => {{
+        use $crate::__private as __pw_tokenizer_crate;
+        __pw_tokenizer_crate::_tokenize_to_buffer!($buffer, $format_string, $($args),*)
+    }};
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    // Make this crate nameable as `pw_tokenizer` from within its own tests.
+    extern crate self as pw_tokenizer;
+
+    // This is not meant to be an exhaustive test of tokenization which is
+    // covered by `pw_tokenizer_core`'s unit tests.  Rather, this is testing
+    // that the `token!` macro connects to that correctly.
+    #[test]
+    fn test_token() {
+        // Expected value is taken from the `token!` documentation example.
+        assert_eq!(token!("hello, \"world\""), 3537412730);
+    }
+
+    // Tokenizes `$fmt` (plus any `$args`) into a zeroed buffer of `$buffer_len`
+    // bytes and asserts that the bytes written equal `$expected_data`.
+    macro_rules! tokenize_to_buffer_test {
+        // Format string without arguments.
+        ($expected_data:expr, $buffer_len:expr, $fmt:expr) => {{
+            let mut buffer = [0u8; $buffer_len];
+            let len = tokenize_to_buffer!(&mut buffer, $fmt).unwrap();
+            assert_eq!(&buffer[..len], $expected_data);
+        }};
+
+        // Format string followed by its arguments.
+        ($expected_data:expr, $buffer_len:expr, $fmt:expr, $($args:expr),*) => {{
+            let mut buffer = [0u8; $buffer_len];
+            let len = tokenize_to_buffer!(&mut buffer, $fmt, $($args),*).unwrap();
+            assert_eq!(&buffer[..len], $expected_data);
+        }};
+    }
+
+    #[test]
+    fn test_decimal_format() {
+        tokenize_to_buffer_test!(
+            &[0x52, 0x1c, 0xb0, 0x4c, 0x2], // expected buffer
+            64,                             // buffer size
+            "The answer is %d!",
+            1
+        );
+
+        tokenize_to_buffer_test!(
+            &[0x36, 0xd0, 0xfb, 0x69, 0x1], // expected buffer
+            64,                             // buffer size
+            "No! The answer is %d!",
+            -1
+        );
+
+        tokenize_to_buffer_test!(
+            &[0xa4, 0xad, 0x50, 0x54, 0x0], // expected buffer
+            64,                             // buffer size
+            "I think you'll find that the answer is %d!",
+            0
+        );
+    }
+
+    #[test]
+    fn test_misc_integer_format() {
+        // %d, %i, %o, %u, %x, %X all encode integers the same.
+        tokenize_to_buffer_test!(
+            &[0x52, 0x1c, 0xb0, 0x4c, 0x2], // expected buffer
+            64,                             // buffer size
+            "The answer is %d!",
+            1
+        );
+
+        // Because %i is an alias for %d, it gets converted to a %d by the
+        // `pw_format` macro infrastructure.
+        tokenize_to_buffer_test!(
+            &[0x52, 0x1c, 0xb0, 0x4c, 0x2], // expected buffer
+            64,                             // buffer size
+            "The answer is %i!",
+            1
+        );
+
+        tokenize_to_buffer_test!(
+            &[0x5d, 0x70, 0x12, 0xb4, 0x2], // expected buffer
+            64,                             // buffer size
+            "The answer is %o!",
+            1u32
+        );
+
+        tokenize_to_buffer_test!(
+            &[0x63, 0x58, 0x5f, 0x8f, 0x2], // expected buffer
+            64,                             // buffer size
+            "The answer is %u!",
+            1u32
+        );
+
+        tokenize_to_buffer_test!(
+            &[0x66, 0xcc, 0x05, 0x7d, 0x2], // expected buffer
+            64,                             // buffer size
+            "The answer is %x!",
+            1u32
+        );
+
+        tokenize_to_buffer_test!(
+            &[0x46, 0x4c, 0x16, 0x96, 0x2], // expected buffer
+            64,                             // buffer size
+            "The answer is %X!",
+            1u32
+        );
+    }
+
+    #[test]
+    fn test_string_format() {
+        // The token is followed by a 0x07 byte and the 7 bytes of "Pigweed".
+        tokenize_to_buffer_test!(
+            b"\x25\xf6\x2e\x66\x07Pigweed", // expected buffer
+            64,                             // buffer size
+            "Hello: %s!",
+            "Pigweed"
+        );
+    }
+
+    #[test]
+    fn test_string_format_overflow() {
+        // With only 8 bytes available the string is cut short to "Pig" and
+        // the byte before it reads 0x83 rather than 0x07.
+        tokenize_to_buffer_test!(
+            b"\x25\xf6\x2e\x66\x83Pig", // expected buffer
+            8,                          // buffer size
+            "Hello: %s!",
+            "Pigweed"
+        );
+    }
+
+    #[test]
+    fn test_char_format() {
+        // `%c` is given the character as a single byte.
+        tokenize_to_buffer_test!(
+            &[0x2e, 0x52, 0xac, 0xe4, 0x50], // expected buffer
+            64,                              // buffer size
+            "Hello: %cigweed",
+            "P".as_bytes()[0]
+        );
+    }
+}