diff options
Diffstat (limited to 'pw_tokenizer/rust/pw_tokenizer_macro.rs')
-rw-r--r-- | pw_tokenizer/rust/pw_tokenizer_macro.rs | 214 |
1 files changed, 214 insertions, 0 deletions
diff --git a/pw_tokenizer/rust/pw_tokenizer_macro.rs b/pw_tokenizer/rust/pw_tokenizer_macro.rs
new file mode 100644
index 000000000..b271ec433
--- /dev/null
+++ b/pw_tokenizer/rust/pw_tokenizer_macro.rs
@@ -0,0 +1,214 @@
+// Copyright 2023 The Pigweed Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+// This proc macro crate is a private API for the `pw_tokenizer` crate.
+#![doc(hidden)]
+
+use std::ffi::CString;
+
+use proc_macro::TokenStream;
+use proc_macro2::Ident;
+use quote::{format_ident, quote};
+use syn::{
+    parse::{Parse, ParseStream},
+    parse_macro_input, Expr, LitStr, Token,
+};
+
+use pw_format::macros::{generate_printf, FormatAndArgs, PrintfFormatMacroGenerator, Result};
+use pw_tokenizer_core::{hash_string, TOKENIZER_ENTRY_MAGIC};
+
+type TokenStream2 = proc_macro2::TokenStream;
+
+// Handles tokenizing (hashing) `string` and adding it to the token database
+// with the specified `domain`. A detailed description of what's happening is
+// found in the docs for the [`pw_tokenizer::token`] macro.
+//
+// Returns a token stream that evaluates to the `u32` hash and, as a side
+// effect, emits a `static` token-database entry into a dedicated linker
+// section of the caller's binary.
+fn token_backend(domain: &str, string: &str) -> TokenStream2 {
+    let hash = hash_string(string);
+
+    // Line number is omitted as getting that info requires an experimental API:
+    // https://doc.rust-lang.org/proc_macro/struct.Span.html#method.start
+    let ident = format_ident!("_PW_TOKENIZER_STRING_ENTRY_{:08X}", hash);
+
+    // pw_tokenizer is intended for use with ELF files only. Mach-O files (macOS
+    // executables) do not support section names longer than 16 characters, so a
+    // short, unused section name is used on macOS.
+    let section = if cfg!(target_os = "macos") {
+        ",pw,".to_string()
+    } else {
+        format!(".pw_tokenizer.entries.{:08X}", hash)
+    };
+
+    // Both the string and the domain are emitted with their trailing NUL so the
+    // database tooling can find the end of each field.
+    let string = CString::new(string).unwrap();
+    let string_bytes = string.as_bytes_with_nul();
+    let string_bytes_len = string_bytes.len();
+
+    let domain = CString::new(domain).unwrap();
+    let domain_bytes = domain.as_bytes_with_nul();
+    let domain_bytes_len = domain_bytes.len();
+
+    quote! {
+        // Use an inner scope to avoid identifier collision. Name mangling
+        // will disambiguate these in the symbol table.
+        {
+            #[repr(C, packed(1))]
+            struct TokenEntry {
+                magic: u32,
+                token: u32,
+                domain_size: u32,
+                string_length: u32,
+                domain: [u8; #domain_bytes_len],
+                string: [u8; #string_bytes_len],
+            };
+            // This is currently manually verified to be correct.
+            // TODO: b/287132907 - Add integration tests for token database.
+            #[link_section = #section ]
+            static #ident: TokenEntry = TokenEntry {
+                magic: #TOKENIZER_ENTRY_MAGIC,
+                token: #hash,
+                domain_size: #domain_bytes_len as u32,
+                string_length: #string_bytes_len as u32,
+                domain: [ #(#domain_bytes),* ],
+                string: [ #(#string_bytes),* ],
+            };
+
+            // The block evaluates to the token's hash value.
+            #hash
+        }
+    }
+}
+
+// Documented in `pw_tokenizer::token`.
+#[proc_macro]
+pub fn _token(tokens: TokenStream) -> TokenStream {
+    let input = parse_macro_input!(tokens as LitStr);
+    // Domain is hard-coded to "" (the default domain).
+    token_backend("", &input.value()).into()
+}
+
+// Args to tokenize to buffer that are parsed according to the pattern:
+//   ($buffer:expr, $format_string:literal, $($args:expr),*)
+#[derive(Debug)]
+struct TokenizeToBufferArgs {
+    buffer: Expr,
+    format_and_args: FormatAndArgs,
+}
+
+impl Parse for TokenizeToBufferArgs {
+    fn parse(input: ParseStream) -> syn::parse::Result<Self> {
+        let buffer: Expr = input.parse()?;
+        input.parse::<Token![,]>()?;
+        // `FormatAndArgs` consumes the rest of the input (format literal plus
+        // any argument expressions).
+        let format_and_args: FormatAndArgs = input.parse()?;
+
+        Ok(TokenizeToBufferArgs {
+            buffer,
+            format_and_args,
+        })
+    }
+}
+
+// Accumulates per-argument encoding statements while `pw_format` walks the
+// format string, then stitches them into the final expansion in `finalize`.
+struct TokenizeToBufferGenerator<'a> {
+    domain: &'a str,
+    buffer: &'a Expr,
+    encoding_fragments: Vec<TokenStream2>,
+}
+
+impl<'a> TokenizeToBufferGenerator<'a> {
+    fn new(domain: &'a str, buffer: &'a Expr) -> Self {
+        Self {
+            domain,
+            buffer,
+            encoding_fragments: Vec::new(),
+        }
+    }
+}
+
+impl<'a> PrintfFormatMacroGenerator for TokenizeToBufferGenerator<'a> {
+    fn finalize(self, format_string: String) -> Result<TokenStream2> {
+        // Locally scoped aliases so we can refer to them in `quote!()`
+        let buffer = self.buffer;
+        let encoding_fragments = self.encoding_fragments;
+
+        // `token_backend` returns a `TokenStream2` which both inserts the
+        // string into the token database and returns the hash value.
+        let token = token_backend(self.domain, &format_string);
+
+        Ok(quote! {
+            {
+                // Wrapping code in an internal function to allow `?` to work in
+                // functions that don't return Results.
+                fn _pw_tokenizer_internal_encode(
+                    buffer: &mut [u8],
+                    token: u32
+                ) -> __pw_tokenizer_crate::Result<usize> {
+                    // Use pw_tokenizer's private re-export of these pw_stream bits to
+                    // allow referencing without needing `pw_stream` in scope.
+                    use __pw_tokenizer_crate::{Cursor, Seek, WriteInteger, WriteVarint};
+                    let mut cursor = Cursor::new(buffer);
+                    // The token is always written first, as a little-endian u32.
+                    cursor.write_u32_le(&token)?;
+                    #(#encoding_fragments);*;
+                    // Bytes written == final cursor position.
+                    Ok(cursor.stream_position()? as usize)
+                }
+                _pw_tokenizer_internal_encode(#buffer, #token)
+            }
+        })
+    }
+
+    fn string_fragment(&mut self, _string: &str) -> Result<()> {
+        // String fragments are encoded directly into the format string.
+        Ok(())
+    }
+
+    fn integer_conversion(&mut self, ty: Ident, expression: Expr) -> Result<Option<String>> {
+        self.encoding_fragments.push(quote! {
+            // pw_tokenizer always uses signed packing for all integers.
+            cursor.write_signed_varint(#ty::from(#expression) as i64)?;
+        });
+
+        Ok(None)
+    }
+
+    fn string_conversion(&mut self, expression: Expr) -> Result<Option<String>> {
+        self.encoding_fragments.push(quote! {
+            __pw_tokenizer_crate::internal::encode_string(&mut cursor, #expression)?;
+        });
+        Ok(None)
+    }
+
+    fn char_conversion(&mut self, expression: Expr) -> Result<Option<String>> {
+        self.encoding_fragments.push(quote! {
+            cursor.write_u8_le(&u8::from(#expression))?;
+        });
+        Ok(None)
+    }
+}
+
+// Generates code to marshal a tokenized string and arguments into a buffer.
+// See [`pw_tokenizer::tokenize_to_buffer`] for details on behavior.
+//
+// Internally the [`AsMut<u8>`] is wrapped in a [`pw_stream::Cursor`] to
+// fill the buffer incrementally.
+#[proc_macro]
+pub fn _tokenize_to_buffer(tokens: TokenStream) -> TokenStream {
+    let input = parse_macro_input!(tokens as TokenizeToBufferArgs);
+
+    // Hard codes domain to "".
+    let generator = TokenizeToBufferGenerator::new("", &input.buffer);
+
+    // On parse/generation errors, expand to a `compile_error!` so the user
+    // gets a proper diagnostic instead of a panic.
+    match generate_printf(generator, input.format_and_args) {
+        Ok(token_stream) => token_stream.into(),
+        Err(e) => e.to_compile_error().into(),
+    }
+}
+
+// Macros tested in `pw_tokenizer` crate.
+#[cfg(test)]
+mod tests {}