Import utf-8 crate am: b083704517 am: 2ac88101f1 am: 5ba1a60209

Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/utf-8/+/2636438 Change-Id: Ic5350753fdadd0ff62554d2a82f8f179357639d7 Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
author: Hyun Jae Moon <hyunjaemoon@google.com> 2023-06-23 21:41:31 +0000
committer: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> 2023-06-23 21:41:31 +0000
commit: 6070272c574057efa5c0cea72f589a3431ae6d3e (patch)
tree: 445b30e380aba6746f6bcf0294d417c020eae443
parent: 1162a6d0a1ef23da9eb950383ad80715145a9616 (diff)
parent: 5ba1a60209f37a85832c19100efe5f4226cf50ea (diff)
download: utf-8-6070272c574057efa5c0cea72f589a3431ae6d3e.tar.gz
15 files changed, 985 insertions, 0 deletions
diff --git a/Android.bp b/Android.bp
new file mode 100644
index 0000000..b5771ee
--- /dev/null
+++ b/Android.bp
@@ -0,0 +1,38 @@
+// This file is generated by cargo2android.py --run --device --tests.
+// Do not modify this file as changes will be overridden on upgrade.
+
+
+
+rust_library {
+    name: "libutf8",
+    host_supported: true,
+    crate_name: "utf8",
+    cargo_env_compat: true,
+    cargo_pkg_version: "0.7.6",
+    srcs: ["src/lib.rs"],
+    edition: "2015",
+    apex_available: [
+        "//apex_available:platform",
+        "//apex_available:anyapex",
+    ],
+    product_available: true,
+    vendor_available: true,
+}
+
+rust_test {
+    name: "utf-8_test_tests_unit",
+    host_supported: true,
+    crate_name: "unit",
+    cargo_env_compat: true,
+    cargo_pkg_version: "0.7.6",
+    srcs: ["tests/unit.rs"],
+    test_suites: ["general-tests"],
+    auto_gen_config: true,
+    test_options: {
+        unit_test: true,
+    },
+    edition: "2015",
+    rustlibs: [
+        "libutf8",
+    ],
+}
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..c01a69d
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,29 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies
+#
+# If you believe there's an error in this file please file an
+# issue against the rust-lang/cargo repository. If you're
+# editing this file be aware that the upstream Cargo.toml
+# will likely look very different (and much more reasonable)
+
+[package]
+name = "utf-8"
+version = "0.7.6"
+authors = ["Simon Sapin <simon.sapin@exyr.org>"]
+description = "Incremental, zero-copy UTF-8 decoding with error handling"
+license = "MIT OR Apache-2.0"
+repository = "https://github.com/SimonSapin/rust-utf8"
+[profile.bench]
+
+[profile.test]
+
+[lib]
+name = "utf8"
+test = false
+bench = false
+
+[dependencies]
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
new file mode 100644
index 0000000..c9a377d
--- /dev/null
+++ b/Cargo.toml.orig
@@ -0,0 +1,20 @@
+[package]
+name = "utf-8"
+version = "0.7.6"
+authors = ["Simon Sapin <simon.sapin@exyr.org>"]
+description = "Incremental, zero-copy UTF-8 decoding with error handling"
+license = "MIT OR Apache-2.0"
+repository = "https://github.com/SimonSapin/rust-utf8"
+
+[lib]
+name = "utf8"
+test = false
+bench = false
+
+[dependencies]
+
+[profile.test]
+#opt-level = 3
+
+[profile.bench]
+#debug = true
diff --git a/LICENSE b/LICENSE
new file mode 120000
index 0000000..6b579aa
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1 @@
+LICENSE-APACHE
+\ No newline at end of file
diff --git a/LICENSE-APACHE b/LICENSE-APACHE
new file mode 100644
index 0000000..1b5ec8b
--- /dev/null
+++ b/LICENSE-APACHE
@@ -0,0 +1,176 @@
+                              Apache License
+                        Version 2.0, January 2004
+                     http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+   "License" shall mean the terms and conditions for use, reproduction,
+   and distribution as defined by Sections 1 through 9 of this document.
+
+   "Licensor" shall mean the copyright owner or entity authorized by
+   the copyright owner that is granting the License.
+
+   "Legal Entity" shall mean the union of the acting entity and all
+   other entities that control, are controlled by, or are under common
+   control with that entity. For the purposes of this definition,
+   "control" means (i) the power, direct or indirect, to cause the
+   direction or management of such entity, whether by contract or
+   otherwise, or (ii) ownership of fifty percent (50%) or more of the
+   outstanding shares, or (iii) beneficial ownership of such entity.
+
+   "You" (or "Your") shall mean an individual or Legal Entity
+   exercising permissions granted by this License.
+
+   "Source" form shall mean the preferred form for making modifications,
+   including but not limited to software source code, documentation
+   source, and configuration files.
+
+   "Object" form shall mean any form resulting from mechanical
+   transformation or translation of a Source form, including but
+   not limited to compiled object code, generated documentation,
+   and conversions to other media types.
+
+   "Work" shall mean the work of authorship, whether in Source or
+   Object form, made available under the License, as indicated by a
+   copyright notice that is included in or attached to the work
+   (an example is provided in the Appendix below).
+
+   "Derivative Works" shall mean any work, whether in Source or Object
+   form, that is based on (or derived from) the Work and for which the
+   editorial revisions, annotations, elaborations, or other modifications
+   represent, as a whole, an original work of authorship. For the purposes
+   of this License, Derivative Works shall not include works that remain
+   separable from, or merely link (or bind by name) to the interfaces of,
+   the Work and Derivative Works thereof.
+
+   "Contribution" shall mean any work of authorship, including
+   the original version of the Work and any modifications or additions
+   to that Work or Derivative Works thereof, that is intentionally
+   submitted to Licensor for inclusion in the Work by the copyright owner
+   or by an individual or Legal Entity authorized to submit on behalf of
+   the copyright owner. For the purposes of this definition, "submitted"
+   means any form of electronic, verbal, or written communication sent
+   to the Licensor or its representatives, including but not limited to
+   communication on electronic mailing lists, source code control systems,
+   and issue tracking systems that are managed by, or on behalf of, the
+   Licensor for the purpose of discussing and improving the Work, but
+   excluding communication that is conspicuously marked or otherwise
+   designated in writing by the copyright owner as "Not a Contribution."
+
+   "Contributor" shall mean Licensor and any individual or Legal Entity
+   on behalf of whom a Contribution has been received by Licensor and
+   subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   copyright license to reproduce, prepare Derivative Works of,
+   publicly display, publicly perform, sublicense, and distribute the
+   Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   (except as stated in this section) patent license to make, have made,
+   use, offer to sell, sell, import, and otherwise transfer the Work,
+   where such license applies only to those patent claims licensable
+   by such Contributor that are necessarily infringed by their
+   Contribution(s) alone or by combination of their Contribution(s)
+   with the Work to which such Contribution(s) was submitted. If You
+   institute patent litigation against any entity (including a
+   cross-claim or counterclaim in a lawsuit) alleging that the Work
+   or a Contribution incorporated within the Work constitutes direct
+   or contributory patent infringement, then any patent licenses
+   granted to You under this License for that Work shall terminate
+   as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+   Work or Derivative Works thereof in any medium, with or without
+   modifications, and in Source or Object form, provided that You
+   meet the following conditions:
+
+   (a) You must give any other recipients of the Work or
+       Derivative Works a copy of this License; and
+
+   (b) You must cause any modified files to carry prominent notices
+       stating that You changed the files; and
+
+   (c) You must retain, in the Source form of any Derivative Works
+       that You distribute, all copyright, patent, trademark, and
+       attribution notices from the Source form of the Work,
+       excluding those notices that do not pertain to any part of
+       the Derivative Works; and
+
+   (d) If the Work includes a "NOTICE" text file as part of its
+       distribution, then any Derivative Works that You distribute must
+       include a readable copy of the attribution notices contained
+       within such NOTICE file, excluding those notices that do not
+       pertain to any part of the Derivative Works, in at least one
+       of the following places: within a NOTICE text file distributed
+       as part of the Derivative Works; within the Source form or
+       documentation, if provided along with the Derivative Works; or,
+       within a display generated by the Derivative Works, if and
+       wherever such third-party notices normally appear. The contents
+       of the NOTICE file are for informational purposes only and
+       do not modify the License. You may add Your own attribution
+       notices within Derivative Works that You distribute, alongside
+       or as an addendum to the NOTICE text from the Work, provided
+       that such additional attribution notices cannot be construed
+       as modifying the License.
+
+   You may add Your own copyright statement to Your modifications and
+   may provide additional or different license terms and conditions
+   for use, reproduction, or distribution of Your modifications, or
+   for any such Derivative Works as a whole, provided Your use,
+   reproduction, and distribution of the Work otherwise complies with
+   the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+   any Contribution intentionally submitted for inclusion in the Work
+   by You to the Licensor shall be under the terms and conditions of
+   this License, without any additional terms or conditions.
+   Notwithstanding the above, nothing herein shall supersede or modify
+   the terms of any separate license agreement you may have executed
+   with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+   names, trademarks, service marks, or product names of the Licensor,
+   except as required for reasonable and customary use in describing the
+   origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+   agreed to in writing, Licensor provides the Work (and each
+   Contributor provides its Contributions) on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+   implied, including, without limitation, any warranties or conditions
+   of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+   PARTICULAR PURPOSE. You are solely responsible for determining the
+   appropriateness of using or redistributing the Work and assume any
+   risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+   whether in tort (including negligence), contract, or otherwise,
+   unless required by applicable law (such as deliberate and grossly
+   negligent acts) or agreed to in writing, shall any Contributor be
+   liable to You for damages, including any direct, indirect, special,
+   incidental, or consequential damages of any character arising as a
+   result of this License or out of the use or inability to use the
+   Work (including but not limited to damages for loss of goodwill,
+   work stoppage, computer failure or malfunction, or any and all
+   other commercial damages or losses), even if such Contributor
+   has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+   the Work or Derivative Works thereof, You may choose to offer,
+   and charge a fee for, acceptance of support, warranty, indemnity,
+   or other liability obligations and/or rights consistent with this
+   License. However, in accepting such obligations, You may act only
+   on Your own behalf and on Your sole responsibility, not on behalf
+   of any other Contributor, and only if You agree to indemnify,
+   defend, and hold each Contributor harmless for any liability
+   incurred by, or claims asserted against, such Contributor by reason
+   of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
diff --git a/LICENSE-MIT b/LICENSE-MIT
new file mode 100644
index 0000000..31aa793
--- /dev/null
+++ b/LICENSE-MIT
@@ -0,0 +1,23 @@
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/METADATA b/METADATA
new file mode 100644
index 0000000..178a89e
--- /dev/null
+++ b/METADATA
@@ -0,0 +1,20 @@
+name: "utf-8"
+description: "Incremental, zero-copy UTF-8 decoding with error handling"
+third_party {
+  url {
+    type: HOMEPAGE
+    value: "https://crates.io/crates/utf-8"
+  }
+  url {
+    type: ARCHIVE
+    value: "https://static.crates.io/crates/utf-8/utf-8-0.7.6.crate"
+  }
+  version: "0.7.6"
+  # Dual-licensed, using the least restrictive per go/thirdpartylicenses#same.
+  license_type: NOTICE
+  last_upgrade_date {
+    year: 2023
+    month: 6
+    day: 2
+  }
+}
diff --git a/MODULE_LICENSE_APACHE2 b/MODULE_LICENSE_APACHE2
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/MODULE_LICENSE_APACHE2
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 0000000..45dc4dd
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1 @@
+include platform/prebuilts/rust:master:/OWNERS
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..145889b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+# rust-utf8
+
+Incremental, zero-copy UTF-8 decoding for Rust
+
+[Documentation](https://docs.rs/utf-8/)
diff --git a/benches/from_utf8_lossy.rs b/benches/from_utf8_lossy.rs
new file mode 100644
index 0000000..95d9edf
--- /dev/null
+++ b/benches/from_utf8_lossy.rs
@@ -0,0 +1,30 @@
+#![feature(test)]
+
+extern crate test;
+extern crate utf8;
+
+#[path = "../tests/shared/data.rs"]
+mod data;
+
+#[path = "../tests/shared/string_from_utf8_lossy.rs"]
+mod string_from_utf8_lossy;
+
+#[bench]
+fn bench_our_string_from_utf8_lossy(bencher: &mut test::Bencher) {
+    bencher.bytes = data::DECODED_LOSSY.iter().map(|&(input, _expected)| input.len() as u64).sum();
+    bencher.iter(|| {
+        for &(input, _expected) in data::DECODED_LOSSY {
+            test::black_box(string_from_utf8_lossy::string_from_utf8_lossy(input));
+        }
+    })
+}
+
+#[bench]
+fn bench_std_string_from_utf8_lossy(bencher: &mut test::Bencher) {
+    bencher.bytes = data::DECODED_LOSSY.iter().map(|&(input, _expected)| input.len() as u64).sum();
+    bencher.iter(|| {
+        for &(input, _expected) in data::DECODED_LOSSY {
+            test::black_box(String::from_utf8_lossy(input));
+        }
+    })
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..ec223f2
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,186 @@
+mod lossy;
+mod read;
+
+pub use lossy::LossyDecoder;
+pub use read::{BufReadDecoder, BufReadDecoderError};
+
+use std::cmp;
+use std::error::Error;
+use std::fmt;
+use std::str;
+
+/// The replacement character, U+FFFD. In lossy decoding, insert it for every decoding error.
+pub const REPLACEMENT_CHARACTER: &'static str = "\u{FFFD}";
+
+#[derive(Debug, Copy, Clone)]
+pub enum DecodeError<'a> {
+    /// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`,
+    /// then call `decode()` again with `remaining_input`.
+    Invalid {
+        valid_prefix: &'a str,
+        invalid_sequence: &'a [u8],
+        remaining_input: &'a [u8],
+    },
+
+    /// Call the `incomplete_suffix.try_complete` method with more input when available.
+    /// If no more input is available, this is an invalid byte sequence.
+    Incomplete {
+        valid_prefix: &'a str,
+        incomplete_suffix: Incomplete,
+    },
+}
+
+impl<'a> fmt::Display for DecodeError<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match *self {
+            DecodeError::Invalid {
+                valid_prefix,
+                invalid_sequence,
+                remaining_input,
+            } => write!(
+                f,
+                "found invalid byte sequence {invalid_sequence:02x?} after \
+                 {valid_byte_count} valid bytes, followed by {unprocessed_byte_count} more \
+                 unprocessed bytes",
+                invalid_sequence = invalid_sequence,
+                valid_byte_count = valid_prefix.len(),
+                unprocessed_byte_count = remaining_input.len()
+            ),
+            DecodeError::Incomplete {
+                valid_prefix,
+                incomplete_suffix,
+            } => write!(
+                f,
+                "found incomplete byte sequence {incomplete_suffix:02x?} after \
+                 {valid_byte_count} bytes",
+                incomplete_suffix = incomplete_suffix,
+                valid_byte_count = valid_prefix.len()
+            ),
+        }
+    }
+}
+
+impl<'a> Error for DecodeError<'a> {}
+
+#[derive(Debug, Copy, Clone)]
+pub struct Incomplete {
+    pub buffer: [u8; 4],
+    pub buffer_len: u8,
+}
+
+pub fn decode(input: &[u8]) -> Result<&str, DecodeError> {
+    let error = match str::from_utf8(input) {
+        Ok(valid) => return Ok(valid),
+        Err(error) => error,
+    };
+
+    // FIXME: separate function from here to guide inlining?
+    let (valid, after_valid) = input.split_at(error.valid_up_to());
+    let valid = unsafe {
+        str::from_utf8_unchecked(valid)
+    };
+
+    match error.error_len() {
+        Some(invalid_sequence_length) => {
+            let (invalid, rest) = after_valid.split_at(invalid_sequence_length);
+            Err(DecodeError::Invalid {
+                valid_prefix: valid,
+                invalid_sequence: invalid,
+                remaining_input: rest
+            })
+        }
+        None => {
+            Err(DecodeError::Incomplete {
+                valid_prefix: valid,
+                incomplete_suffix: Incomplete::new(after_valid),
+            })
+        }
+    }
+}
+
+impl Incomplete {
+    pub fn empty() -> Self {
+        Incomplete {
+            buffer: [0, 0, 0, 0],
+            buffer_len: 0,
+        }
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.buffer_len == 0
+    }
+
+    pub fn new(bytes: &[u8]) -> Self {
+        let mut buffer = [0, 0, 0, 0];
+        let len = bytes.len();
+        buffer[..len].copy_from_slice(bytes);
+        Incomplete {
+            buffer: buffer,
+            buffer_len: len as u8,
+        }
+    }
+
+    /// * `None`: still incomplete, call `try_complete` again with more input.
+    ///   If no more input is available, this is an invalid byte sequence.
+    /// * `Some((result, remaining_input))`: We’re done with this `Incomplete`.
+    ///   To keep decoding, pass `remaining_input` to `decode()`.
+    pub fn try_complete<'input>(&mut self, input: &'input [u8])
+                                -> Option<(Result<&str, &[u8]>, &'input [u8])> {
+        let (consumed, opt_result) = self.try_complete_offsets(input);
+        let result = opt_result?;
+        let remaining_input = &input[consumed..];
+        let result_bytes = self.take_buffer();
+        let result = match result {
+            Ok(()) => Ok(unsafe { str::from_utf8_unchecked(result_bytes) }),
+            Err(()) => Err(result_bytes),
+        };
+        Some((result, remaining_input))
+    }
+
+    fn take_buffer(&mut self) -> &[u8] {
+        let len = self.buffer_len as usize;
+        self.buffer_len = 0;
+        &self.buffer[..len as usize]
+    }
+
+    /// (consumed_from_input, None): not enough input
+    /// (consumed_from_input, Some(Err(()))): error bytes in buffer
+    /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer
+    fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) {
+        let initial_buffer_len = self.buffer_len as usize;
+        let copied_from_input;
+        {
+            let unwritten = &mut self.buffer[initial_buffer_len..];
+            copied_from_input = cmp::min(unwritten.len(), input.len());
+            unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]);
+        }
+        let spliced = &self.buffer[..initial_buffer_len + copied_from_input];
+        match str::from_utf8(spliced) {
+            Ok(_) => {
+                self.buffer_len = spliced.len() as u8;
+                (copied_from_input, Some(Ok(())))
+            }
+            Err(error) => {
+                let valid_up_to = error.valid_up_to();
+                if valid_up_to > 0 {
+                    let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap();
+                    self.buffer_len = valid_up_to as u8;
+                    (consumed, Some(Ok(())))
+                } else {
+                    match error.error_len() {
+                        Some(invalid_sequence_length) => {
+                            let consumed = invalid_sequence_length
+                                .checked_sub(initial_buffer_len).unwrap();
+                            self.buffer_len = invalid_sequence_length as u8;
+                            (consumed, Some(Err(())))
+                        }
+                        None => {
+                            self.buffer_len = spliced.len() as u8;
+                            (copied_from_input, None)
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/src/lossy.rs b/src/lossy.rs
new file mode 100644
index 0000000..00bcdec
--- /dev/null
+++ b/src/lossy.rs
@@ -0,0 +1,92 @@
+use super::*;
+
+/// A push-based, lossy decoder for UTF-8.
+/// Errors are replaced with the U+FFFD replacement character.
+///
+/// Users “push” bytes into the decoder, which in turn “pushes” `&str` slices into a callback.
+///
+/// For example, `String::from_utf8_lossy` (but returning `String` instead of `Cow`)
+/// can be rewritten as:
+///
+/// ```rust
+/// fn string_from_utf8_lossy(input: &[u8]) -> String {
+///     let mut string = String::new();
+///     utf8::LossyDecoder::new(|s| string.push_str(s)).feed(input);
+///     string
+/// }
+/// ```
+///
+/// **Note:** Dropping the decoder signals the end of the input:
+/// If the last input chunk ended with an incomplete byte sequence for a code point,
+/// this is an error and a replacement character is emitted.
+/// Use `std::mem::forget` to inhibit this behavior.
+pub struct LossyDecoder<F: FnMut(&str)> {
+    push_str: F,
+    incomplete: Incomplete,
+}
+
+impl<F: FnMut(&str)> LossyDecoder<F> {
+    /// Create a new decoder from a callback.
+    #[inline]
+    pub fn new(push_str: F) -> Self {
+        LossyDecoder {
+            push_str: push_str,
+            incomplete: Incomplete {
+                buffer: [0, 0, 0, 0],
+                buffer_len: 0,
+            },
+        }
+    }
+
+    /// Feed one chunk of input into the decoder.
+    ///
+    /// The input is decoded lossily and the callback is called with `&str` string slices
+    /// (not at all if this chunk only extends a code point that is still incomplete).
+    ///
+    /// If the UTF-8 byte sequence for one code point was split between this chunk of bytes
+    /// and previous chunks, it will be correctly pieced back together.
+    pub fn feed(&mut self, mut input: &[u8]) {
+        if self.incomplete.buffer_len > 0 {
+            match self.incomplete.try_complete(input) {
+                Some((Ok(s), remaining)) => {
+                    (self.push_str)(s);
+                    input = remaining
+                }
+                Some((Err(_), remaining)) => {
+                    (self.push_str)(REPLACEMENT_CHARACTER);
+                    input = remaining
+                }
+                None => {
+                    return
+                }
+            }
+        }
+        loop {
+            match decode(input) {
+                Ok(s) => {
+                    (self.push_str)(s);
+                    return
+                }
+                Err(DecodeError::Incomplete { valid_prefix, incomplete_suffix }) => {
+                    (self.push_str)(valid_prefix);
+                    self.incomplete = incomplete_suffix;
+                    return
+                }
+                Err(DecodeError::Invalid { valid_prefix, remaining_input, .. }) => {
+                    (self.push_str)(valid_prefix);
+                    (self.push_str)(REPLACEMENT_CHARACTER);
+                    input = remaining_input
+                }
+            }
+        }
+    }
+}
+
+impl<F: FnMut(&str)> Drop for LossyDecoder<F> {
+    #[inline]
+    fn drop(&mut self) {
+        if self.incomplete.buffer_len > 0 {
+            (self.push_str)(REPLACEMENT_CHARACTER)
+        }
+    }
+}
diff --git a/src/read.rs b/src/read.rs
new file mode 100644
index 0000000..5e38f54
--- /dev/null
+++ b/src/read.rs
@@ -0,0 +1,167 @@
+use std::io::{self, BufRead};
+use std::error::Error;
+use std::fmt;
+use std::str;
+use super::*;
+
+/// Wraps a `std::io::BufRead` buffered byte stream and decodes it as UTF-8.
+pub struct BufReadDecoder<B: BufRead> {
+    buf_read: B,
+    bytes_consumed: usize,
+    incomplete: Incomplete,
+}
+
+#[derive(Debug)]
+pub enum BufReadDecoderError<'a> {
+    /// Represents one UTF-8 error in the byte stream.
+    ///
+    /// In lossy decoding, each such error should be replaced with U+FFFD.
+    /// (See `BufReadDecoder::next_lossy` and `BufReadDecoderError::lossy`.)
+    InvalidByteSequence(&'a [u8]),
+
+    /// An I/O error from the underlying byte stream
+    Io(io::Error),
+}
+
+impl<'a> BufReadDecoderError<'a> {
+    /// Replace UTF-8 errors with U+FFFD
+    pub fn lossy(self) -> Result<&'static str, io::Error> {
+        match self {
+            BufReadDecoderError::Io(error) => Err(error),
+            BufReadDecoderError::InvalidByteSequence(_) => Ok(REPLACEMENT_CHARACTER),
+        }
+    }
+}
+
+impl<'a> fmt::Display for BufReadDecoderError<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match *self {
+            BufReadDecoderError::InvalidByteSequence(bytes) => {
+                write!(f, "invalid byte sequence: {:02x?}", bytes)
+            }
+            BufReadDecoderError::Io(ref err) => write!(f, "underlying bytestream error: {}", err),
+        }
+    }
+}
+
+impl<'a> Error for BufReadDecoderError<'a> {
+    fn source(&self) -> Option<&(dyn Error + 'static)> {
+        match *self {
+            BufReadDecoderError::InvalidByteSequence(_) => None,
+            BufReadDecoderError::Io(ref err) => Some(err),
+        }
+    }
+}
+
+impl<B: BufRead> BufReadDecoder<B> {
+    /// This is to `Read::read_to_string` what `String::from_utf8_lossy` is to `String::from_utf8`.
+    pub fn read_to_string_lossy(buf_read: B) -> io::Result<String> {
+        let mut decoder = Self::new(buf_read);
+        let mut string = String::new();
+        while let Some(result) = decoder.next_lossy() {
+            string.push_str(result?)
+        }
+        Ok(string)
+    }
+
+    pub fn new(buf_read: B) -> Self {
+        Self {
+            buf_read,
+            bytes_consumed: 0,
+            incomplete: Incomplete::empty(),
+        }
+    }
+
+    /// Same as `BufReadDecoder::next_strict`, but replace UTF-8 errors with U+FFFD.
+    pub fn next_lossy(&mut self) -> Option<io::Result<&str>> {
+        self.next_strict().map(|result| result.or_else(|e| e.lossy()))
+    }
+
+    /// Decode and consume the next chunk of UTF-8 input.
+    ///
+    /// This method is intended to be called repeatedly until it returns `None`,
+    /// which represents EOF from the underlying byte stream.
+    /// This is similar to `Iterator::next`,
+    /// except that decoded chunks borrow the decoder (~iterator)
+    /// so they need to be handled or copied before the next chunk can start decoding.
+    pub fn next_strict(&mut self) -> Option<Result<&str, BufReadDecoderError>> {
+        enum BytesSource {
+            BufRead(usize),
+            Incomplete,
+        }
+        macro_rules! try_io {
+            ($io_result: expr) => {
+                match $io_result {
+                    Ok(value) => value,
+                    Err(error) => return Some(Err(BufReadDecoderError::Io(error)))
+                }
+            }
+        }
+        let (source, result) = loop {
+            if self.bytes_consumed > 0 {
+                self.buf_read.consume(self.bytes_consumed);
+                self.bytes_consumed = 0;
+            }
+            let buf = try_io!(self.buf_read.fill_buf());
+
+            // Force loop iteration to go through an explicit `continue`
+            enum Unreachable {}
+            let _: Unreachable = if self.incomplete.is_empty() {
+                if buf.is_empty() {
+                    return None  // EOF
+                }
+                match str::from_utf8(buf) {
+                    Ok(_) => {
+                        break (BytesSource::BufRead(buf.len()), Ok(()))
+                    }
+                    Err(error) => {
+                        let valid_up_to = error.valid_up_to();
+                        if valid_up_to > 0 {
+                            break (BytesSource::BufRead(valid_up_to), Ok(()))
+                        }
+                        match error.error_len() {
+                            Some(invalid_sequence_length) => {
+                                break (BytesSource::BufRead(invalid_sequence_length), Err(()))
+                            }
+                            None => {
+                                self.bytes_consumed = buf.len();
+                                self.incomplete = Incomplete::new(buf);
+                                // need more input bytes
+                                continue
+                            }
+                        }
+                    }
+                }
+            } else {
+                if buf.is_empty() {
+                    break (BytesSource::Incomplete, Err(()))  // EOF with incomplete code point
+                }
+                let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf);
+                self.bytes_consumed = consumed;
+                match opt_result {
+                    None => {
+                        // need more input bytes
+                        continue
+                    }
+                    Some(result) => {
+                        break (BytesSource::Incomplete, result)
+                    }
+                }
+            };
+        };
+        let bytes = match source {
+            BytesSource::BufRead(byte_count) => {
+                self.bytes_consumed = byte_count;
+                let buf = try_io!(self.buf_read.fill_buf());
+                &buf[..byte_count]
+            }
+            BytesSource::Incomplete => {
+                self.incomplete.take_buffer()
+            }
+        };
+        match result {
+            Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })),
+            Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))),
+        }
+    }
+}
diff --git a/tests/unit.rs b/tests/unit.rs
new file mode 100644
index 0000000..6839e84
--- /dev/null
+++ b/tests/unit.rs
@@ -0,0 +1,197 @@
+extern crate utf8;
+
+use std::borrow::Cow;
+use std::collections::VecDeque;
+use std::io;
+use utf8::*;
+
+/// Validates `input` as UTF-8, in the manner of `std::str::from_utf8`.
+///
+/// Returns the whole input as a `&str` when `decode` accepts it. Otherwise
+/// the error value is the byte length of the valid prefix that the decoder
+/// reported, whether the failure was an invalid or an incomplete sequence.
+pub fn str_from_utf8(input: &[u8]) -> Result<&str, usize> {
+    decode(input).map_err(|error| match error {
+        DecodeError::Invalid { valid_prefix, .. } => valid_prefix.len(),
+        DecodeError::Incomplete { valid_prefix, .. } => valid_prefix.len(),
+    })
+}
+
+/// Checks `str_from_utf8` on two valid strings and one invalid input.
+#[test]
+fn test_str_from_utf8() {
+    // Well-formed text (ASCII and multi-byte) decodes to itself.
+    for text in &["hello", "ศไทย中华Việt Nam"] {
+        assert_eq!(str_from_utf8(text.as_bytes()), Ok(*text));
+    }
+
+    // A 0xFF byte never occurs in UTF-8, so this input must be rejected.
+    assert!(str_from_utf8(b"hello\xFF").is_err());
+}
+
+/// Exercises `str_from_utf8` on a table of byte sequences that must be
+/// rejected and a table of boundary code points that must be accepted.
+#[test]
+fn test_is_utf8() {
+    // Chars of 1, 2, 3, and 4 bytes
+    assert!(str_from_utf8("eé€\u{10000}".as_bytes()).is_ok());
+
+    let rejected: &[&[u8]] = &[
+        // invalid prefix
+        &[0x80],
+        // invalid 2 byte prefix
+        &[0xc0],
+        &[0xc0, 0x10],
+        // invalid 3 byte prefix
+        &[0xe0],
+        &[0xe0, 0x10],
+        &[0xe0, 0xff, 0x10],
+        // invalid 4 byte prefix
+        &[0xf0],
+        &[0xf0, 0x10],
+        &[0xf0, 0xff, 0x10],
+        &[0xf0, 0xff, 0xff, 0x10],
+        // overlong encodings (the last entry encodes a value above U+10FFFF)
+        &[0xc0, 0x80],
+        &[0xc0, 0xae],
+        &[0xe0, 0x80, 0x80],
+        &[0xe0, 0x80, 0xaf],
+        &[0xe0, 0x81, 0x81],
+        &[0xf0, 0x82, 0x82, 0xac],
+        &[0xf4, 0x90, 0x80, 0x80],
+        // surrogates
+        &[0xED, 0xA0, 0x80],
+        &[0xED, 0xBF, 0xBF],
+    ];
+    for bytes in rejected {
+        assert!(str_from_utf8(bytes).is_err(), "accepted {:?}", bytes);
+    }
+
+    // First and last code points of each encoded length, plus the scalar
+    // values on either side of the surrogate gap.
+    let accepted: &[&[u8]] = &[
+        &[0xC2, 0x80],
+        &[0xDF, 0xBF],
+        &[0xE0, 0xA0, 0x80],
+        &[0xED, 0x9F, 0xBF],
+        &[0xEE, 0x80, 0x80],
+        &[0xEF, 0xBF, 0xBF],
+        &[0xF0, 0x90, 0x80, 0x80],
+        &[0xF4, 0x8F, 0xBF, 0xBF],
+    ];
+    for bytes in accepted {
+        assert!(str_from_utf8(bytes).is_ok(), "rejected {:?}", bytes);
+    }
+}
+
+/// Decodes `input` leniently, in the manner of `String::from_utf8_lossy`.
+///
+/// Fully valid input is returned borrowed. Otherwise an owned string is
+/// built in which every invalid sequence, and an incomplete sequence at the
+/// end of the input, is replaced by `REPLACEMENT_CHARACTER`.
+pub fn string_from_utf8_lossy(input: &[u8]) -> Cow<str> {
+    // Fast path: nothing to repair, so nothing to copy.
+    let mut pending = match decode(input) {
+        Ok(valid) => return Cow::Borrowed(valid),
+        Err(error) => error,
+    };
+    let mut lossy = String::with_capacity(input.len() + REPLACEMENT_CHARACTER.len());
+    loop {
+        let rest = match pending {
+            DecodeError::Incomplete { valid_prefix, .. } => {
+                // The input stops in the middle of a code point: one final
+                // replacement and we are done.
+                lossy.push_str(valid_prefix);
+                lossy.push_str(REPLACEMENT_CHARACTER);
+                break;
+            }
+            DecodeError::Invalid { valid_prefix, remaining_input, .. } => {
+                lossy.push_str(valid_prefix);
+                lossy.push_str(REPLACEMENT_CHARACTER);
+                remaining_input
+            }
+        };
+        // Resume decoding just past the sequence that was replaced.
+        match decode(rest) {
+            Ok(tail) => {
+                lossy.push_str(tail);
+                break;
+            }
+            Err(error) => pending = error,
+        }
+    }
+    Cow::Owned(lossy)
+}
+
+/// Pairs of raw input bytes and the text that lossy decoding is expected to
+/// produce for them; shared by the lossy-decoding tests in this file.
+pub const DECODED_LOSSY: &[(&[u8], &str)] = &[
+    // valid input passes through unchanged
+    (b"hello", "hello"),
+    (b"\xe0\xb8\xa8\xe0\xb9\x84\xe0\xb8\x97\xe0\xb8\xa2\xe4\xb8\xad\xe5\x8d\x8e", "ศไทย中华"),
+    (b"Vi\xe1\xbb\x87t Nam", "Việt Nam"),
+    // malformed input: each expected string shows where U+FFFD is inserted
+    (b"Hello\xC2 There\xFF ", "Hello\u{FFFD} There\u{FFFD} "),
+    (b"Hello\xC0\x80 There", "Hello\u{FFFD}\u{FFFD} There"),
+    (b"\xE6\x83 Goodbye", "\u{FFFD} Goodbye"),
+    (b"\xF5foo\xF5\x80bar", "\u{FFFD}foo\u{FFFD}\u{FFFD}bar"),
+    (b"\xF5foo\xF5\xC2", "\u{FFFD}foo\u{FFFD}\u{FFFD}"),
+    (b"\xF1foo\xF1\x80bar\xF1\x80\x80baz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz"),
+    (b"\xF4foo\xF4\x80bar\xF4\xBFbaz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz"),
+    (b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar", "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar"),
+    (b"\xF0\x90\x80foo", "\u{FFFD}foo"),
+    // surrogates
+    (b"\xED\xA0\x80foo\xED\xBF\xBFbar", "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar"),
+];
+
+/// Runs `string_from_utf8_lossy` over every row of `DECODED_LOSSY`.
+#[test]
+fn test_string_from_utf8_lossy() {
+    for case in DECODED_LOSSY.iter() {
+        let (bytes, lossy) = *case;
+        assert_eq!(string_from_utf8_lossy(bytes), lossy);
+    }
+}
+
+/// Calls `f` once for every way of cutting `input` into consecutive,
+/// non-empty chunks. An empty `input` produces a single call with no chunks.
+pub fn all_partitions<'a, F>(input: &'a [u8], f: F)
+    where F: Fn(&[&[u8]])
+{
+    /// Completes the partial partition held in `prefix` with every possible
+    /// split of `rest`, reporting each finished partition to `f`.
+    fn extend_partition<'a, F>(prefix: &mut Vec<&'a [u8]>, rest: &'a [u8], f: &F)
+        where F: Fn(&[&[u8]])
+    {
+        if rest.is_empty() {
+            // Nothing left to cut: `prefix` is a complete partition.
+            f(prefix);
+            return;
+        }
+        for cut in 1..=rest.len() {
+            let (head, tail) = rest.split_at(cut);
+            prefix.push(head);
+            extend_partition(prefix, tail, f);
+            prefix.pop();
+        }
+    }
+
+    let mut stack = Vec::new();
+    extend_partition(&mut stack, input, &f);
+    // Every push above is paired with a pop.
+    assert_eq!(stack.len(), 0);
+}
+
+/// Feeds every row of `DECODED_LOSSY` to a `LossyDecoder`, once for each
+/// possible chunking of the input, and checks the accumulated output.
+#[test]
+fn test_incremental_decoder() {
+    for &(bytes, lossy) in DECODED_LOSSY.iter() {
+        all_partitions(bytes, |chunks| {
+            let mut decoded = String::new();
+            let mut decoder = LossyDecoder::new(|s| decoded.push_str(s));
+            for chunk in chunks {
+                decoder.feed(*chunk);
+            }
+            // Drop the decoder before inspecting the output, as the original
+            // inner scope did; this also releases its borrow of `decoded`.
+            drop(decoder);
+            assert_eq!(decoded, lossy);
+        });
+    }
+}
+
+/// Reads every row of `DECODED_LOSSY` through `BufReadDecoder`, once for
+/// each possible chunking of the input, and checks the decoded string.
+#[test]
+fn test_bufread_decoder() {
+    for &(bytes, lossy) in DECODED_LOSSY.iter() {
+        all_partitions(bytes, |chunks| {
+            // Each chunk becomes one buffer handed out by the reader.
+            let reader = Chunks(chunks.iter().cloned().collect());
+            let decoded = BufReadDecoder::read_to_string_lossy(reader).unwrap();
+            assert_eq!(decoded, lossy)
+        });
+    }
+}
+
+/// A reader over a fixed queue of byte chunks.
+///
+/// `fill_buf` exposes only the front chunk, so a consumer sees the input
+/// split exactly at the chunk boundaries. An empty queue, or a queue whose
+/// only remaining chunk is exhausted, reads as end of input.
+struct Chunks<'a>(VecDeque<&'a [u8]>);
+
+impl<'a> io::Read for Chunks<'a> {
+    /// Copies as much of the current chunk as fits into `out` and consumes
+    /// those bytes. Returns `Ok(0)` at end of input or when `out` is empty.
+    fn read(&mut self, out: &mut [u8]) -> io::Result<usize> {
+        let copied = {
+            let available = io::BufRead::fill_buf(self)?;
+            let copied = available.len().min(out.len());
+            out[..copied].copy_from_slice(&available[..copied]);
+            copied
+        };
+        io::BufRead::consume(self, copied);
+        Ok(copied)
+    }
+}
+
+impl<'a> io::BufRead for Chunks<'a> {
+    /// Returns the unread part of the front chunk, or an empty slice when
+    /// there are no chunks at all.
+    fn fill_buf(&mut self) -> io::Result<&[u8]> {
+        match self.0.front() {
+            Some(chunk) => Ok(*chunk),
+            // No chunks at all simply means end of input.
+            None => Ok(&[]),
+        }
+    }
+
+    /// Drops `bytes` bytes from the front chunk and moves on to the next
+    /// chunk once the front one is exhausted.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `bytes` exceeds the length of the front chunk.
+    fn consume(&mut self, bytes: usize) {
+        let exhausted = match self.0.front_mut() {
+            Some(front) => {
+                *front = &front[bytes..];
+                front.is_empty()
+            }
+            // Nothing is buffered, so there is nothing to consume.
+            None => return,
+        };
+        // The last chunk is kept even when empty, so that later `fill_buf`
+        // calls keep reporting end of input.
+        if exhausted && self.0.len() > 1 {
+            self.0.pop_front();
+        }
+    }
+
+}
author	Hyun Jae Moon <hyunjaemoon@google.com>	2023-06-23 21:41:31 +0000
committer	Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>	2023-06-23 21:41:31 +0000
commit	6070272c574057efa5c0cea72f589a3431ae6d3e (patch)
tree	445b30e380aba6746f6bcf0294d417c020eae443
parent	1162a6d0a1ef23da9eb950383ad80715145a9616 (diff)
parent	5ba1a60209f37a85832c19100efe5f4226cf50ea (diff)
download	utf-8-6070272c574057efa5c0cea72f589a3431ae6d3e.tar.gz