author     Jakub Kotur <qtr@google.com>  2021-03-16 19:18:13 +0000
committer  Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>  2021-03-16 19:18:13 +0000
commit     0696e21364edc85a52382f187f1da9e50587bbaf (patch)
tree       8464f4a84d51ffc9f8d833d899e70cfd9c5f4309
parent     9a8b9f84d17a7e40712987ab7968706a1f94edbb (diff)
parent     359dc5454334ad6d0de6a328ca6e8792c97d6d4c (diff)
download   csv-core-0696e21364edc85a52382f187f1da9e50587bbaf.tar.gz
Initial import of csv-core-0.1.10. am: 7e6f3508c3 am: 359dc54543
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/csv-core/+/1621401

Change-Id: I2bb6678500fae7573ff9bdabef4bd890acacd2cd
-rw-r--r--  .cargo_vcs_info.json     5
-rw-r--r--  Cargo.toml              43
-rw-r--r--  Cargo.toml.orig         31
-rw-r--r--  README.md              113
-rw-r--r--  benches/bench.rs        94
-rw-r--r--  src/lib.rs             189
-rw-r--r--  src/reader.rs         2005
-rw-r--r--  src/writer.rs         1047
8 files changed, 3527 insertions, 0 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
new file mode 100644
index 0000000..6de0194
--- /dev/null
+++ b/.cargo_vcs_info.json
@@ -0,0 +1,5 @@
+{
+ "git": {
+ "sha1": "70c8600b29349f9ee0501577284d8300ae9c8055"
+ }
+}
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..0f43cb0
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,43 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies
+#
+# If you believe there's an error in this file please file an
+# issue against the rust-lang/cargo repository. If you're
+# editing this file be aware that the upstream Cargo.toml
+# will likely look very different (and much more reasonable)
+
+[package]
+edition = "2018"
+name = "csv-core"
+version = "0.1.10"
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+description = "Bare bones CSV parsing with no_std support."
+homepage = "https://github.com/BurntSushi/rust-csv"
+documentation = "https://docs.rs/csv-core"
+readme = "README.md"
+keywords = ["csv", "comma", "parser", "delimited", "no_std"]
+categories = ["encoding", "no-std", "parser-implementations"]
+license = "Unlicense/MIT"
+repository = "https://github.com/BurntSushi/rust-csv"
+
+[lib]
+bench = false
+[dependencies.memchr]
+version = "2"
+default-features = false
+[dev-dependencies.arrayvec]
+version = "0.5"
+default-features = false
+
+[features]
+default = []
+libc = ["memchr/libc"]
+[badges.appveyor]
+repository = "BurntSushi/rust-csv"
+
+[badges.travis-ci]
+repository = "BurntSushi/rust-csv"
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
new file mode 100644
index 0000000..110ee06
--- /dev/null
+++ b/Cargo.toml.orig
@@ -0,0 +1,31 @@
+[package]
+name = "csv-core"
+version = "0.1.10" #:version
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+description = "Bare bones CSV parsing with no_std support."
+documentation = "https://docs.rs/csv-core"
+homepage = "https://github.com/BurntSushi/rust-csv"
+repository = "https://github.com/BurntSushi/rust-csv"
+readme = "README.md"
+keywords = ["csv", "comma", "parser", "delimited", "no_std"]
+license = "Unlicense/MIT"
+categories = ["encoding", "no-std", "parser-implementations"]
+workspace = ".."
+edition = "2018"
+
+[badges]
+travis-ci = { repository = "BurntSushi/rust-csv" }
+appveyor = { repository = "BurntSushi/rust-csv" }
+
+[lib]
+bench = false
+
+[features]
+default = []
+libc = ["memchr/libc"]
+
+[dependencies]
+memchr = { version = "2", default-features = false }
+
+[dev-dependencies]
+arrayvec = { version = "0.5", default-features = false }
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..07afd99
--- /dev/null
+++ b/README.md
@@ -0,0 +1,113 @@
+csv-core
+========
+A fast CSV reader and writer for use in a `no_std` context. This crate will
+never use the Rust standard library.
+
+[![Linux build status](https://api.travis-ci.org/BurntSushi/rust-csv.png)](https://travis-ci.org/BurntSushi/rust-csv)
+[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/rust-csv?svg=true)](https://ci.appveyor.com/project/BurntSushi/rust-csv)
+[![](http://meritbadge.herokuapp.com/csv-core)](https://crates.io/crates/csv-core)
+
+Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
+
+### Documentation
+
+https://docs.rs/csv-core
+
+### Usage
+
+Add this to your `Cargo.toml`:
+
+```toml
+[dependencies]
+csv-core = "0.1.6"
+```
+
+### Build features
+
+This crate exposes a `libc` feature, which forwards to `memchr`'s `libc`
+feature. It is disabled by default (`default = []` in `Cargo.toml`); enabling
+it adds a dependency on `libc`.
+
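+For example, to opt into the `libc` feature (version number illustrative):
+
+```toml
+[dependencies]
+csv-core = { version = "0.1.10", features = ["libc"] }
+```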
+
+### Example: reading CSV
+
+This example shows how to count the number of fields and records in CSV data.
+
+```rust
+use csv_core::{Reader, ReadFieldResult};
+
+let data = "
+foo,bar,baz
+a,b,c
+xxx,yyy,zzz
+";
+
+let mut rdr = Reader::new();
+let mut bytes = data.as_bytes();
+let mut count_fields = 0;
+let mut count_records = 0;
+loop {
+ // We skip handling the output since we don't need it for counting.
+ let (result, nin, _) = rdr.read_field(bytes, &mut [0; 1024]);
+ bytes = &bytes[nin..];
+ match result {
+ ReadFieldResult::InputEmpty => {},
+ ReadFieldResult::OutputFull => panic!("field too large"),
+ ReadFieldResult::Field { record_end } => {
+ count_fields += 1;
+ if record_end {
+ count_records += 1;
+ }
+ }
+ ReadFieldResult::End => break,
+ }
+}
+assert_eq!(3, count_records);
+assert_eq!(9, count_fields);
+```
+
+
+### Example: writing CSV
+
+This example shows how to use the `Writer` API to write valid CSV data. Proper
+quoting is handled automatically.
+
+```rust
+use csv_core::Writer;
+
+// This is where we'll write out CSV data.
+let mut out = &mut [0; 1024];
+// The number of bytes we've written to `out`.
+let mut nout = 0;
+// Create a CSV writer with a default configuration.
+let mut wtr = Writer::new();
+
+// Write a single field. Note that we ignore the `WriteResult` and the number
+// of input bytes consumed since we're doing this by hand.
+let (_, _, n) = wtr.field(&b"foo"[..], &mut out[nout..]);
+nout += n;
+
+// Write a delimiter and then another field that requires quotes.
+let (_, n) = wtr.delimiter(&mut out[nout..]);
+nout += n;
+let (_, _, n) = wtr.field(&b"bar,baz"[..], &mut out[nout..]);
+nout += n;
+let (_, n) = wtr.terminator(&mut out[nout..]);
+nout += n;
+
+// Now write another record.
+let (_, _, n) = wtr.field(&b"a \"b\" c"[..], &mut out[nout..]);
+nout += n;
+let (_, n) = wtr.delimiter(&mut out[nout..]);
+nout += n;
+let (_, _, n) = wtr.field(&b"quux"[..], &mut out[nout..]);
+nout += n;
+
+// We must always call finish once done writing.
+// This ensures that any closing quotes are written.
+let (_, n) = wtr.finish(&mut out[nout..]);
+nout += n;
+
+assert_eq!(&out[..nout], &b"\
+foo,\"bar,baz\"
+\"a \"\"b\"\" c\",quux"[..]);
+```
diff --git a/benches/bench.rs b/benches/bench.rs
new file mode 100644
index 0000000..2aa24a9
--- /dev/null
+++ b/benches/bench.rs
@@ -0,0 +1,94 @@
+#![feature(test)]
+
+extern crate test;
+
+use test::Bencher;
+
+use csv_core::{Reader, ReaderBuilder};
+
+static NFL: &'static str = include_str!("../../examples/data/bench/nfl.csv");
+static GAME: &'static str = include_str!("../../examples/data/bench/game.csv");
+static POP: &'static str =
+ include_str!("../../examples/data/bench/worldcitiespop.csv");
+static MBTA: &'static str =
+ include_str!("../../examples/data/bench/gtfs-mbta-stop-times.csv");
+
+macro_rules! bench {
+ ($name:ident, $data:ident, $counter:ident, $result:expr) => {
+ bench!($name, $data, $counter, $result, false);
+ };
+ ($name:ident, $data:ident, $counter:ident, $result:expr, NFA) => {
+ bench!($name, $data, $counter, $result, true);
+ };
+ ($name:ident, $data:ident, $counter:ident, $result:expr, $nfa:expr) => {
+ #[bench]
+ fn $name(b: &mut Bencher) {
+ let data = $data.as_bytes();
+ b.bytes = data.len() as u64;
+ let mut rdr = ReaderBuilder::new().nfa($nfa).build();
+ b.iter(|| {
+ rdr.reset();
+ assert_eq!($counter(&mut rdr, data), $result);
+ })
+ }
+ };
+}
+
+bench!(count_nfl_field_copy_dfa, NFL, count_fields, 130000);
+bench!(count_nfl_field_copy_nfa, NFL, count_fields, 130000, NFA);
+bench!(count_nfl_record_copy_dfa, NFL, count_records, 10000);
+bench!(count_nfl_record_copy_nfa, NFL, count_records, 10000, NFA);
+
+bench!(count_game_field_copy_dfa, GAME, count_fields, 600000);
+bench!(count_game_field_copy_nfa, GAME, count_fields, 600000, NFA);
+bench!(count_game_record_copy_dfa, GAME, count_records, 100000);
+bench!(count_game_record_copy_nfa, GAME, count_records, 100000, NFA);
+
+bench!(count_pop_field_copy_dfa, POP, count_fields, 140007);
+bench!(count_pop_field_copy_nfa, POP, count_fields, 140007, NFA);
+bench!(count_pop_record_copy_dfa, POP, count_records, 20001);
+bench!(count_pop_record_copy_nfa, POP, count_records, 20001, NFA);
+
+bench!(count_mbta_field_copy_dfa, MBTA, count_fields, 90000);
+bench!(count_mbta_field_copy_nfa, MBTA, count_fields, 90000, NFA);
+bench!(count_mbta_record_copy_dfa, MBTA, count_records, 10000);
+bench!(count_mbta_record_copy_nfa, MBTA, count_records, 10000, NFA);
+
+fn count_fields(rdr: &mut Reader, mut data: &[u8]) -> u64 {
+ use csv_core::ReadFieldResult::*;
+
+ let mut count = 0;
+ let mut field = [0u8; 1024];
+ loop {
+ let (res, nin, _) = rdr.read_field(data, &mut field);
+ data = &data[nin..];
+ match res {
+ InputEmpty => {}
+ OutputFull => panic!("field too large"),
+ Field { .. } => {
+ count += 1;
+ }
+ End => break,
+ }
+ }
+ count
+}
+
+fn count_records(rdr: &mut Reader, mut data: &[u8]) -> u64 {
+ use csv_core::ReadRecordResult::*;
+
+ let mut count = 0;
+ let mut record = [0; 8192];
+ let mut ends = [0; 32];
+ loop {
+ let (res, nin, _, _) = rdr.read_record(data, &mut record, &mut ends);
+ data = &data[nin..];
+ match res {
+ InputEmpty => {}
+ OutputFull | OutputEndsFull => panic!("field too large"),
+ Record => count += 1,
+ End => break,
+ }
+ }
+ count
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..747e58d
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,189 @@
+/*!
+`csv-core` provides a fast CSV reader and writer for use in a `no_std` context.
+
+This crate will never use the standard library. `no_std` support is therefore
+enabled by default.
+
+If you're looking for more ergonomic CSV parsing routines, please use the
+[`csv`](https://docs.rs/csv) crate.
+
+# Overview
+
+This crate has two primary APIs. The `Reader` API provides a CSV parser, and
+the `Writer` API provides a CSV writer.
+
+# Example: reading CSV
+
+This example shows how to count the number of fields and records in CSV data.
+
+```
+use csv_core::{Reader, ReadFieldResult};
+
+let data = "
+foo,bar,baz
+a,b,c
+xxx,yyy,zzz
+";
+
+let mut rdr = Reader::new();
+let mut bytes = data.as_bytes();
+let mut count_fields = 0;
+let mut count_records = 0;
+loop {
+ // We skip handling the output since we don't need it for counting.
+ let (result, nin, _) = rdr.read_field(bytes, &mut [0; 1024]);
+ bytes = &bytes[nin..];
+ match result {
+ ReadFieldResult::InputEmpty => {},
+ ReadFieldResult::OutputFull => panic!("field too large"),
+ ReadFieldResult::Field { record_end } => {
+ count_fields += 1;
+ if record_end {
+ count_records += 1;
+ }
+ }
+ ReadFieldResult::End => break,
+ }
+}
+assert_eq!(3, count_records);
+assert_eq!(9, count_fields);
+```
+
+# Example: writing CSV
+
+This example shows how to use the `Writer` API to write valid CSV data. Proper
+quoting is handled automatically.
+
+```
+use csv_core::Writer;
+
+// This is where we'll write out CSV data.
+let mut out = &mut [0; 1024];
+// The number of bytes we've written to `out`.
+let mut nout = 0;
+// Create a CSV writer with a default configuration.
+let mut wtr = Writer::new();
+
+// Write a single field. Note that we ignore the `WriteResult` and the number
+// of input bytes consumed since we're doing this by hand.
+let (_, _, n) = wtr.field(&b"foo"[..], &mut out[nout..]);
+nout += n;
+
+// Write a delimiter and then another field that requires quotes.
+let (_, n) = wtr.delimiter(&mut out[nout..]);
+nout += n;
+let (_, _, n) = wtr.field(&b"bar,baz"[..], &mut out[nout..]);
+nout += n;
+let (_, n) = wtr.terminator(&mut out[nout..]);
+nout += n;
+
+// Now write another record.
+let (_, _, n) = wtr.field(&b"a \"b\" c"[..], &mut out[nout..]);
+nout += n;
+let (_, n) = wtr.delimiter(&mut out[nout..]);
+nout += n;
+let (_, _, n) = wtr.field(&b"quux"[..], &mut out[nout..]);
+nout += n;
+
+// We must always call finish once done writing.
+// This ensures that any closing quotes are written.
+let (_, n) = wtr.finish(&mut out[nout..]);
+nout += n;
+
+assert_eq!(&out[..nout], &b"\
+foo,\"bar,baz\"
+\"a \"\"b\"\" c\",quux"[..]);
+```
+*/
+
+#![deny(missing_docs)]
+#![no_std]
+
+pub use crate::reader::{
+ ReadFieldNoCopyResult, ReadFieldResult, ReadRecordNoCopyResult,
+ ReadRecordResult, Reader, ReaderBuilder,
+};
+pub use crate::writer::{
+ is_non_numeric, quote, WriteResult, Writer, WriterBuilder,
+};
+
+mod reader;
+mod writer;
+
+/// A record terminator.
+///
+/// Use this to specify the record terminator while parsing CSV. The default is
+/// CRLF, which treats `\r`, `\n` or `\r\n` as a single record terminator.
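+///
+/// For example, a sketch of configuring a one-byte terminator through
+/// `ReaderBuilder` (defined later in this crate):
+///
+/// ```
+/// use csv_core::{ReaderBuilder, Terminator};
+///
+/// let rdr = ReaderBuilder::new().terminator(Terminator::Any(b';')).build();
+/// ```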
+#[derive(Clone, Copy, Debug)]
+pub enum Terminator {
+ /// Parses `\r`, `\n` or `\r\n` as a single record terminator.
+ CRLF,
+ /// Parses the byte given as a record terminator.
+ Any(u8),
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ #[doc(hidden)]
+ __Nonexhaustive,
+}
+
+impl Terminator {
+ /// Checks whether the terminator is set to CRLF.
+ fn is_crlf(&self) -> bool {
+ match *self {
+ Terminator::CRLF => true,
+ Terminator::Any(_) => false,
+ _ => unreachable!(),
+ }
+ }
+
+ fn equals(&self, other: u8) -> bool {
+ match *self {
+ Terminator::CRLF => other == b'\r' || other == b'\n',
+ Terminator::Any(b) => other == b,
+ _ => unreachable!(),
+ }
+ }
+}
+
+impl Default for Terminator {
+ fn default() -> Terminator {
+ Terminator::CRLF
+ }
+}
+
+/// The quoting style to use when writing CSV data.
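+///
+/// A minimal sketch of selecting a style, assuming the `quote_style` knob on
+/// `WriterBuilder` (defined in `writer.rs`, not shown in this hunk):
+///
+/// ```
+/// use csv_core::{QuoteStyle, WriterBuilder};
+///
+/// let wtr = WriterBuilder::new().quote_style(QuoteStyle::Always).build();
+/// ```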
+#[derive(Clone, Copy, Debug)]
+pub enum QuoteStyle {
+ /// This puts quotes around every field. Always.
+ Always,
+ /// This puts quotes around fields only when necessary.
+ ///
+ /// They are necessary when fields contain a quote, delimiter or record
+ /// terminator. Quotes are also necessary when writing an empty record
+ /// (which is indistinguishable from a record with one empty field).
+ ///
+ /// This is the default.
+ Necessary,
+ /// This puts quotes around all fields that are non-numeric. Namely, when
+ /// writing a field that does not parse as a valid float or integer, then
+ /// quotes will be used even if they aren't strictly necessary.
+ NonNumeric,
+ /// This *never* writes quotes, even if it would produce invalid CSV data.
+ Never,
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ #[doc(hidden)]
+ __Nonexhaustive,
+}
+
+impl Default for QuoteStyle {
+ fn default() -> QuoteStyle {
+ QuoteStyle::Necessary
+ }
+}
diff --git a/src/reader.rs b/src/reader.rs
new file mode 100644
index 0000000..dbd6dc3
--- /dev/null
+++ b/src/reader.rs
@@ -0,0 +1,2005 @@
+use core::fmt;
+
+use crate::Terminator;
+
+// BE ADVISED
+//
+// This may just be one of the more complicated CSV parsers you'll come across.
+// The implementation never allocates and consists of both a functional NFA
+// parser and a DFA parser. The DFA parser is the workhorse and we could elide
+// much of the work involved in making the NFA parser work, but the NFA parser
+// is much easier to debug. The NFA parser is tested alongside the DFA parser,
+// so they should never be out of sync.
+//
+// The basic structure of the implementation is to encode the NFA parser as
+// an explicit state machine in code. The DFA is then generated by populating
+// a transition table on the stack by exhaustively enumerating all possible
+// states on all possible inputs (this is possible because the number of states
+// and the number of inputs is very small).
+//
+// Note that some pieces of the NFA parser (such as the NFA state machine) are
+// required. In particular, the translation from the NFA to the DFA depends on
+// the configuration of the CSV parser as given by the caller, and indeed, this
+// is one of the key performance benefits of the DFA: it doesn't have any
+// overhead (other than a bigger transition table) associated with the number
+// of configuration options.
+//
+// ADVICE FOR HACKERS
+//
+// This code is too clever for its own good. As such, changes to some parts of
+// the code may have a non-obvious impact on other parts. This is mostly
+// motivated by trying to keep the DFA transition table as small as possible,
+// since it is stored on the stack. Here are some tips that may save you some
+// time:
+//
+// * If you add a new NFA state, then you also need to consider how it impacts
+// the DFA. If all of the incoming transitions into an NFA state are
+// epsilon transitions, then it probably isn't materialized in the DFA.
+// If the NFA state indicates that a field or a record has been parsed, then
+// it should be considered final. Let the comments in `NfaState` be your
+// guide.
+// * If you add a new configuration knob to the parser, then you may need to
+// modify the `TRANS_CLASSES` constant below. The `TRANS_CLASSES` constant
+// indicates the total number of discriminating bytes in the DFA. And if you
+// modify `TRANS_CLASSES`, you probably also need to modify `build_dfa` to
+// add a new class. For example, in order to add parsing support for
+// comments, I bumped `TRANS_CLASSES` from `6` to `7` and added the comment
+// byte (if one exists) to the list of classes in `build_dfa`.
+// * The special DFA start state doubles as the final state once all input
+// from the caller has been exhausted. We must be careful to guard this
+// case analysis on whether the input is actually exhausted, since the start
+// state is an otherwise valid state.
+
+/// A pull-based CSV reader.
+///
+/// This reader parses CSV data using a finite state machine. Callers can
+/// extract parsed data incrementally using one of the `read` methods.
+///
+/// Note that this CSV reader is somewhat encoding agnostic. The source data
+/// needs to be at least ASCII compatible. There is no support for specifying
+/// the full gamut of Unicode delimiters/terminators/quotes/escapes. Instead,
+/// any byte can be used, although callers probably want to stick to the ASCII
+/// subset (`<= 0x7F`).
+///
+/// # Usage
+///
+/// A reader has two different ways to read CSV data, each with its own
+/// trade-offs.
+///
+/// * `read_field` - Copies a single CSV field into an output buffer while
+/// unescaping quotes. This is simple to use and doesn't require storing an
+/// entire record contiguously in memory, but it is slower.
+/// * `read_record` - Copies an entire CSV record into an output buffer while
+/// unescaping quotes. The ending positions of each field are copied into
+/// an additional buffer. This is harder to use and requires larger output
+/// buffers, but it is faster than `read_field` since it amortizes more
+/// costs. A sketch of this loop follows below.
+///
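+/// A minimal `read_record` loop might look like the following sketch (buffer
+/// sizes are illustrative; compare `count_records` in `benches/bench.rs`):
+///
+/// ```
+/// use csv_core::{ReadRecordResult, Reader};
+///
+/// let mut rdr = Reader::new();
+/// let mut bytes = "a,b,c\nx,y,z\n".as_bytes();
+/// let (mut output, mut ends) = ([0; 1024], [0; 16]);
+/// let mut nrecords = 0;
+/// loop {
+///     let (res, nin, _, _) = rdr.read_record(bytes, &mut output, &mut ends);
+///     bytes = &bytes[nin..];
+///     match res {
+///         ReadRecordResult::InputEmpty => {}
+///         ReadRecordResult::OutputFull
+///         | ReadRecordResult::OutputEndsFull => panic!("record too large"),
+///         ReadRecordResult::Record => nrecords += 1,
+///         ReadRecordResult::End => break,
+///     }
+/// }
+/// assert_eq!(2, nrecords);
+/// ```
+///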
+/// # RFC 4180
+///
+/// [RFC 4180](https://tools.ietf.org/html/rfc4180)
+/// is the closest thing to a specification for CSV data. Unfortunately,
+/// CSV data that is seen in the wild can vary significantly. Often, the CSV
+/// data is outright invalid. Instead of fixing the producers of bad CSV data,
+/// we have seen fit to make consumers much more flexible in what they accept.
+/// This reader continues that tradition, and therefore, isn't technically
+/// compliant with RFC 4180. In particular, this reader will never return an
+/// error and will always find *a* parse.
+///
+/// Here are some detailed differences from RFC 4180:
+///
+/// * CRLF, LF and CR are each treated as a single record terminator by
+/// default.
+/// * Records are permitted to be of varying length.
+/// * Empty lines (that do not include other whitespace) are ignored.
+#[derive(Clone, Debug)]
+pub struct Reader {
+ /// A table-based DFA for parsing CSV.
+ dfa: Dfa,
+ /// The current DFA state, if the DFA is used.
+ dfa_state: DfaState,
+ /// The current NFA state, if the NFA is used.
+ nfa_state: NfaState,
+ /// The delimiter that separates fields.
+ delimiter: u8,
+ /// The terminator that separates records.
+ term: Terminator,
+ /// The quotation byte.
+ quote: u8,
+ /// Whether to recognize escaped quotes.
+ escape: Option<u8>,
+ /// Whether to recognize doubled quotes.
+ double_quote: bool,
+ /// If enabled, lines beginning with this byte are ignored.
+ comment: Option<u8>,
+ /// If enabled (the default), then quotes are respected. When disabled,
+ /// quotes are not treated specially.
+ quoting: bool,
+ /// Whether to use the NFA for parsing.
+ ///
+ /// Generally this is for debugging. There's otherwise no good reason
+ /// to avoid the DFA.
+ use_nfa: bool,
+ /// The current line number.
+ line: u64,
+ /// Whether this parser has ever read anything.
+ has_read: bool,
+ /// The current position in the output buffer when reading a record.
+ output_pos: usize,
+}
+
+impl Default for Reader {
+ fn default() -> Reader {
+ Reader {
+ dfa: Dfa::new(),
+ dfa_state: DfaState::start(),
+ nfa_state: NfaState::StartRecord,
+ delimiter: b',',
+ term: Terminator::default(),
+ quote: b'"',
+ escape: None,
+ double_quote: true,
+ comment: None,
+ quoting: true,
+ use_nfa: false,
+ line: 1,
+ has_read: false,
+ output_pos: 0,
+ }
+ }
+}
+
+/// Builds a CSV reader with various configuration knobs.
+///
+/// This builder can be used to tweak the field delimiter, record terminator
+/// and more for parsing CSV. Once a CSV `Reader` is built, its configuration
+/// cannot be changed.
+#[derive(Debug, Default)]
+pub struct ReaderBuilder {
+ rdr: Reader,
+}
+
+impl ReaderBuilder {
+ /// Create a new builder.
+ pub fn new() -> ReaderBuilder {
+ ReaderBuilder::default()
+ }
+
+ /// Build a CSV parser from this configuration.
+ pub fn build(&self) -> Reader {
+ let mut rdr = self.rdr.clone();
+ rdr.build_dfa();
+ rdr
+ }
+
+ /// The field delimiter to use when parsing CSV.
+ ///
+ /// The default is `b','`.
+ pub fn delimiter(&mut self, delimiter: u8) -> &mut ReaderBuilder {
+ self.rdr.delimiter = delimiter;
+ self
+ }
+
+ /// The record terminator to use when parsing CSV.
+ ///
+ /// A record terminator can be any single byte. The default is a special
+ /// value, `Terminator::CRLF`, which treats any occurrence of `\r`, `\n`
+ /// or `\r\n` as a single record terminator.
+ pub fn terminator(&mut self, term: Terminator) -> &mut ReaderBuilder {
+ self.rdr.term = term;
+ self
+ }
+
+ /// The quote character to use when parsing CSV.
+ ///
+ /// The default is `b'"'`.
+ pub fn quote(&mut self, quote: u8) -> &mut ReaderBuilder {
+ self.rdr.quote = quote;
+ self
+ }
+
+ /// The escape character to use when parsing CSV.
+ ///
+ /// In some variants of CSV, quotes are escaped using a special escape
+ /// character like `\` (instead of escaping quotes by doubling them).
+ ///
+ /// By default, recognizing these idiosyncratic escapes is disabled.
+ pub fn escape(&mut self, escape: Option<u8>) -> &mut ReaderBuilder {
+ self.rdr.escape = escape;
+ self
+ }
+
+ /// Enable double quote escapes.
+ ///
+ /// This is enabled by default, but it may be disabled. When disabled,
+ /// doubled quotes are not interpreted as escapes.
+ pub fn double_quote(&mut self, yes: bool) -> &mut ReaderBuilder {
+ self.rdr.double_quote = yes;
+ self
+ }
+
+ /// Enable or disable quoting.
+ ///
+ /// This is enabled by default, but it may be disabled. When disabled,
+ /// quotes are not treated specially.
+ pub fn quoting(&mut self, yes: bool) -> &mut ReaderBuilder {
+ self.rdr.quoting = yes;
+ self
+ }
+
+ /// The comment character to use when parsing CSV.
+ ///
+ /// If the start of a record begins with the byte given here, then that
+ /// line is ignored by the CSV parser.
+ ///
+ /// This is disabled by default.
+ pub fn comment(&mut self, comment: Option<u8>) -> &mut ReaderBuilder {
+ self.rdr.comment = comment;
+ self
+ }
+
+ /// A convenience method for specifying a configuration to read ASCII
+ /// delimited text.
+ ///
+ /// This sets the delimiter and record terminator to the ASCII unit
+ /// separator (`\x1F`) and record separator (`\x1E`), respectively.
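+ ///
+ /// Sketch:
+ ///
+ /// ```
+ /// use csv_core::ReaderBuilder;
+ ///
+ /// let rdr = ReaderBuilder::new().ascii().build();
+ /// ```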
+ pub fn ascii(&mut self) -> &mut ReaderBuilder {
+ self.delimiter(b'\x1F').terminator(Terminator::Any(b'\x1E'))
+ }
+
+ /// Enable or disable the NFA for parsing CSV.
+ ///
+ /// This is intended as a debugging aid. The NFA is always slower than
+ /// the DFA.
+ #[doc(hidden)]
+ pub fn nfa(&mut self, yes: bool) -> &mut ReaderBuilder {
+ self.rdr.use_nfa = yes;
+ self
+ }
+}
+
+/// The result of parsing at most one field from CSV data.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ReadFieldResult {
+ /// The caller provided input was exhausted before the end of a field or
+ /// record was found.
+ InputEmpty,
+ /// The caller provided output buffer was filled before an entire field
+ /// could be written to it.
+ OutputFull,
+ /// The end of a field was found.
+ ///
+ /// Note that when `record_end` is true, then the end of this field also
+ /// corresponds to the end of a record.
+ Field {
+ /// Whether this was the last field in a record or not.
+ record_end: bool,
+ },
+ /// All CSV data has been read.
+ ///
+ /// This state can only be returned when an empty input buffer is provided
+ /// by the caller.
+ End,
+}
+
+impl ReadFieldResult {
+ fn from_nfa(
+ state: NfaState,
+ inpdone: bool,
+ outdone: bool,
+ ) -> ReadFieldResult {
+ match state {
+ NfaState::End => ReadFieldResult::End,
+ NfaState::EndRecord | NfaState::CRLF => {
+ ReadFieldResult::Field { record_end: true }
+ }
+ NfaState::EndFieldDelim => {
+ ReadFieldResult::Field { record_end: false }
+ }
+ _ => {
+ assert!(!state.is_field_final());
+ if !inpdone && outdone {
+ ReadFieldResult::OutputFull
+ } else {
+ ReadFieldResult::InputEmpty
+ }
+ }
+ }
+ }
+}
+
+/// The result of parsing at most one field from CSV data while ignoring the
+/// output.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ReadFieldNoCopyResult {
+ /// The caller provided input was exhausted before the end of a field or
+ /// record was found.
+ InputEmpty,
+ /// The end of a field was found.
+ ///
+ /// Note that when `record_end` is true, then the end of this field also
+ /// corresponds to the end of a record.
+ Field {
+ /// Whether this was the last field in a record or not.
+ record_end: bool,
+ },
+ /// All CSV data has been read.
+ ///
+ /// This state can only be returned when an empty input buffer is provided
+ /// by the caller.
+ End,
+}
+
+/// The result of parsing at most one record from CSV data.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ReadRecordResult {
+ /// The caller provided input was exhausted before the end of a record was
+ /// found.
+ InputEmpty,
+ /// The caller provided output buffer was filled before an entire field
+ /// could be written to it.
+ OutputFull,
+ /// The caller provided output buffer of field end positions was filled
+ /// before the next field could be parsed.
+ OutputEndsFull,
+ /// The end of a record was found.
+ Record,
+ /// All CSV data has been read.
+ ///
+ /// This state can only be returned when an empty input buffer is provided
+ /// by the caller.
+ End,
+}
+
+impl ReadRecordResult {
+ fn is_record(&self) -> bool {
+ *self == ReadRecordResult::Record
+ }
+
+ fn from_nfa(
+ state: NfaState,
+ inpdone: bool,
+ outdone: bool,
+ endsdone: bool,
+ ) -> ReadRecordResult {
+ match state {
+ NfaState::End => ReadRecordResult::End,
+ NfaState::EndRecord | NfaState::CRLF => ReadRecordResult::Record,
+ _ => {
+ assert!(!state.is_record_final());
+ if !inpdone && outdone {
+ ReadRecordResult::OutputFull
+ } else if !inpdone && endsdone {
+ ReadRecordResult::OutputEndsFull
+ } else {
+ ReadRecordResult::InputEmpty
+ }
+ }
+ }
+ }
+}
+
+/// The result of parsing at most one record from CSV data while ignoring
+/// output.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ReadRecordNoCopyResult {
+ /// The caller provided input was exhausted before the end of a record was
+ /// found.
+ InputEmpty,
+ /// The end of a record was found.
+ Record,
+ /// All CSV data has been read.
+ ///
+ /// This state can only be returned when an empty input buffer is provided
+ /// by the caller.
+ End,
+}
+
+/// What should be done with an input byte during an NFA transition.
+#[derive(Clone, Debug, Eq, PartialEq)]
+enum NfaInputAction {
+ // Do not consume an input byte.
+ Epsilon,
+ // Copy the input byte to a caller-provided output buffer.
+ CopyToOutput,
+ // Consume but do not copy the input byte (for example, a field
+ // delimiter is consumed but should not be copied to the output
+ // buffer).
+ Discard,
+}
+
+/// An NFA state is a state that can be visited in the NFA parser.
+///
+/// Given the simplicity of the machine, a subset of NFA states double as DFA
+/// states. NFA states that only have incoming epsilon transitions are
+/// optimized out when converting the machine to a DFA.
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+enum NfaState {
+ // These states aren't used in the DFA, so we
+ // assign them meaningless numbers.
+ EndFieldTerm = 200,
+ InRecordTerm = 201,
+ End = 202,
+
+ // All states below are DFA states.
+ StartRecord = 0,
+ StartField = 1,
+ InField = 2,
+ InQuotedField = 3,
+ InEscapedQuote = 4,
+ InDoubleEscapedQuote = 5,
+ InComment = 6,
+ // All states below are "final field" states.
+ // Namely, they indicate that a field has been parsed.
+ EndFieldDelim = 7,
+ // All states below are "final record" states.
+ // Namely, they indicate that a record has been parsed.
+ EndRecord = 8,
+ CRLF = 9,
+}
+
+/// A list of NFA states that have an explicit representation in the DFA.
+const NFA_STATES: &'static [NfaState] = &[
+ NfaState::StartRecord,
+ NfaState::StartField,
+ NfaState::EndFieldDelim,
+ NfaState::InField,
+ NfaState::InQuotedField,
+ NfaState::InEscapedQuote,
+ NfaState::InDoubleEscapedQuote,
+ NfaState::InComment,
+ NfaState::EndRecord,
+ NfaState::CRLF,
+];
+
+impl NfaState {
+ /// Returns true if this state indicates that a field has been parsed.
+ fn is_field_final(&self) -> bool {
+ match *self {
+ NfaState::End
+ | NfaState::EndRecord
+ | NfaState::CRLF
+ | NfaState::EndFieldDelim => true,
+ _ => false,
+ }
+ }
+
+ /// Returns true if this state indicates that a record has been parsed.
+ fn is_record_final(&self) -> bool {
+ match *self {
+ NfaState::End | NfaState::EndRecord | NfaState::CRLF => true,
+ _ => false,
+ }
+ }
+}
+
+impl Reader {
+ /// Create a new CSV reader with a default parser configuration.
+ pub fn new() -> Reader {
+ ReaderBuilder::new().build()
+ }
+
+ /// Reset the parser such that it behaves as if it had never been used.
+ ///
+ /// This may be useful when reading CSV data in a random access pattern.
+ pub fn reset(&mut self) {
+ self.dfa_state = self.dfa.new_state(NfaState::StartRecord);
+ self.nfa_state = NfaState::StartRecord;
+ self.line = 1;
+ self.has_read = false;
+ }
+
+ /// Return the current line number as measured by the number of occurrences
+ /// of `\n`.
+ ///
+ /// Line numbers start at `1` and are reset when `reset` is called.
+ pub fn line(&self) -> u64 {
+ self.line
+ }
+
+ /// Set the line number.
+ ///
+ /// This is useful after a call to `reset` where the caller knows the
+ /// line number from some additional context.
+ pub fn set_line(&mut self, line: u64) {
+ self.line = line;
+ }
+
+ /// Parse a single CSV field in `input` and copy field data to `output`.
+ ///
+ /// This routine requires a caller provided buffer of CSV data as the
+ /// `input` and a caller provided buffer, `output`, in which to store field
+ /// data extracted from `input`. The field data copied to `output` will
+ /// have its quotes unescaped.
+ ///
+ /// Calling this routine parses at most a single field and returns
+ /// three values indicating the state of the parser. The first value, a
+ /// `ReadFieldResult`, tells the caller what to do next. For example, if
+ /// the entire input was read or if the output buffer was filled before
+ /// a full field had been read, then `ReadFieldResult::InputEmpty` or
+ /// `ReadFieldResult::OutputFull` is returned, respectively. See the
+ /// documentation for `ReadFieldResult` for more details.
+ ///
+ /// The other two values returned correspond to the number of bytes
+ /// read from `input` and written to `output`, respectively.
+ ///
+ /// # Termination
+ ///
+ /// This reader interprets an empty `input` buffer as an indication that
+ /// there is no CSV data left to read. Namely, when the caller has
+ /// exhausted all CSV data, the caller should continue to call `read_field` with
+ /// an empty input buffer until `ReadFieldResult::End` is returned.
+ ///
+ /// # Errors
+ ///
+ /// This CSV reader can never return an error. Instead, it prefers *a*
+ /// parse over *no* parse.
+ pub fn read_field(
+ &mut self,
+ input: &[u8],
+ output: &mut [u8],
+ ) -> (ReadFieldResult, usize, usize) {
+ let (input, bom_nin) = self.strip_utf8_bom(input);
+ let (res, nin, nout) = if self.use_nfa {
+ self.read_field_nfa(input, output)
+ } else {
+ self.read_field_dfa(input, output)
+ };
+ self.has_read = true;
+ (res, nin + bom_nin, nout)
+ }
+
+ /// Parse a single CSV record in `input` and copy each field contiguously
+ /// to `output`, with the end position of each field written to `ends`.
+ ///
+ /// **NOTE**: This method is more cumbersome to use than `read_field`, but
+ /// it can be faster since it amortizes more work.
+ ///
+ /// This routine requires a caller provided buffer of CSV data as the
+ /// `input` and two caller provided buffers to store the unescaped field
+ /// data (`output`) and the end position of each field in the record
+ /// (`ends`).
+ ///
+ /// Calling this routine parses at most a single record and returns four
+ /// values indicating the state of the parser. The first value, a
+ /// `ReadRecordResult`, tells the caller what to do next. For example, if
+ /// the entire input was read or if the output buffer was filled before a
+ /// full field had been read, then `ReadRecordResult::InputEmpty` or
+ /// `ReadRecordResult::OutputFull` is returned, respectively. Similarly, if
+ /// the `ends` buffer is full, then `ReadRecordResult::OutputEndsFull` is
+ /// returned. See the documentation for `ReadRecordResult` for more
+ /// details.
+ ///
+ /// The other three values correspond to the number of bytes read from
+ /// `input`, the number of bytes written to `output` and the number of
+ /// end positions written to `ends`, respectively.
+ ///
+ /// The end positions written to `ends` are constructed as if there was
+ /// a single contiguous buffer in memory containing the entire row, even
+ /// if `ReadRecordResult::OutputFull` was returned in the middle of reading
+ /// a row.
+ ///
+ /// # Termination
+ ///
+ /// This reader interprets an empty `input` buffer as an indication that
+ /// there is no CSV data left to read. Namely, when the caller has
+ /// exhausted all CSV data, the caller should continue to call `read_record` with
+ /// an empty input buffer until `ReadRecordResult::End` is returned.
+ ///
+ /// # Errors
+ ///
+ /// This CSV reader can never return an error. Instead, it prefers *a*
+ /// parse over *no* parse.
+ pub fn read_record(
+ &mut self,
+ input: &[u8],
+ output: &mut [u8],
+ ends: &mut [usize],
+ ) -> (ReadRecordResult, usize, usize, usize) {
+ let (input, bom_nin) = self.strip_utf8_bom(input);
+ let (res, nin, nout, nend) = if self.use_nfa {
+ self.read_record_nfa(input, output, ends)
+ } else {
+ self.read_record_dfa(input, output, ends)
+ };
+ self.has_read = true;
+ (res, nin + bom_nin, nout, nend)
+ }
+
+ /// Strip off a possible UTF-8 BOM at the start of a file. Quick note that
+ /// this method will fail to strip off the BOM if only part of the BOM is
+ /// buffered. Hopefully that won't happen very often.
+ fn strip_utf8_bom<'a>(&self, input: &'a [u8]) -> (&'a [u8], usize) {
+ let (input, nin) = if !self.has_read
+ && input.len() >= 3
+ && &input[0..3] == b"\xef\xbb\xbf"
+ {
+ (&input[3..], 3)
+ } else {
+ (input, 0)
+ };
+ (input, nin)
+ }
+
+ #[inline(always)]
+ fn read_record_dfa(
+ &mut self,
+ input: &[u8],
+ output: &mut [u8],
+ ends: &mut [usize],
+ ) -> (ReadRecordResult, usize, usize, usize) {
+ if input.is_empty() {
+ let s = self.transition_final_dfa(self.dfa_state);
+ let res =
+ self.dfa.new_read_record_result(s, true, false, false, false);
+ // This part is a little tricky. When reading the final record,
+ // the last result the caller will get is an InputEmpty, and while
+ // they'll have everything they need in `output`, they'll be
+ // missing the final end position of the final field in `ends`.
+ // We insert that here, but we must take care to handle the case
+ // where `ends` doesn't have enough space. If it doesn't have
+ // enough space, then we also can't transition to the next state.
+ return match res {
+ ReadRecordResult::Record => {
+ if ends.is_empty() {
+ return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
+ }
+ self.dfa_state = s;
+ ends[0] = self.output_pos;
+ self.output_pos = 0;
+ (res, 0, 0, 1)
+ }
+ _ => {
+ self.dfa_state = s;
+ (res, 0, 0, 0)
+ }
+ };
+ }
+ if output.is_empty() {
+ return (ReadRecordResult::OutputFull, 0, 0, 0);
+ }
+ if ends.is_empty() {
+ return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
+ }
+ let (mut nin, mut nout, mut nend) = (0, 0, 0);
+ let mut state = self.dfa_state;
+ while nin < input.len() && nout < output.len() && nend < ends.len() {
+ let (s, has_out) = self.dfa.get_output(state, input[nin]);
+ self.line += (input[nin] == b'\n') as u64;
+ state = s;
+ if has_out {
+ output[nout] = input[nin];
+ nout += 1;
+ }
+ nin += 1;
+ if state >= self.dfa.final_field {
+ ends[nend] = self.output_pos + nout;
+ nend += 1;
+ if state > self.dfa.final_field {
+ break;
+ }
+ }
+ if state == self.dfa.in_field || state == self.dfa.in_quoted {
+ self.dfa
+ .classes
+ .scan_and_copy(input, &mut nin, output, &mut nout);
+ }
+ }
+ let res = self.dfa.new_read_record_result(
+ state,
+ false,
+ nin >= input.len(),
+ nout >= output.len(),
+ nend >= ends.len(),
+ );
+ self.dfa_state = state;
+ if res.is_record() {
+ self.output_pos = 0;
+ } else {
+ self.output_pos += nout;
+ }
+ (res, nin, nout, nend)
+ }
+
+ #[inline(always)]
+ fn read_field_dfa(
+ &mut self,
+ input: &[u8],
+ output: &mut [u8],
+ ) -> (ReadFieldResult, usize, usize) {
+ if input.is_empty() {
+ self.dfa_state = self.transition_final_dfa(self.dfa_state);
+ let res = self.dfa.new_read_field_result(
+ self.dfa_state,
+ true,
+ false,
+ false,
+ );
+ return (res, 0, 0);
+ }
+ if output.is_empty() {
+ return (ReadFieldResult::OutputFull, 0, 0);
+ }
+ let (mut nin, mut nout) = (0, 0);
+ let mut state = self.dfa_state;
+ while nin < input.len() && nout < output.len() {
+ let b = input[nin];
+ self.line += (b == b'\n') as u64;
+ let (s, has_out) = self.dfa.get_output(state, b);
+ state = s;
+ if has_out {
+ output[nout] = b;
+ nout += 1;
+ }
+ nin += 1;
+ if state >= self.dfa.final_field {
+ break;
+ }
+ }
+ let res = self.dfa.new_read_field_result(
+ state,
+ false,
+ nin >= input.len(),
+ nout >= output.len(),
+ );
+ self.dfa_state = state;
+ (res, nin, nout)
+ }
+
+ /// Perform the final state transition, i.e., when the caller indicates
+ /// that the input has been exhausted.
+ fn transition_final_dfa(&self, state: DfaState) -> DfaState {
+ // If we've already emitted a record or think we're ready to start
+ // parsing a new record, then we should sink into the final state
+ // and never move from there. (pro-tip: the start state doubles as
+ // the final state!)
+ if state >= self.dfa.final_record || state.is_start() {
+ self.dfa.new_state_final_end()
+ } else {
+ self.dfa.new_state_final_record()
+ }
+ }
+
+ /// Write the transition tables for the DFA based on this parser's
+ /// configuration.
+ fn build_dfa(&mut self) {
+ // A naive DFA transition table has
+ // `cells = (# of states) * (size of alphabet)`. While we
+ // could get away with that, the table would have `10 * 256 = 2560`
+ // entries. Even worse, in order to avoid a multiplication instruction
+ // when computing the next transition, we store the starting index of
+ // each state's row, which would not be representable in a single byte.
+ // So we'd need a `u16`, which doubles our transition table size to
+ // ~5KB. This is a lot to put on the stack, even though it probably
+ // fits in the L1 cache of most modern CPUs.
+ //
+ // To avoid this, we note that while our "true" alphabet
+ // has 256 distinct possibilities, the DFA itself is only
+ // discriminatory on a very small subset of that alphabet. For
+ // example, assuming neither `a` nor `b` are set as special
+ // quote/comment/escape/delimiter/terminator bytes, they are otherwise
+ // indistinguishable to the DFA, so it would be OK to treat them as
+ // if they were equivalent. That is, they are in the same equivalence
+ // class.
+ //
+ // As it turns out, using this logic, we can shrink our effective
+ // alphabet down to 7 equivalence classes:
+ //
+ // 1. The field delimiter.
+ // 2. The record terminator.
+ // 3. If the record terminator is CRLF, then CR and LF are
+ // distinct equivalence classes.
+ // 4. The quote byte.
+ // 5. The escape byte.
+ // 6. The comment byte.
+ // 7. Everything else.
+ //
+ // We add those equivalence classes here. If more configuration knobs
+ // are added to the parser with more discriminating bytes, then this
+ // logic will need to be adjusted further.
+ //
+ // Even though this requires an extra bit of indirection when computing
+ // the next transition, microbenchmarks say that it doesn't make much
+ // of a difference. Perhaps because everything fits into the L1 cache.
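+ //
+ // For example, with the default configuration (delimiter `,`, quote
+ // `"`, no escape byte, no comment byte, CRLF terminator), the calls
+ // below map `,` -> class 1, `"` -> class 2, `\r` -> class 3 and
+ // `\n` -> class 4; every other byte stays in class 0, and
+ // `num_classes` ends up as 5.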
+ self.dfa.classes.add(self.delimiter);
+ if self.quoting {
+ self.dfa.classes.add(self.quote);
+ if let Some(escape) = self.escape {
+ self.dfa.classes.add(escape);
+ }
+ }
+ if let Some(comment) = self.comment {
+ self.dfa.classes.add(comment);
+ }
+ match self.term {
+ Terminator::Any(b) => self.dfa.classes.add(b),
+ Terminator::CRLF => {
+ self.dfa.classes.add(b'\r');
+ self.dfa.classes.add(b'\n');
+ }
+ _ => unreachable!(),
+ }
+ // Build the DFA transition table by computing the DFA state for all
+ // possible combinations of state and input byte.
+ for &state in NFA_STATES {
+ for c in (0..256).map(|c| c as u8) {
+ let mut nfa_result = (state, NfaInputAction::Epsilon);
+ // Consume NFA states until we hit a non-epsilon transition.
+ while nfa_result.0 != NfaState::End
+ && nfa_result.1 == NfaInputAction::Epsilon
+ {
+ nfa_result = self.transition_nfa(nfa_result.0, c);
+ }
+ let from = self.dfa.new_state(state);
+ let to = self.dfa.new_state(nfa_result.0);
+ self.dfa.set(
+ from,
+ c,
+ to,
+ nfa_result.1 == NfaInputAction::CopyToOutput,
+ );
+ }
+ }
+ self.dfa_state = self.dfa.new_state(NfaState::StartRecord);
+ self.dfa.finish();
+ }
+
+ // The NFA implementation follows. The transition_final_nfa and
+ // transition_nfa methods are required for the DFA to operate. The
+ // rest are included for completeness (and debugging). Note that this
+ // NFA implementation is included in most of the CSV parser tests below.
+
+ #[inline(always)]
+ fn read_record_nfa(
+ &mut self,
+ input: &[u8],
+ output: &mut [u8],
+ ends: &mut [usize],
+ ) -> (ReadRecordResult, usize, usize, usize) {
+ if input.is_empty() {
+ let s = self.transition_final_nfa(self.nfa_state);
+ let res = ReadRecordResult::from_nfa(s, false, false, false);
+ return match res {
+ ReadRecordResult::Record => {
+ if ends.is_empty() {
+ return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
+ }
+ self.nfa_state = s;
+ ends[0] = self.output_pos;
+ self.output_pos = 0;
+ (res, 0, 0, 1)
+ }
+ _ => {
+ self.nfa_state = s;
+ (res, 0, 0, 0)
+ }
+ };
+ }
+ if output.is_empty() {
+ return (ReadRecordResult::OutputFull, 0, 0, 0);
+ }
+ if ends.is_empty() {
+ return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
+ }
+ let (mut nin, mut nout, mut nend) = (0, self.output_pos, 0);
+ let mut state = self.nfa_state;
+ while nin < input.len() && nout < output.len() && nend < ends.len() {
+ let (s, io) = self.transition_nfa(state, input[nin]);
+ match io {
+ NfaInputAction::CopyToOutput => {
+ output[nout] = input[nin];
+ nout += 1;
+ nin += 1;
+ }
+ NfaInputAction::Discard => {
+ nin += 1;
+ }
+ NfaInputAction::Epsilon => {}
+ }
+ state = s;
+ if state.is_field_final() {
+ ends[nend] = nout;
+ nend += 1;
+ if state != NfaState::EndFieldDelim {
+ break;
+ }
+ }
+ }
+ let res = ReadRecordResult::from_nfa(
+ state,
+ nin >= input.len(),
+ nout >= output.len(),
+ nend >= ends.len(),
+ );
+ self.nfa_state = state;
+ self.output_pos = if res.is_record() { 0 } else { nout };
+ (res, nin, nout, nend)
+ }
+
+ #[inline(always)]
+ fn read_field_nfa(
+ &mut self,
+ input: &[u8],
+ output: &mut [u8],
+ ) -> (ReadFieldResult, usize, usize) {
+ if input.is_empty() {
+ self.nfa_state = self.transition_final_nfa(self.nfa_state);
+ let res = ReadFieldResult::from_nfa(self.nfa_state, false, false);
+ return (res, 0, 0);
+ }
+ if output.is_empty() {
+ // If the output buffer is empty, then we can never make progress,
+ // so just quit now.
+ return (ReadFieldResult::OutputFull, 0, 0);
+ }
+ let (mut nin, mut nout) = (0, 0);
+ let mut state = self.nfa_state;
+ while nin < input.len() && nout < output.len() {
+ let (s, io) = self.transition_nfa(state, input[nin]);
+ match io {
+ NfaInputAction::CopyToOutput => {
+ output[nout] = input[nin];
+ nout += 1;
+ nin += 1;
+ }
+ NfaInputAction::Discard => {
+ nin += 1;
+ }
+ NfaInputAction::Epsilon => (),
+ }
+ state = s;
+ if state.is_field_final() {
+ break;
+ }
+ }
+ let res = ReadFieldResult::from_nfa(
+ state,
+ nin >= input.len(),
+ nout >= output.len(),
+ );
+ self.nfa_state = state;
+ (res, nin, nout)
+ }
+
+ /// Compute the final NFA transition after all caller-provided input has
+ /// been exhausted.
+ #[inline(always)]
+ fn transition_final_nfa(&self, state: NfaState) -> NfaState {
+ use self::NfaState::*;
+ match state {
+ End | StartRecord | EndRecord | InComment | CRLF => End,
+ StartField | EndFieldDelim | EndFieldTerm | InField
+ | InQuotedField | InEscapedQuote | InDoubleEscapedQuote
+ | InRecordTerm => EndRecord,
+ }
+ }
+
+ /// Compute the next NFA state given the current NFA state and the current
+ /// input byte.
+ ///
+ /// This returns the next NFA state along with an `NfaInputAction` that
+ /// indicates what should be done with the input byte: do nothing for an
+ /// epsilon transition, copy it to a caller-provided output buffer, or
+ /// discard it.
+ #[inline(always)]
+ fn transition_nfa(
+ &self,
+ state: NfaState,
+ c: u8,
+ ) -> (NfaState, NfaInputAction) {
+ use self::NfaState::*;
+ match state {
+ End => (End, NfaInputAction::Epsilon),
+ StartRecord => {
+ if self.term.equals(c) {
+ (StartRecord, NfaInputAction::Discard)
+ } else if self.comment == Some(c) {
+ (InComment, NfaInputAction::Discard)
+ } else {
+ (StartField, NfaInputAction::Epsilon)
+ }
+ }
+ EndRecord => (StartRecord, NfaInputAction::Epsilon),
+ StartField => {
+ if self.quoting && self.quote == c {
+ (InQuotedField, NfaInputAction::Discard)
+ } else if self.delimiter == c {
+ (EndFieldDelim, NfaInputAction::Discard)
+ } else if self.term.equals(c) {
+ (EndFieldTerm, NfaInputAction::Epsilon)
+ } else {
+ (InField, NfaInputAction::CopyToOutput)
+ }
+ }
+ EndFieldDelim => (StartField, NfaInputAction::Epsilon),
+ EndFieldTerm => (InRecordTerm, NfaInputAction::Epsilon),
+ InField => {
+ if self.delimiter == c {
+ (EndFieldDelim, NfaInputAction::Discard)
+ } else if self.term.equals(c) {
+ (EndFieldTerm, NfaInputAction::Epsilon)
+ } else {
+ (InField, NfaInputAction::CopyToOutput)
+ }
+ }
+ InQuotedField => {
+ if self.quoting && self.quote == c {
+ (InDoubleEscapedQuote, NfaInputAction::Discard)
+ } else if self.quoting && self.escape == Some(c) {
+ (InEscapedQuote, NfaInputAction::Discard)
+ } else {
+ (InQuotedField, NfaInputAction::CopyToOutput)
+ }
+ }
+ InEscapedQuote => (InQuotedField, NfaInputAction::CopyToOutput),
+ InDoubleEscapedQuote => {
+ if self.quoting && self.double_quote && self.quote == c {
+ (InQuotedField, NfaInputAction::CopyToOutput)
+ } else if self.delimiter == c {
+ (EndFieldDelim, NfaInputAction::Discard)
+ } else if self.term.equals(c) {
+ (EndFieldTerm, NfaInputAction::Epsilon)
+ } else {
+ (InField, NfaInputAction::CopyToOutput)
+ }
+ }
+ InComment => {
+ if b'\n' == c {
+ (StartRecord, NfaInputAction::Discard)
+ } else {
+ (InComment, NfaInputAction::Discard)
+ }
+ }
+ InRecordTerm => {
+ if self.term.is_crlf() && b'\r' == c {
+ (CRLF, NfaInputAction::Discard)
+ } else {
+ (EndRecord, NfaInputAction::Discard)
+ }
+ }
+ CRLF => {
+ if b'\n' == c {
+ (StartRecord, NfaInputAction::Discard)
+ } else {
+ (StartRecord, NfaInputAction::Epsilon)
+ }
+ }
+ }
+ }
+}
+
+/// The number of slots in the DFA transition table.
+///
+/// This number is computed by multiplying the maximum number of transition
+/// classes (7) by the total number of NFA states that are used in the DFA
+/// (10).
+///
+/// The number of transition classes is determined by an equivalence class of
+/// bytes, where every byte in the same equivalence classes is
+/// indistinguishable from any other byte with respect to the DFA. For example,
+/// if neither `a` nor `b` is specified as a delimiter/quote/terminator/escape,
+/// then the DFA will never discriminate between `a` or `b`, so they can
+/// effectively be treated as identical. This reduces storage space
+/// substantially.
+///
+/// The total number of NFA states (13) is greater than the total number of
+/// NFA states that are in the DFA. In particular, any NFA state that can only
+/// be reached by epsilon transitions will never have explicit usage in the
+/// DFA.
+const TRANS_CLASSES: usize = 7;
+const DFA_STATES: usize = 10;
+const TRANS_SIZE: usize = TRANS_CLASSES * DFA_STATES;
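+// For illustration: 7 * 10 = 70 table slots, so the `trans` table (one-byte
+// `DfaState` entries) and the `has_output` table (one `bool` each) occupy
+// only 70 bytes apiece.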
+
+/// The size of the map from input byte to equivalence class: one entry for
+/// each possible byte value. (See the comment on `TRANS_SIZE` for more
+/// details.)
+const CLASS_SIZE: usize = 256;
+
+/// A representation of a DFA.
+///
+/// For the most part, this is a transition table, but various optimizations
+/// have been applied to reduce its memory footprint.
+struct Dfa {
+ /// The core transition table. Each row corresponds to the transitions for
+ /// each input equivalence class. (Input bytes are mapped to their
+ /// corresponding equivalence class with the `classes` map.)
+ ///
+ /// DFA states are represented as an index corresponding to the start of
+ /// its row in this table.
+ trans: [DfaState; TRANS_SIZE],
+ /// A table with the same layout as `trans`, except its values indicate
+ /// whether a particular `(state, equivalence class)` pair should emit an
+ /// output byte.
+ has_output: [bool; TRANS_SIZE],
+ /// A map from input byte to equivalence class.
+ ///
+ /// This is responsible for reducing the effective alphabet size from
+ /// 256 to `TRANS_CLASSES`.
+ classes: DfaClasses,
+ /// The DFA state corresponding to being inside an unquoted field.
+ in_field: DfaState,
+ /// The DFA state corresponding to being inside a quoted field.
+ in_quoted: DfaState,
+ /// The minimum DFA state that indicates a field has been parsed. All DFA
+ /// states greater than this are also final-field states.
+ final_field: DfaState,
+ /// The minimum DFA state that indicates a record has been parsed. All DFA
+ /// states greater than this are also final-record states.
+ final_record: DfaState,
+}
+
+impl Dfa {
+ fn new() -> Dfa {
+ Dfa {
+ trans: [DfaState(0); TRANS_SIZE],
+ has_output: [false; TRANS_SIZE],
+ classes: DfaClasses::new(),
+ in_field: DfaState(0),
+ in_quoted: DfaState(0),
+ final_field: DfaState(0),
+ final_record: DfaState(0),
+ }
+ }
+
+ fn new_state(&self, nfa_state: NfaState) -> DfaState {
+ let nclasses = self.classes.num_classes() as u8;
+ let idx = (nfa_state as u8).checked_mul(nclasses).unwrap();
+ DfaState(idx)
+ }
+
+ fn new_state_final_end(&self) -> DfaState {
+ self.new_state(NfaState::StartRecord)
+ }
+
+ fn new_state_final_record(&self) -> DfaState {
+ self.new_state(NfaState::EndRecord)
+ }
+
+ fn get_output(&self, state: DfaState, c: u8) -> (DfaState, bool) {
+ let cls = self.classes.classes[c as usize];
+ let idx = state.0 as usize + cls as usize;
+ (self.trans[idx], self.has_output[idx])
+ }
+
+ fn set(&mut self, from: DfaState, c: u8, to: DfaState, output: bool) {
+ let cls = self.classes.classes[c as usize];
+ let idx = from.0 as usize + cls as usize;
+ self.trans[idx] = to;
+ self.has_output[idx] = output;
+ }
+
+ fn finish(&mut self) {
+ self.in_field = self.new_state(NfaState::InField);
+ self.in_quoted = self.new_state(NfaState::InQuotedField);
+ self.final_field = self.new_state(NfaState::EndFieldDelim);
+ self.final_record = self.new_state(NfaState::EndRecord);
+ }
+
+ fn new_read_field_result(
+ &self,
+ state: DfaState,
+ is_final_trans: bool,
+ inpdone: bool,
+ outdone: bool,
+ ) -> ReadFieldResult {
+ if state >= self.final_record {
+ ReadFieldResult::Field { record_end: true }
+ } else if state == self.final_field {
+ ReadFieldResult::Field { record_end: false }
+ } else if is_final_trans && state.is_start() {
+ ReadFieldResult::End
+ } else {
+ debug_assert!(state < self.final_field);
+ if !inpdone && outdone {
+ ReadFieldResult::OutputFull
+ } else {
+ ReadFieldResult::InputEmpty
+ }
+ }
+ }
+
+ fn new_read_record_result(
+ &self,
+ state: DfaState,
+ is_final_trans: bool,
+ inpdone: bool,
+ outdone: bool,
+ endsdone: bool,
+ ) -> ReadRecordResult {
+ if state >= self.final_record {
+ ReadRecordResult::Record
+ } else if is_final_trans && state.is_start() {
+ ReadRecordResult::End
+ } else {
+ debug_assert!(state < self.final_record);
+ if !inpdone && outdone {
+ ReadRecordResult::OutputFull
+ } else if !inpdone && endsdone {
+ ReadRecordResult::OutputEndsFull
+ } else {
+ ReadRecordResult::InputEmpty
+ }
+ }
+ }
+}
+
+/// A map from input byte to equivalence class.
+struct DfaClasses {
+ classes: [u8; CLASS_SIZE],
+ next_class: usize,
+}
+
+impl DfaClasses {
+ fn new() -> DfaClasses {
+ DfaClasses { classes: [0; CLASS_SIZE], next_class: 1 }
+ }
+
+ fn add(&mut self, b: u8) {
+ if self.next_class > CLASS_SIZE {
+ panic!("added too many classes")
+ }
+ self.classes[b as usize] = self.next_class as u8;
+ self.next_class += 1;
+ }
+
+ fn num_classes(&self) -> usize {
+ self.next_class as usize
+ }
+
+ /// Scan and copy the input bytes to the output buffer quickly.
+ ///
+ /// This assumes that the current state of the DFA is either `InField` or
+ /// `InQuotedField`. In this case, all bytes corresponding to the first
+ /// equivalence class (i.e., not a delimiter/quote/escape/etc.) are
+ /// guaranteed to never result in a state transition out of the current
+ /// state. This function takes advantage of that by copying every byte from
+ /// `input` in the first equivalence class to `output`. Once a byte is seen
+ /// outside the first equivalence class, we quit and should fall back to
+ /// the main DFA loop.
+ #[inline(always)]
+ fn scan_and_copy(
+ &self,
+ input: &[u8],
+ nin: &mut usize,
+ output: &mut [u8],
+ nout: &mut usize,
+ ) {
+ while *nin < input.len()
+ && *nout < output.len()
+ && self.classes[input[*nin] as usize] == 0
+ {
+ output[*nout] = input[*nin];
+ *nin += 1;
+ *nout += 1;
+ }
+ }
+}
+
+/// A single DFA state.
+///
+/// A DFA state is represented by the starting index of its corresponding row
+/// in the DFA transition table. This representation allows us to elide a
+/// single multiplication instruction when computing the next transition for
+/// a particular input byte.
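+///
+/// For example (illustrative numbers): with 5 equivalence classes, the NFA
+/// state `InField` (discriminant 2) is represented as `DfaState(10)`, and its
+/// transition on a byte in class 3 lives at `trans[10 + 3]`.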
+#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
+struct DfaState(u8);
+
+impl DfaState {
+ fn start() -> DfaState {
+ DfaState(0)
+ }
+
+ fn is_start(&self) -> bool {
+ self.0 == 0
+ }
+}
+
+impl fmt::Debug for Dfa {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(f, "Dfa(N/A)")
+ }
+}
+
+impl fmt::Debug for DfaClasses {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(
+ f,
+ "DfaClasses {{ classes: N/A, next_class: {:?} }}",
+ self.next_class
+ )
+ }
+}
+
+impl Clone for Dfa {
+ fn clone(&self) -> Dfa {
+ let mut dfa = Dfa::new();
+ dfa.trans.copy_from_slice(&self.trans);
+ dfa
+ }
+}
+
+impl Clone for DfaClasses {
+ fn clone(&self) -> DfaClasses {
+ let mut x = DfaClasses::new();
+ x.classes.copy_from_slice(&self.classes);
+ x
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use core::str;
+
+ use arrayvec::{ArrayString, ArrayVec};
+
+ use super::{ReadFieldResult, Reader, ReaderBuilder, Terminator};
+
+ type Csv = ArrayVec<[Row; 10]>;
+ type Row = ArrayVec<[Field; 10]>;
+ type Field = ArrayString<[u8; 10]>;
+
+ // Shorthand: convert a `&str` to `&[u8]` without byte string literals.
+ fn b(s: &str) -> &[u8] {
+ s.as_bytes()
+ }
+
+ macro_rules! csv {
+ ($([$($field:expr),*]),*) => {{
+ #[allow(unused_mut)]
+ fn x() -> Csv {
+ let mut csv = Csv::new();
+ $(
+ let mut row = Row::new();
+ $(
+ row.push(Field::from($field).unwrap());
+ )*
+ csv.push(row);
+ )*
+ csv
+ }
+ x()
+ }}
+ }
+
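+ // `parses_to!` runs each test case through four configurations: the
+ // NFA and DFA engines, each driven field-at-a-time and
+ // record-at-a-time.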
+ macro_rules! parses_to {
+ ($name:ident, $data:expr, $expected:expr) => {
+ parses_to!($name, $data, $expected, |builder| builder);
+ };
+ ($name:ident, $data:expr, $expected:expr, $config:expr) => {
+ #[test]
+ fn $name() {
+ let mut builder = ReaderBuilder::new();
+ builder.nfa(true);
+ $config(&mut builder);
+ let mut rdr = builder.build();
+ let got = parse_by_field(&mut rdr, $data);
+ let expected = $expected;
+ assert_eq!(expected, got, "nfa by field");
+
+ let mut builder = ReaderBuilder::new();
+ builder.nfa(true);
+ $config(&mut builder);
+ let mut rdr = builder.build();
+ let got = parse_by_record(&mut rdr, $data);
+ let expected = $expected;
+ assert_eq!(expected, got, "nfa by record");
+
+ let mut builder = ReaderBuilder::new();
+ $config(&mut builder);
+ let mut rdr = builder.build();
+ let got = parse_by_field(&mut rdr, $data);
+ let expected = $expected;
+ assert_eq!(expected, got, "dfa by field");
+
+ let mut builder = ReaderBuilder::new();
+ $config(&mut builder);
+ let mut rdr = builder.build();
+ let got = parse_by_record(&mut rdr, $data);
+ let expected = $expected;
+ assert_eq!(expected, got, "dfa by record");
+ }
+ };
+ }
+
+ fn parse_by_field(rdr: &mut Reader, data: &str) -> Csv {
+ let mut data = data.as_bytes();
+ let mut field = [0u8; 10];
+ let mut csv = Csv::new();
+ let mut row = Row::new();
+ let mut outpos = 0;
+ loop {
+ let (res, nin, nout) = rdr.read_field(data, &mut field[outpos..]);
+ data = &data[nin..];
+ outpos += nout;
+
+ match res {
+ ReadFieldResult::InputEmpty => {
+ if !data.is_empty() {
+ panic!("missing input data")
+ }
+ }
+ ReadFieldResult::OutputFull => panic!("field too large"),
+ ReadFieldResult::Field { record_end } => {
+ let s = str::from_utf8(&field[..outpos]).unwrap();
+ row.push(Field::from(s).unwrap());
+ outpos = 0;
+ if record_end {
+ csv.push(row);
+ row = Row::new();
+ }
+ }
+ ReadFieldResult::End => {
+ return csv;
+ }
+ }
+ }
+ }
+
+ fn parse_by_record(rdr: &mut Reader, data: &str) -> Csv {
+ use crate::ReadRecordResult::*;
+
+ let mut data = data.as_bytes();
+ let mut record = [0; 1024];
+ let mut ends = [0; 10];
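+ // Each entry in `ends` is the offset one past the end of a field
+ // within `record`; fields are sliced out as `record[start..end]`
+ // once a full record has been read.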
+
+ let mut csv = Csv::new();
+ let (mut outpos, mut endpos) = (0, 0);
+ loop {
+ let (res, nin, nout, nend) = rdr.read_record(
+ data,
+ &mut record[outpos..],
+ &mut ends[endpos..],
+ );
+ data = &data[nin..];
+ outpos += nout;
+ endpos += nend;
+
+ match res {
+ InputEmpty => {
+ if !data.is_empty() {
+ panic!("missing input data")
+ }
+ }
+ OutputFull => panic!("record too large (out buffer)"),
+ OutputEndsFull => panic!("record too large (end buffer)"),
+ Record => {
+ let s = str::from_utf8(&record[..outpos]).unwrap();
+ let mut start = 0;
+ let mut row = Row::new();
+ for &end in &ends[..endpos] {
+ row.push(Field::from(&s[start..end]).unwrap());
+ start = end;
+ }
+ csv.push(row);
+ outpos = 0;
+ endpos = 0;
+ }
+ End => return csv,
+ }
+ }
+ }
+
+ parses_to!(one_row_one_field, "a", csv![["a"]]);
+ parses_to!(one_row_many_fields, "a,b,c", csv![["a", "b", "c"]]);
+ parses_to!(one_row_trailing_comma, "a,b,", csv![["a", "b", ""]]);
+ parses_to!(one_row_one_field_lf, "a\n", csv![["a"]]);
+ parses_to!(one_row_many_fields_lf, "a,b,c\n", csv![["a", "b", "c"]]);
+ parses_to!(one_row_trailing_comma_lf, "a,b,\n", csv![["a", "b", ""]]);
+ parses_to!(one_row_one_field_crlf, "a\r\n", csv![["a"]]);
+ parses_to!(one_row_many_fields_crlf, "a,b,c\r\n", csv![["a", "b", "c"]]);
+ parses_to!(one_row_trailing_comma_crlf, "a,b,\r\n", csv![["a", "b", ""]]);
+ parses_to!(one_row_one_field_cr, "a\r", csv![["a"]]);
+ parses_to!(one_row_many_fields_cr, "a,b,c\r", csv![["a", "b", "c"]]);
+ parses_to!(one_row_trailing_comma_cr, "a,b,\r", csv![["a", "b", ""]]);
+
+ parses_to!(many_rows_one_field, "a\nb", csv![["a"], ["b"]]);
+ parses_to!(
+ many_rows_many_fields,
+ "a,b,c\nx,y,z",
+ csv![["a", "b", "c"], ["x", "y", "z"]]
+ );
+ parses_to!(
+ many_rows_trailing_comma,
+ "a,b,\nx,y,",
+ csv![["a", "b", ""], ["x", "y", ""]]
+ );
+ parses_to!(many_rows_one_field_lf, "a\nb\n", csv![["a"], ["b"]]);
+ parses_to!(
+ many_rows_many_fields_lf,
+ "a,b,c\nx,y,z\n",
+ csv![["a", "b", "c"], ["x", "y", "z"]]
+ );
+ parses_to!(
+ many_rows_trailing_comma_lf,
+ "a,b,\nx,y,\n",
+ csv![["a", "b", ""], ["x", "y", ""]]
+ );
+ parses_to!(many_rows_one_field_crlf, "a\r\nb\r\n", csv![["a"], ["b"]]);
+ parses_to!(
+ many_rows_many_fields_crlf,
+ "a,b,c\r\nx,y,z\r\n",
+ csv![["a", "b", "c"], ["x", "y", "z"]]
+ );
+ parses_to!(
+ many_rows_trailing_comma_crlf,
+ "a,b,\r\nx,y,\r\n",
+ csv![["a", "b", ""], ["x", "y", ""]]
+ );
+ parses_to!(many_rows_one_field_cr, "a\rb\r", csv![["a"], ["b"]]);
+ parses_to!(
+ many_rows_many_fields_cr,
+ "a,b,c\rx,y,z\r",
+ csv![["a", "b", "c"], ["x", "y", "z"]]
+ );
+ parses_to!(
+ many_rows_trailing_comma_cr,
+ "a,b,\rx,y,\r",
+ csv![["a", "b", ""], ["x", "y", ""]]
+ );
+
+ parses_to!(
+ trailing_lines_no_record,
+ "\n\n\na,b,c\nx,y,z\n\n\n",
+ csv![["a", "b", "c"], ["x", "y", "z"]]
+ );
+ parses_to!(
+ trailing_lines_no_record_cr,
+ "\r\r\ra,b,c\rx,y,z\r\r\r",
+ csv![["a", "b", "c"], ["x", "y", "z"]]
+ );
+ parses_to!(
+ trailing_lines_no_record_crlf,
+ "\r\n\r\n\r\na,b,c\r\nx,y,z\r\n\r\n\r\n",
+ csv![["a", "b", "c"], ["x", "y", "z"]]
+ );
+
+ parses_to!(empty, "", csv![]);
+ parses_to!(empty_lines, "\n\n\n\n", csv![]);
+ parses_to!(
+ empty_lines_interspersed,
+ "\n\na,b\n\n\nx,y\n\n\nm,n\n",
+ csv![["a", "b"], ["x", "y"], ["m", "n"]]
+ );
+ parses_to!(empty_lines_crlf, "\r\n\r\n\r\n\r\n", csv![]);
+ parses_to!(
+ empty_lines_interspersed_crlf,
+ "\r\n\r\na,b\r\n\r\n\r\nx,y\r\n\r\n\r\nm,n\r\n",
+ csv![["a", "b"], ["x", "y"], ["m", "n"]]
+ );
+ parses_to!(empty_lines_mixed, "\r\n\n\r\n\n", csv![]);
+ parses_to!(
+ empty_lines_interspersed_mixed,
+ "\n\r\na,b\r\n\n\r\nx,y\r\n\n\r\nm,n\r\n",
+ csv![["a", "b"], ["x", "y"], ["m", "n"]]
+ );
+ parses_to!(empty_lines_cr, "\r\r\r\r", csv![]);
+ parses_to!(
+ empty_lines_interspersed_cr,
+ "\r\ra,b\r\r\rx,y\r\r\rm,n\r",
+ csv![["a", "b"], ["x", "y"], ["m", "n"]]
+ );
+
+ parses_to!(
+ term_weird,
+ "zza,bzc,dzz",
+ csv![["a", "b"], ["c", "d"]],
+ |b: &mut ReaderBuilder| {
+ b.terminator(Terminator::Any(b'z'));
+ }
+ );
+
+ parses_to!(
+ ascii_delimited,
+ "a\x1fb\x1ec\x1fd",
+ csv![["a", "b"], ["c", "d"]],
+ |b: &mut ReaderBuilder| {
+ b.ascii();
+ }
+ );
+
+ parses_to!(bom_at_start, "\u{feff}a", csv![["a"]]);
+ parses_to!(bom_in_field, "a\u{feff}", csv![["a\u{feff}"]]);
+ parses_to!(bom_at_field_start, "a,\u{feff}b", csv![["a", "\u{feff}b"]]);
+
+ parses_to!(quote_empty, "\"\"", csv![[""]]);
+ parses_to!(quote_lf, "\"\"\n", csv![[""]]);
+ parses_to!(quote_space, "\" \"", csv![[" "]]);
+ parses_to!(quote_inner_space, "\" a \"", csv![[" a "]]);
+ parses_to!(quote_outer_space, " \"a\" ", csv![[" \"a\" "]]);
+
+ parses_to!(quote_change, "zaz", csv![["a"]], |b: &mut ReaderBuilder| {
+ b.quote(b'z');
+ });
+
+ // This one is pretty hokey.
+ // I don't really know what the "right" behavior is.
+ parses_to!(
+ quote_delimiter,
+ ",a,,b",
+ csv![["a,b"]],
+ |b: &mut ReaderBuilder| {
+ b.quote(b',');
+ }
+ );
+
+ parses_to!(quote_no_escapes, r#""a\"b""#, csv![[r#"a\b""#]]);
+ parses_to!(
+ quote_escapes_no_double,
+ r#""a""b""#,
+ csv![[r#"a"b""#]],
+ |b: &mut ReaderBuilder| {
+ b.double_quote(false);
+ }
+ );
+ parses_to!(
+ quote_escapes,
+ r#""a\"b""#,
+ csv![[r#"a"b"#]],
+ |b: &mut ReaderBuilder| {
+ b.escape(Some(b'\\'));
+ }
+ );
+ parses_to!(
+ quote_escapes_change,
+ r#""az"b""#,
+ csv![[r#"a"b"#]],
+ |b: &mut ReaderBuilder| {
+ b.escape(Some(b'z'));
+ }
+ );
+
+ parses_to!(
+ quote_escapes_with_comma,
+ r#""\"A,B\"""#,
+ csv![[r#""A,B""#]],
+ |b: &mut ReaderBuilder| {
+ b.escape(Some(b'\\')).double_quote(false);
+ }
+ );
+
+ parses_to!(
+ quoting_disabled,
+ r#""abc,foo""#,
+ csv![[r#""abc"#, r#"foo""#]],
+ |b: &mut ReaderBuilder| {
+ b.quoting(false);
+ }
+ );
+
+ parses_to!(
+ delimiter_tabs,
+ "a\tb",
+ csv![["a", "b"]],
+ |b: &mut ReaderBuilder| {
+ b.delimiter(b'\t');
+ }
+ );
+ parses_to!(
+ delimiter_weird,
+ "azb",
+ csv![["a", "b"]],
+ |b: &mut ReaderBuilder| {
+ b.delimiter(b'z');
+ }
+ );
+
+ parses_to!(extra_record_crlf_1, "foo\n1\n", csv![["foo"], ["1"]]);
+ parses_to!(extra_record_crlf_2, "foo\r\n1\r\n", csv![["foo"], ["1"]]);
+
+ parses_to!(
+ comment_1,
+ "foo\n# hi\nbar\n",
+ csv![["foo"], ["bar"]],
+ |b: &mut ReaderBuilder| {
+ b.comment(Some(b'#'));
+ }
+ );
+ parses_to!(
+ comment_2,
+ "foo\n # hi\nbar\n",
+ csv![["foo"], [" # hi"], ["bar"]],
+ |b: &mut ReaderBuilder| {
+ b.comment(Some(b'#'));
+ }
+ );
+ parses_to!(
+ comment_3,
+ "foo\n# hi\nbar\n",
+ csv![["foo"], ["# hi"], ["bar"]],
+ |b: &mut ReaderBuilder| {
+ b.comment(Some(b'\n'));
+ }
+ );
+ parses_to!(
+ comment_4,
+ "foo,b#ar,baz",
+ csv![["foo", "b#ar", "baz"]],
+ |b: &mut ReaderBuilder| {
+ b.comment(Some(b'#'));
+ }
+ );
+ parses_to!(
+ comment_5,
+ "foo,#bar,baz",
+ csv![["foo", "#bar", "baz"]],
+ |b: &mut ReaderBuilder| {
+ b.comment(Some(b'#'));
+ }
+ );
+
+ macro_rules! assert_read {
+ (
+ $rdr:expr, $input:expr, $output:expr,
+ $expect_in:expr, $expect_out:expr, $expect_res:expr
+ ) => {{
+ let (res, nin, nout) = $rdr.read_field($input, $output);
+ assert_eq!($expect_in, nin);
+ assert_eq!($expect_out, nout);
+ assert_eq!($expect_res, res);
+ }};
+ }
+
+ // This tests that feeding a new reader with an empty buffer sends us
+ // straight to End.
+ #[test]
+ fn stream_empty() {
+ use crate::ReadFieldResult::*;
+
+ let mut rdr = Reader::new();
+ assert_read!(rdr, &[], &mut [], 0, 0, End);
+ }
+
+ // Test that a single space is treated as a single field.
+ #[test]
+ fn stream_space() {
+ use crate::ReadFieldResult::*;
+
+ let mut rdr = Reader::new();
+ assert_read!(rdr, b(" "), &mut [0], 1, 1, InputEmpty);
+ assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true });
+ assert_read!(rdr, &[], &mut [0], 0, 0, End);
+ }
+
+ // Test that a single comma yields one record with two empty fields.
+ #[test]
+ fn stream_comma() {
+ use crate::ReadFieldResult::*;
+
+ let mut rdr = Reader::new();
+ assert_read!(rdr, b(","), &mut [0], 1, 0, Field { record_end: false });
+ assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true });
+ assert_read!(rdr, &[], &mut [0], 0, 0, End);
+ }
+
+ // Test that we can read a single large field across multiple output
+ // buffers.
+ #[test]
+ fn stream_output_chunks() {
+ use crate::ReadFieldResult::*;
+
+ let mut inp = b("fooquux");
+ let out = &mut [0; 2];
+ let mut rdr = Reader::new();
+
+ assert_read!(rdr, inp, out, 2, 2, OutputFull);
+ assert_eq!(out, b("fo"));
+ inp = &inp[2..];
+
+ assert_read!(rdr, inp, out, 2, 2, OutputFull);
+ assert_eq!(out, b("oq"));
+ inp = &inp[2..];
+
+ assert_read!(rdr, inp, out, 2, 2, OutputFull);
+ assert_eq!(out, b("uu"));
+ inp = &inp[2..];
+
+ assert_read!(rdr, inp, out, 1, 1, InputEmpty);
+ assert_eq!(&out[..1], b("x"));
+ inp = &inp[1..];
+ assert!(inp.is_empty());
+
+ assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
+ assert_read!(rdr, inp, out, 0, 0, End);
+ }
+
+ // Test that we can read a single large field across multiple input
+ // buffers.
+ #[test]
+ fn stream_input_chunks() {
+ use crate::ReadFieldResult::*;
+
+ let out = &mut [0; 10];
+ let mut rdr = Reader::new();
+
+ assert_read!(rdr, b("fo"), out, 2, 2, InputEmpty);
+ assert_eq!(&out[..2], b("fo"));
+
+ assert_read!(rdr, b("oq"), &mut out[2..], 2, 2, InputEmpty);
+ assert_eq!(&out[..4], b("fooq"));
+
+ assert_read!(rdr, b("uu"), &mut out[4..], 2, 2, InputEmpty);
+ assert_eq!(&out[..6], b("fooquu"));
+
+ assert_read!(rdr, b("x"), &mut out[6..], 1, 1, InputEmpty);
+ assert_eq!(&out[..7], b("fooquux"));
+
+ assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
+ assert_read!(rdr, &[], out, 0, 0, End);
+ }
+
+ // Test that we can read doubled quotes correctly in a stream.
+ #[test]
+ fn stream_doubled_quotes() {
+ use crate::ReadFieldResult::*;
+
+ let out = &mut [0; 10];
+ let mut rdr = Reader::new();
+
+ assert_read!(rdr, b("\"fo\""), out, 4, 2, InputEmpty);
+ assert_eq!(&out[..2], b("fo"));
+
+ assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty);
+ assert_eq!(&out[..4], b("fo\"o"));
+
+ assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
+ assert_read!(rdr, &[], out, 0, 0, End);
+ }
+
+ // Test that we can read escaped quotes correctly in a stream.
+ #[test]
+ fn stream_escaped_quotes() {
+ use crate::ReadFieldResult::*;
+
+ let out = &mut [0; 10];
+ let mut builder = ReaderBuilder::new();
+ let mut rdr = builder.escape(Some(b'\\')).build();
+
+ assert_read!(rdr, b("\"fo\\"), out, 4, 2, InputEmpty);
+ assert_eq!(&out[..2], b("fo"));
+
+ assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty);
+ assert_eq!(&out[..4], b("fo\"o"));
+
+ assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
+ assert_read!(rdr, &[], out, 0, 0, End);
+ }
+
+ // Test that empty output buffers don't wreak havoc.
+ #[test]
+ fn stream_empty_output() {
+ use crate::ReadFieldResult::*;
+
+ let out = &mut [0; 10];
+ let mut rdr = Reader::new();
+
+ assert_read!(
+ rdr,
+ b("foo,bar"),
+ out,
+ 4,
+ 3,
+ Field { record_end: false }
+ );
+ assert_eq!(&out[..3], b("foo"));
+
+ assert_read!(rdr, b("bar"), &mut [], 0, 0, OutputFull);
+
+ assert_read!(rdr, b("bar"), out, 3, 3, InputEmpty);
+ assert_eq!(&out[..3], b("bar"));
+
+ assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
+ assert_read!(rdr, &[], out, 0, 0, End);
+ }
+
+ // Test that we can reset the parser mid-stream and count on it to do
+ // the right thing.
+ #[test]
+ fn reset_works() {
+ use crate::ReadFieldResult::*;
+
+ let out = &mut [0; 10];
+ let mut rdr = Reader::new();
+
+ assert_read!(rdr, b("\"foo"), out, 4, 3, InputEmpty);
+ assert_eq!(&out[..3], b("foo"));
+
+ // Without resetting the parser state, the reader will remember that
+ // we're in a quoted field, and therefore interpret the leading double
+ // quotes below as a single quote and the trailing quote as a matching
+ // terminator. With the reset, however, the parser forgets the quoted
+ // field and treats the leading double quotes as a syntax quirk and
+ // drops them, in addition to hanging on to the trailing unmatched
+ // quote. (Matches Python's behavior.)
+ rdr.reset();
+
+ assert_read!(rdr, b("\"\"bar\""), out, 6, 4, InputEmpty);
+ assert_eq!(&out[..4], b("bar\""));
+ }
+
+ // Test that line number reporting is correct.
+ #[test]
+ fn line_numbers() {
+ use crate::ReadFieldResult::*;
+
+ let out = &mut [0; 10];
+ let mut rdr = Reader::new();
+
+ assert_eq!(1, rdr.line());
+
+ assert_read!(rdr, b("\n\n\n\n"), out, 4, 0, InputEmpty);
+ assert_eq!(5, rdr.line());
+
+ assert_read!(rdr, b("foo,"), out, 4, 3, Field { record_end: false });
+ assert_eq!(5, rdr.line());
+
+ assert_read!(rdr, b("bar\n"), out, 4, 3, Field { record_end: true });
+ assert_eq!(6, rdr.line());
+
+ assert_read!(rdr, &[], &mut [0], 0, 0, End);
+ assert_eq!(6, rdr.line());
+ }
+
+ macro_rules! assert_read_record {
+ (
+ $rdr:expr, $input:expr, $output:expr, $ends:expr,
+ $expect_in:expr, $expect_out:expr,
+ $expect_end:expr, $expect_res:expr
+ ) => {{
+ let (res, nin, nout, nend) =
+ $rdr.read_record($input, $output, $ends);
+ assert_eq!($expect_res, res, "result");
+ assert_eq!($expect_in, nin, "input");
+ assert_eq!($expect_out, nout, "output");
+ assert_eq!($expect_end, nend, "ends");
+ }};
+ }
+
+ // Test that we can incrementally read a record.
+ #[test]
+ fn stream_record() {
+ use crate::ReadRecordResult::*;
+
+ let mut inp = b("foo,bar\nbaz");
+ let out = &mut [0; 1024];
+ let ends = &mut [0; 10];
+ let mut rdr = Reader::new();
+
+ assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record);
+ assert_eq!(ends[0], 3);
+ assert_eq!(ends[1], 6);
+ inp = &inp[8..];
+
+ assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty);
+ inp = &inp[3..];
+
+ assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record);
+ assert_eq!(ends[0], 3);
+
+ assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End);
+ }
+
+ // Test that if our ends buffer fills up during the last read, we get
+ // an appropriate result returned.
+ #[test]
+ fn stream_record_last_end_output_full() {
+ use crate::ReadRecordResult::*;
+
+ let mut inp = b("foo,bar\nbaz");
+ let out = &mut [0; 1024];
+ let ends = &mut [0; 10];
+ let mut rdr = Reader::new();
+
+ assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record);
+ assert_eq!(ends[0], 3);
+ assert_eq!(ends[1], 6);
+ inp = &inp[8..];
+
+ assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty);
+ inp = &inp[3..];
+
+ assert_read_record!(rdr, &inp, out, &mut [], 0, 0, 0, OutputEndsFull);
+ assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record);
+ assert_eq!(ends[0], 3);
+
+ assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End);
+ }
+}
diff --git a/src/writer.rs b/src/writer.rs
new file mode 100644
index 0000000..4f94301
--- /dev/null
+++ b/src/writer.rs
@@ -0,0 +1,1047 @@
+use core::fmt;
+use core::str;
+
+use memchr::memchr;
+
+use crate::{QuoteStyle, Terminator};
+
+/// A builder for configuring a CSV writer.
+///
+/// This builder permits specifying the CSV delimiter, terminator, quoting
+/// style and more.
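+///
+/// # Example
+///
+/// A short sketch of the builder in use (the methods shown are defined
+/// below):
+///
+/// ```
+/// use csv_core::{Terminator, WriterBuilder};
+///
+/// let wtr = WriterBuilder::new()
+///     .delimiter(b';')
+///     .terminator(Terminator::CRLF)
+///     .build();
+/// assert_eq!(wtr.get_delimiter(), b';');
+/// ```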
+#[derive(Debug)]
+pub struct WriterBuilder {
+ wtr: Writer,
+}
+
+impl WriterBuilder {
+ /// Create a new builder for configuring a CSV writer.
+ pub fn new() -> WriterBuilder {
+ let wtr = Writer {
+ state: WriterState::default(),
+ requires_quotes: [false; 256],
+ delimiter: b',',
+ term: Terminator::Any(b'\n'),
+ style: QuoteStyle::default(),
+ quote: b'"',
+ escape: b'\\',
+ double_quote: true,
+ };
+ WriterBuilder { wtr }
+ }
+
+ /// Build a CSV writer from this configuration.
+ pub fn build(&self) -> Writer {
+ use crate::Terminator::*;
+
+ let mut wtr = self.wtr.clone();
+ wtr.requires_quotes[self.wtr.delimiter as usize] = true;
+ wtr.requires_quotes[self.wtr.quote as usize] = true;
+ if !self.wtr.double_quote {
+ // We only need to quote the escape character if the escape
+ // character is used for escaping quotes.
+ wtr.requires_quotes[self.wtr.escape as usize] = true;
+ }
+ match self.wtr.term {
+ CRLF | Any(b'\n') | Any(b'\r') => {
+ // This is a bit hokey. By default, the record terminator
+ // is '\n', but we still need to quote '\r' (even if our
+ // terminator is only `\n`) because the reader interprets '\r'
+ // as a record terminator by default.
+ wtr.requires_quotes[b'\r' as usize] = true;
+ wtr.requires_quotes[b'\n' as usize] = true;
+ }
+ Any(b) => {
+ wtr.requires_quotes[b as usize] = true;
+ }
+ _ => unreachable!(),
+ }
+ wtr
+ }
+
+ /// The field delimiter to use when writing CSV.
+ ///
+ /// The default is `b','`.
+ pub fn delimiter(&mut self, delimiter: u8) -> &mut WriterBuilder {
+ self.wtr.delimiter = delimiter;
+ self
+ }
+
+ /// The record terminator to use when writing CSV.
+ ///
+ /// A record terminator can be any single byte. The default is `\n`.
+ ///
+ /// Note that RFC 4180 specifies that record terminators should be `\r\n`.
+ /// To use `\r\n`, use the special `Terminator::CRLF` value.
+ pub fn terminator(&mut self, term: Terminator) -> &mut WriterBuilder {
+ self.wtr.term = term;
+ self
+ }
+
+ /// The quoting style to use when writing CSV.
+ ///
+ /// By default, this is set to `QuoteStyle::Necessary`, which will only
+ /// use quotes when they are necessary to preserve the integrity of data.
+ ///
+ /// Note that unless the quote style is set to `Never`, an empty field is
+ /// quoted if it is the only field in a record.
+ pub fn quote_style(&mut self, style: QuoteStyle) -> &mut WriterBuilder {
+ self.wtr.style = style;
+ self
+ }
+
+ /// The quote character to use when writing CSV.
+ ///
+ /// The default value is `b'"'`.
+ pub fn quote(&mut self, quote: u8) -> &mut WriterBuilder {
+ self.wtr.quote = quote;
+ self
+ }
+
+ /// The escape character to use when writing CSV.
+ ///
+ /// This is only used when `double_quote` is set to `false`.
+ ///
+ /// The default value is `b'\\'`.
+ pub fn escape(&mut self, escape: u8) -> &mut WriterBuilder {
+ self.wtr.escape = escape;
+ self
+ }
+
+ /// The quoting escape mechanism to use when writing CSV.
+ ///
+ /// When enabled (which is the default), quotes are escaped by doubling
+ /// them. e.g., `"` escapes to `""`.
+ ///
+ /// When disabled, quotes are escaped with the escape character (which
+ /// is `\\` by default).
+ pub fn double_quote(&mut self, yes: bool) -> &mut WriterBuilder {
+ self.wtr.double_quote = yes;
+ self
+ }
+}
+
+impl Default for WriterBuilder {
+ fn default() -> WriterBuilder {
+ WriterBuilder::new()
+ }
+}
+
+/// The result of writing CSV data.
+///
+/// A value of this type is returned from every interaction with `Writer`. It
+/// informs the caller how to proceed, namely, by indicating whether more
+/// input should be given (`InputEmpty`) or if a bigger output buffer is needed
+/// (`OutputFull`).
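+///
+/// The intended calling pattern is a loop. A sketch (not compiled here;
+/// `wtr`, `input`, `out` and `written` are assumed to be set up by the
+/// caller):
+///
+/// ```text
+/// loop {
+///     let (res, nin, nout) = wtr.field(input, &mut out[written..]);
+///     input = &input[nin..];
+///     written += nout;
+///     match res {
+///         WriteResult::InputEmpty => break, // all input consumed
+///         WriteResult::OutputFull => { /* flush or grow `out` */ }
+///     }
+/// }
+/// ```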
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum WriteResult {
+ /// This result occurs when all of the bytes from the given input have
+ /// been processed.
+ InputEmpty,
+ /// This result occurs when the output buffer was too small to process
+ /// all of the input bytes. Generally, this means the caller must call
+ /// the corresponding method again with the rest of the input and more
+ /// room in the output buffer.
+ OutputFull,
+}
+
+/// A writer for CSV data.
+///
+/// # RFC 4180
+///
+/// This writer conforms to RFC 4180 with one exception: it doesn't guarantee
+/// that all records written are of the same length. Instead, the onus is on
+/// the caller to ensure that all records written are of the same length.
+///
+/// Note that the default configuration of a `Writer` uses `\n` for record
+/// terminators instead of `\r\n` as specified by RFC 4180. Use the
+/// `terminator` method on `WriterBuilder` to set the terminator to `\r\n` if
+/// it's desired.
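+///
+/// # Example
+///
+/// A minimal sketch of writing one record with two fields (the buffer size
+/// here is arbitrary):
+///
+/// ```
+/// use csv_core::Writer;
+///
+/// let mut wtr = Writer::new();
+/// let mut out = [0; 1024];
+/// let mut n = 0;
+///
+/// let (_, _, o) = wtr.field(b"foo", &mut out[n..]);
+/// n += o;
+/// let (_, o) = wtr.delimiter(&mut out[n..]);
+/// n += o;
+/// let (_, _, o) = wtr.field(b"bar", &mut out[n..]);
+/// n += o;
+/// let (_, o) = wtr.terminator(&mut out[n..]);
+/// n += o;
+/// let (_, o) = wtr.finish(&mut out[n..]);
+/// n += o;
+///
+/// assert_eq!(&out[..n], b"foo,bar\n");
+/// ```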
+pub struct Writer {
+ state: WriterState,
+ requires_quotes: [bool; 256],
+ delimiter: u8,
+ term: Terminator,
+ style: QuoteStyle,
+ quote: u8,
+ escape: u8,
+ double_quote: bool,
+}
+
+impl Clone for Writer {
+ fn clone(&self) -> Writer {
+ Writer {
+ state: self.state.clone(),
+ // `[bool; 256]` is `Copy`, so the lookup table can be copied
+ // directly rather than element by element.
+ requires_quotes: self.requires_quotes,
+ delimiter: self.delimiter,
+ term: self.term,
+ style: self.style,
+ quote: self.quote,
+ escape: self.escape,
+ double_quote: self.double_quote,
+ }
+ }
+}
+
+impl fmt::Debug for Writer {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ f.debug_struct("Writer")
+ .field("state", &self.state)
+ .field("delimiter", &self.delimiter)
+ .field("term", &self.term)
+ .field("style", &self.style)
+ .field("quote", &self.quote)
+ .field("escape", &self.escape)
+ .field("double_quote", &self.double_quote)
+ .finish()
+ }
+}
+
+#[derive(Clone, Debug)]
+struct WriterState {
+ /// This is set whenever we've begun writing the contents of a field, even
+ /// if the contents are empty. We use it to avoid re-computing whether
+ /// quotes are necessary.
+ in_field: bool,
+ /// This is set whenever we've started writing a field that is enclosed in
+ /// quotes. When the writer is finished, or if a delimiter or terminator
+ /// is written, then a closing quote is inserted when this is true.
+ quoting: bool,
+ /// The number of total bytes written for the current record.
+ ///
+ /// If the writer is finished or a terminator is written when this is `0`,
+ /// then an empty field is added as a pair of adjacent quotes.
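+ ///
+ /// For example, `field(b"")` immediately followed by `terminator` emits
+ /// `""` plus the terminator (see the `writer_one_empty_field_terminator`
+ /// test below).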
+ record_bytes: u64,
+}
+
+impl Writer {
+ /// Creates a new CSV writer with the default configuration.
+ pub fn new() -> Writer {
+ Writer::default()
+ }
+
+ /// Finish writing CSV data to `output`.
+ ///
+ /// This must be called when one is done writing CSV data to `output`.
+ /// In particular, it will write closing quotes if necessary.
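+ ///
+ /// For example, after writing the field `a"bc` with the default
+ /// configuration, the output so far is `"a""bc` and `finish` emits the
+ /// single closing `"` (see the `writer_one_field_quote` test below).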
+ pub fn finish(&mut self, mut output: &mut [u8]) -> (WriteResult, usize) {
+ let mut nout = 0;
+ if self.state.record_bytes == 0 && self.state.in_field {
+ assert!(!self.state.quoting);
+ let (res, o) = self.write(&[self.quote, self.quote], output);
+ if o == 0 {
+ return (res, 0);
+ }
+ output = &mut moving(output)[o..];
+ nout += o;
+ self.state.record_bytes += o as u64;
+ }
+ if !self.state.quoting {
+ return (WriteResult::InputEmpty, nout);
+ }
+ let (res, o) = self.write(&[self.quote], output);
+ if o == 0 {
+ return (res, nout);
+ }
+ nout += o;
+ self.state.record_bytes = 0;
+ self.state.in_field = false;
+ self.state.quoting = false;
+ (res, nout)
+ }
+
+ /// Write a single CSV field from `input` to `output` while employing this
+ /// writer's quoting style.
+ ///
+ /// This returns the result of writing field data, in addition to the
+ /// number of bytes consumed from `input` and the number of bytes
+ /// written to `output`.
+ ///
+ /// The result of writing field data is either `WriteResult::InputEmpty`
+ /// or `WriteResult::OutputFull`. The former occurs when all bytes in
+ /// `input` were copied to `output`, while the latter occurs when `output`
+ /// is too small to fit everything from `input`. The maximum number of
+ /// bytes that can be written to `output` is `2 + (2 * input.len())`
+ /// because of quoting. (The worst case is a field consisting entirely
+ /// of quotes.)
+ ///
+ /// Multiple successive calls to `field` will write more data to the same
+ /// field. Subsequent fields can be written by calling either `delimiter`
+ /// or `terminator` first.
+ ///
+ /// If this writer's quoting style is `QuoteStyle::Necessary`, then `input`
+ /// should contain the *entire* field. Otherwise, whether the field needs
+ /// to be quoted or not cannot be determined.
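+ ///
+ /// For example, with the default configuration the 3 input bytes `a"b`
+ /// produce 5 output bytes, since the field is quoted and the embedded
+ /// quote is doubled:
+ ///
+ /// ```text
+ /// input:  a"b
+ /// output: "a""b   (the closing quote is written later, by `delimiter`,
+ ///                  `terminator` or `finish`)
+ /// ```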
+ pub fn field(
+ &mut self,
+ input: &[u8],
+ mut output: &mut [u8],
+ ) -> (WriteResult, usize, usize) {
+ let (mut nin, mut nout) = (0, 0);
+
+ if !self.state.in_field {
+ self.state.quoting = self.should_quote(input);
+ if self.state.quoting {
+ let (res, o) = self.write(&[self.quote], output);
+ if o == 0 {
+ return (res, 0, 0);
+ }
+ output = &mut moving(output)[o..];
+ nout += o;
+ self.state.record_bytes += o as u64;
+ }
+ self.state.in_field = true;
+ }
+ let (res, i, o) = if self.state.quoting {
+ quote(input, output, self.quote, self.escape, self.double_quote)
+ } else {
+ write_optimistic(input, output)
+ };
+ nin += i;
+ nout += o;
+ self.state.record_bytes += o as u64;
+ (res, nin, nout)
+ }
+
+ /// Write the configured field delimiter to `output`.
+ ///
+ /// If the output buffer does not have enough room to fit everything
+ /// that needs to be written, then `WriteResult::OutputFull` is returned
+ /// along with the number of bytes written so far. Otherwise,
+ /// `WriteResult::InputEmpty` is returned along with the number of bytes
+ /// written to `output` (which is `1` in case of an unquoted
+ /// field, or `2` in case of an end quote and a field separator).
+ pub fn delimiter(
+ &mut self,
+ mut output: &mut [u8],
+ ) -> (WriteResult, usize) {
+ let mut nout = 0;
+ if self.state.quoting {
+ let (res, o) = self.write(&[self.quote], output);
+ if o == 0 {
+ return (res, o);
+ }
+ output = &mut moving(output)[o..];
+ nout += o;
+ self.state.record_bytes += o as u64;
+ self.state.quoting = false;
+ }
+ let (res, o) = self.write(&[self.delimiter], output);
+ if o == 0 {
+ return (res, nout);
+ }
+ nout += o;
+ self.state.record_bytes += o as u64;
+ self.state.in_field = false;
+ (res, nout)
+ }
+
+ /// Write the configured record terminator to `output`.
+ ///
+ /// If the output buffer does not have enough room to fit a record
+ /// terminator, then no part of the terminator is written and
+ /// `WriteResult::OutputFull` is returned. Otherwise,
+ /// `WriteResult::InputEmpty` is returned along with the number of bytes
+ /// written to `output` (which is always `1` or `2`).
+ pub fn terminator(
+ &mut self,
+ mut output: &mut [u8],
+ ) -> (WriteResult, usize) {
+ let mut nout = 0;
+ if self.state.record_bytes == 0 {
+ assert!(!self.state.quoting);
+ let (res, o) = self.write(&[self.quote, self.quote], output);
+ if o == 0 {
+ return (res, 0);
+ }
+ output = &mut moving(output)[o..];
+ nout += o;
+ self.state.record_bytes += o as u64;
+ }
+ if self.state.quoting {
+ let (res, o) = self.write(&[self.quote], output);
+ if o == 0 {
+ return (res, o);
+ }
+ output = &mut moving(output)[o..];
+ nout += o;
+ self.state.record_bytes += o as u64;
+ self.state.quoting = false;
+ }
+ let (res, o) = match self.term {
+ Terminator::CRLF => write_pessimistic(&[b'\r', b'\n'], output),
+ Terminator::Any(b) => write_pessimistic(&[b], output),
+ _ => unreachable!(),
+ };
+ if o == 0 {
+ return (res, nout);
+ }
+ nout += o;
+ self.state.record_bytes = 0;
+ self.state.in_field = false;
+ (res, nout)
+ }
+
+ /// Returns true if and only if the given input field *requires* quotes to
+ /// preserve the integrity of `input` while taking into account the current
+ /// configuration of this writer (except for the configured quoting style).
+ #[inline]
+ fn needs_quotes(&self, mut input: &[u8]) -> bool {
+ let mut needs = false;
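+ // Check eight bytes per iteration to reduce per-byte branching; the
+ // tail (fewer than 8 bytes) is handled by the byte-wise check below.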
+ while !needs && input.len() >= 8 {
+ needs = self.requires_quotes[input[0] as usize]
+ || self.requires_quotes[input[1] as usize]
+ || self.requires_quotes[input[2] as usize]
+ || self.requires_quotes[input[3] as usize]
+ || self.requires_quotes[input[4] as usize]
+ || self.requires_quotes[input[5] as usize]
+ || self.requires_quotes[input[6] as usize]
+ || self.requires_quotes[input[7] as usize];
+ input = &input[8..];
+ }
+ needs || input.iter().any(|&b| self.is_special_byte(b))
+ }
+
+ /// Returns true if and only if the given byte corresponds to a special
+ /// byte in this CSV writer's configuration.
+ ///
+ /// Note that this does **not** take into account this writer's quoting
+ /// style.
+ #[inline]
+ pub fn is_special_byte(&self, b: u8) -> bool {
+ self.requires_quotes[b as usize]
+ }
+
+ /// Returns true if and only if we should put the given field data
+ /// in quotes. This takes the quoting style into account.
+ #[inline]
+ pub fn should_quote(&self, input: &[u8]) -> bool {
+ match self.style {
+ QuoteStyle::Always => true,
+ QuoteStyle::Never => false,
+ QuoteStyle::NonNumeric => is_non_numeric(input),
+ QuoteStyle::Necessary => self.needs_quotes(input),
+ _ => unreachable!(),
+ }
+ }
+
+ /// Return the delimiter used for this writer.
+ #[inline]
+ pub fn get_delimiter(&self) -> u8 {
+ self.delimiter
+ }
+
+ /// Return the terminator used for this writer.
+ #[inline]
+ pub fn get_terminator(&self) -> Terminator {
+ self.term
+ }
+
+ /// Return the quoting style used for this writer.
+ #[inline]
+ pub fn get_quote_style(&self) -> QuoteStyle {
+ self.style
+ }
+
+ /// Return the quote character used for this writer.
+ #[inline]
+ pub fn get_quote(&self) -> u8 {
+ self.quote
+ }
+
+ /// Return the escape character used for this writer.
+ #[inline]
+ pub fn get_escape(&self) -> u8 {
+ self.escape
+ }
+
+ /// Return whether this writer doubles quotes or not. When the writer
+ /// does not double quotes, it will escape them using the escape character.
+ #[inline]
+ pub fn get_double_quote(&self) -> bool {
+ self.double_quote
+ }
+
+ fn write(&self, data: &[u8], output: &mut [u8]) -> (WriteResult, usize) {
+ if data.len() > output.len() {
+ (WriteResult::OutputFull, 0)
+ } else {
+ output[..data.len()].copy_from_slice(data);
+ (WriteResult::InputEmpty, data.len())
+ }
+ }
+}
+
+impl Default for Writer {
+ fn default() -> Writer {
+ WriterBuilder::new().build()
+ }
+}
+
+impl Default for WriterState {
+ fn default() -> WriterState {
+ WriterState { in_field: false, quoting: false, record_bytes: 0 }
+ }
+}
+
+/// Returns true if and only if the given input is non-numeric.
+pub fn is_non_numeric(input: &[u8]) -> bool {
+ let s = match str::from_utf8(input) {
+ Err(_) => return true,
+ Ok(s) => s,
+ };
+ // I suppose this could be faster if we wrote validators of numbers instead
+ // of using the actual parser, but that's probably a lot of work for a bit
+ // of a niche feature.
+ s.parse::<f64>().is_err() && s.parse::<i128>().is_err()
+}
+
+/// Escape quotes in `input` and write the result to `output`.
+///
+/// If `input` does not have a `quote`, then the contents of `input` are
+/// copied verbatim to `output`.
+///
+/// If `output` is not big enough to store the fully quoted contents of
+/// `input`, then `WriteResult::OutputFull` is returned. The `output` buffer
+/// will require at most `2 * input.len()` bytes of storage in the worst case
+/// (where every byte is a quote).
+///
+/// In streaming contexts, `quote` should be called in a loop until
+/// `WriteResult::InputEmpty` is returned. Beware that such a loop will never
+/// terminate if the output buffer is less than 2 bytes in length (the
+/// minimum storage space required to store an escaped quote).
+///
+/// In addition to the `WriteResult`, the number of consumed bytes from `input`
+/// and the number of bytes written to `output` are also returned.
+///
+/// `quote` is the quote byte and `escape` is the escape byte. If
+/// `double_quote` is true, then quotes are escaped by doubling them,
+/// otherwise, quotes are escaped with the `escape` byte.
+///
+/// N.B. This function is provided for low level usage. It is called
+/// automatically if you're using a `Writer`.
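+///
+/// # Example
+///
+/// A small sketch using the default quote and escape bytes. Note that
+/// `quote` only escapes; it does not add the surrounding quotes:
+///
+/// ```
+/// use csv_core::quote;
+///
+/// let mut out = [0; 16];
+/// let (_, nin, nout) = quote(br#"a"b"#, &mut out, b'"', b'\\', true);
+/// assert_eq!((nin, nout), (3, 4));
+/// assert_eq!(&out[..nout], br#"a""b"#);
+/// ```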
+pub fn quote(
+ mut input: &[u8],
+ mut output: &mut [u8],
+ quote: u8,
+ escape: u8,
+ double_quote: bool,
+) -> (WriteResult, usize, usize) {
+ let (mut nin, mut nout) = (0, 0);
+ loop {
+ match memchr(quote, input) {
+ None => {
+ let (res, i, o) = write_optimistic(input, output);
+ nin += i;
+ nout += o;
+ return (res, nin, nout);
+ }
+ Some(next_quote) => {
+ let (res, i, o) =
+ write_optimistic(&input[..next_quote], output);
+ input = &input[i..];
+ output = &mut moving(output)[o..];
+ nin += i;
+ nout += o;
+ if let WriteResult::OutputFull = res {
+ return (res, nin, nout);
+ }
+ if double_quote {
+ let (res, o) = write_pessimistic(&[quote, quote], output);
+ if let WriteResult::OutputFull = res {
+ return (res, nin, nout);
+ }
+ nout += o;
+ output = &mut moving(output)[o..];
+ } else {
+ let (res, o) = write_pessimistic(&[escape, quote], output);
+ if let WriteResult::OutputFull = res {
+ return (res, nin, nout);
+ }
+ nout += o;
+ output = &mut moving(output)[o..];
+ }
+ nin += 1;
+ input = &input[1..];
+ }
+ }
+ }
+}
+
+/// Copy the bytes from `input` to `output`. If `output` is too small to fit
+/// everything from `input`, then copy `output.len()` bytes from `input`.
+/// Otherwise, copy everything from `input` into `output`.
+///
+/// In the first case (`output` is too small), `WriteResult::OutputFull` is
+/// returned, in addition to the number of bytes consumed from `input` and
+/// the number of bytes written to `output`.
+///
+/// In the second case (`input` is no bigger than `output`),
+/// `WriteResult::InputEmpty` is returned, in addition to the number of bytes
+/// consumed from `input` and the number of bytes written to `output`.
+fn write_optimistic(
+ input: &[u8],
+ output: &mut [u8],
+) -> (WriteResult, usize, usize) {
+ if input.len() > output.len() {
+ let input = &input[..output.len()];
+ output.copy_from_slice(input);
+ (WriteResult::OutputFull, output.len(), output.len())
+ } else {
+ output[..input.len()].copy_from_slice(input);
+ (WriteResult::InputEmpty, input.len(), input.len())
+ }
+}
+
+/// Copy the bytes from `input` to `output` only if `input` is no bigger than
+/// `output`. If `input` is bigger than `output`, then return
+/// `WriteResult::OutputFull` and copy nothing into `output`. Otherwise,
+/// return `WriteResult::InputEmpty` and the number of bytes copied into
+/// `output`.
+fn write_pessimistic(input: &[u8], output: &mut [u8]) -> (WriteResult, usize) {
+ if input.len() > output.len() {
+ (WriteResult::OutputFull, 0)
+ } else {
+ output[..input.len()].copy_from_slice(input);
+ (WriteResult::InputEmpty, input.len())
+ }
+}
+
+/// This avoids reborrowing.
+/// See: https://bluss.github.io/rust/fun/2015/10/11/stuff-the-identity-function-does/
+fn moving<T>(x: T) -> T {
+ x
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::writer::WriteResult::*;
+ use crate::writer::{quote, QuoteStyle, Writer, WriterBuilder};
+
+ // Shorthand: convert a `&str` to `&[u8]` without byte string literals.
+ fn b(s: &str) -> &[u8] {
+ s.as_bytes()
+ }
+ fn s(b: &[u8]) -> &str {
+ ::core::str::from_utf8(b).unwrap()
+ }
+
+ macro_rules! assert_field {
+ (
+ $wtr:expr, $inp:expr, $out:expr,
+ $expect_in:expr, $expect_out:expr,
+ $expect_res:expr, $expect_data:expr
+ ) => {{
+ let (res, i, o) = $wtr.field($inp, $out);
+ assert_eq!($expect_res, res, "result");
+ assert_eq!($expect_in, i, "input");
+ assert_eq!($expect_out, o, "output");
+ assert_eq!($expect_data, s(&$out[..o]), "data");
+ }};
+ }
+
+ macro_rules! assert_write {
+ (
+ $wtr:expr, $which:ident, $out:expr,
+ $expect_out:expr, $expect_res:expr, $expect_data:expr
+ ) => {{
+ let (res, o) = $wtr.$which($out);
+ assert_eq!($expect_res, res, "result");
+ assert_eq!($expect_out, o, "output");
+ assert_eq!($expect_data, s(&$out[..o]), "data");
+ }};
+ }
+
+ #[test]
+ fn writer_one_field() {
+ let mut wtr = Writer::new();
+ let out = &mut [0; 1024];
+ let mut n = 0;
+
+ assert_field!(wtr, b("abc"), &mut out[n..], 3, 3, InputEmpty, "abc");
+ n += 3;
+
+ assert_write!(wtr, finish, &mut out[n..], 0, InputEmpty, "");
+ }
+
+ #[test]
+ fn writer_one_empty_field_terminator() {
+ let mut wtr = Writer::new();
+ let out = &mut [0; 1024];
+
+ assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, "");
+ assert_write!(wtr, terminator, &mut out[..], 3, InputEmpty, "\"\"\n");
+ assert_write!(wtr, finish, &mut out[..], 0, InputEmpty, "");
+ }
+
+ #[test]
+ fn writer_one_empty_field_finish() {
+ let mut wtr = Writer::new();
+ let out = &mut [0; 1024];
+
+ assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, "");
+ assert_write!(wtr, finish, &mut out[..], 2, InputEmpty, "\"\"");
+ }
+
+ #[test]
+ fn writer_many_one_empty_field_finish() {
+ let mut wtr = Writer::new();
+ let out = &mut [0; 1024];
+
+ assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, "");
+ assert_write!(wtr, terminator, &mut out[..], 3, InputEmpty, "\"\"\n");
+ assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, "");
+ assert_write!(wtr, finish, &mut out[..], 2, InputEmpty, "\"\"");
+ }
+
+ #[test]
+ fn writer_many_one_empty_field_terminator() {
+ let mut wtr = Writer::new();
+ let out = &mut [0; 1024];
+
+ assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, "");
+ assert_write!(wtr, terminator, &mut out[..], 3, InputEmpty, "\"\"\n");
+ assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, "");
+ assert_write!(wtr, terminator, &mut out[..], 3, InputEmpty, "\"\"\n");
+ assert_write!(wtr, finish, &mut out[..], 0, InputEmpty, "");
+ }
+
+ #[test]
+ fn writer_one_field_quote() {
+ let mut wtr = Writer::new();
+ let out = &mut [0; 1024];
+ let mut n = 0;
+
+ assert_field!(
+ wtr,
+ b("a\"bc"),
+ &mut out[n..],
+ 4,
+ 6,
+ InputEmpty,
+ "\"a\"\"bc"
+ );
+ n += 6;
+
+ assert_write!(wtr, finish, &mut out[n..], 1, InputEmpty, "\"");
+ }
+
+ #[test]
+ fn writer_one_field_stream() {
+ let mut wtr = Writer::new();
+ let out = &mut [0; 1024];
+ let mut n = 0;
+
+ assert_field!(wtr, b("abc"), &mut out[n..], 3, 3, InputEmpty, "abc");
+ n += 3;
+ assert_field!(wtr, b("x"), &mut out[n..], 1, 1, InputEmpty, "x");
+ n += 1;
+
+ assert_write!(wtr, finish, &mut out[n..], 0, InputEmpty, "");
+ }
+
+ #[test]
+ fn writer_one_field_stream_quote() {
+ let mut wtr = Writer::new();
+ let out = &mut [0; 1024];
+ let mut n = 0;
+
+ assert_field!(
+ wtr,
+ b("abc\""),
+ &mut out[n..],
+ 4,
+ 6,
+ InputEmpty,
+ "\"abc\"\""
+ );
+ n += 6;
+ assert_field!(wtr, b("x"), &mut out[n..], 1, 1, InputEmpty, "x");
+ n += 1;
+
+ assert_write!(wtr, finish, &mut out[n..], 1, InputEmpty, "\"");
+ }
+
+ #[test]
+ fn writer_one_field_stream_quote_partial() {
+ let mut wtr = Writer::new();
+ let out = &mut [0; 4];
+
+ assert_field!(wtr, b("ab\"xyz"), out, 2, 3, OutputFull, "\"ab");
+ assert_field!(wtr, b("\"xyz"), out, 3, 4, OutputFull, "\"\"xy");
+ assert_field!(wtr, b("z"), out, 1, 1, InputEmpty, "z");
+ assert_write!(wtr, finish, out, 1, InputEmpty, "\"");
+ }
+
+ #[test]
+ fn writer_two_fields() {
+ let mut wtr = Writer::new();
+ let out = &mut [0; 1024];
+ let mut n = 0;
+
+ assert_field!(wtr, b("abc"), &mut out[n..], 3, 3, InputEmpty, "abc");
+ n += 3;
+ assert_write!(wtr, delimiter, &mut out[n..], 1, InputEmpty, ",");
+ n += 1;
+ assert_field!(wtr, b("yz"), &mut out[n..], 2, 2, InputEmpty, "yz");
+ n += 2;
+
+ assert_write!(wtr, finish, &mut out[n..], 0, InputEmpty, "");
+
+ assert_eq!("abc,yz", s(&out[..n]));
+ }
+
+ #[test]
+ fn writer_two_fields_non_numeric() {
+ let mut wtr =
+ WriterBuilder::new().quote_style(QuoteStyle::NonNumeric).build();
+ let out = &mut [0; 1024];
+ let mut n = 0;
+
+ assert_field!(wtr, b("abc"), &mut out[n..], 3, 4, InputEmpty, "\"abc");
+ n += 4;
+ assert_write!(wtr, delimiter, &mut out[n..], 2, InputEmpty, "\",");
+ n += 2;
+ assert_field!(wtr, b("5.2"), &mut out[n..], 3, 3, InputEmpty, "5.2");
+ n += 3;
+ assert_write!(wtr, delimiter, &mut out[n..], 1, InputEmpty, ",");
+ n += 1;
+ assert_field!(wtr, b("98"), &mut out[n..], 2, 2, InputEmpty, "98");
+ n += 2;
+
+ assert_write!(wtr, finish, &mut out[n..], 0, InputEmpty, "");
+
+ assert_eq!("\"abc\",5.2,98", s(&out[..n]));
+ }
+
+ #[test]
+ fn writer_two_fields_quote() {
+ let mut wtr = Writer::new();
+ let out = &mut [0; 1024];
+ let mut n = 0;
+
+ assert_field!(
+ wtr,
+ b("a,bc"),
+ &mut out[n..],
+ 4,
+ 5,
+ InputEmpty,
+ "\"a,bc"
+ );
+ n += 5;
+ assert_write!(wtr, delimiter, &mut out[n..], 2, InputEmpty, "\",");
+ n += 2;
+ assert_field!(wtr, b("\nz"), &mut out[n..], 2, 3, InputEmpty, "\"\nz");
+ n += 3;
+
+ assert_write!(wtr, finish, &mut out[n..], 1, InputEmpty, "\"");
+ n += 1;
+
+ assert_eq!("\"a,bc\",\"\nz\"", s(&out[..n]));
+ }
+
+ #[test]
+ fn writer_two_fields_two_records() {
+ let mut wtr = Writer::new();
+ let out = &mut [0; 1024];
+ let mut n = 0;
+
+ assert_field!(wtr, b("abc"), &mut out[n..], 3, 3, InputEmpty, "abc");
+ n += 3;
+ assert_write!(wtr, delimiter, &mut out[n..], 1, InputEmpty, ",");
+ n += 1;
+ assert_field!(wtr, b("yz"), &mut out[n..], 2, 2, InputEmpty, "yz");
+ n += 2;
+ assert_write!(wtr, terminator, &mut out[n..], 1, InputEmpty, "\n");
+ n += 1;
+ assert_field!(wtr, b("foo"), &mut out[n..], 3, 3, InputEmpty, "foo");
+ n += 3;
+ assert_write!(wtr, delimiter, &mut out[n..], 1, InputEmpty, ",");
+ n += 1;
+ assert_field!(wtr, b("quux"), &mut out[n..], 4, 4, InputEmpty, "quux");
+ n += 4;
+
+ assert_write!(wtr, finish, &mut out[n..], 0, InputEmpty, "");
+
+ assert_eq!("abc,yz\nfoo,quux", s(&out[..n]));
+ }
+
+ #[test]
+ fn writer_two_fields_two_records_quote() {
+ let mut wtr = Writer::new();
+ let out = &mut [0; 1024];
+ let mut n = 0;
+
+ assert_field!(
+ wtr,
+ b("a,bc"),
+ &mut out[n..],
+ 4,
+ 5,
+ InputEmpty,
+ "\"a,bc"
+ );
+ n += 5;
+ assert_write!(wtr, delimiter, &mut out[n..], 2, InputEmpty, "\",");
+ n += 2;
+ assert_field!(wtr, b("\nz"), &mut out[n..], 2, 3, InputEmpty, "\"\nz");
+ n += 3;
+ assert_write!(wtr, terminator, &mut out[n..], 2, InputEmpty, "\"\n");
+ n += 2;
+ assert_field!(
+ wtr,
+ b("f\"oo"),
+ &mut out[n..],
+ 4,
+ 6,
+ InputEmpty,
+ "\"f\"\"oo"
+ );
+ n += 6;
+ assert_write!(wtr, delimiter, &mut out[n..], 2, InputEmpty, "\",");
+ n += 2;
+ assert_field!(
+ wtr,
+ b("quux,"),
+ &mut out[n..],
+ 5,
+ 6,
+ InputEmpty,
+ "\"quux,"
+ );
+ n += 6;
+
+ assert_write!(wtr, finish, &mut out[n..], 1, InputEmpty, "\"");
+ n += 1;
+
+ assert_eq!("\"a,bc\",\"\nz\"\n\"f\"\"oo\",\"quux,\"", s(&out[..n]));
+ }
+
+ macro_rules! assert_quote {
+ (
+ $inp:expr, $out:expr,
+ $expect_in:expr, $expect_out:expr,
+ $expect_res:expr, $expect_data:expr
+ ) => {
+ assert_quote!(
+ $inp,
+ $out,
+ $expect_in,
+ $expect_out,
+ $expect_res,
+ $expect_data,
+ true
+ );
+ };
+ (
+ $inp:expr, $out:expr,
+ $expect_in:expr, $expect_out:expr,
+ $expect_res:expr, $expect_data:expr,
+ $double_quote:expr
+ ) => {{
+ let (res, i, o) = quote($inp, $out, b'"', b'\\', $double_quote);
+ assert_eq!($expect_res, res, "result");
+ assert_eq!($expect_in, i, "input");
+ assert_eq!($expect_out, o, "output");
+ assert_eq!(b($expect_data), &$out[..o], "data");
+ }};
+ }
+
+ #[test]
+ fn quote_empty() {
+ let inp = b("");
+ let out = &mut [0; 1024];
+
+ assert_quote!(inp, out, 0, 0, InputEmpty, "");
+ }
+
+ #[test]
+ fn quote_no_quotes() {
+ let inp = b("foobar");
+ let out = &mut [0; 1024];
+
+ assert_quote!(inp, out, 6, 6, InputEmpty, "foobar");
+ }
+
+ #[test]
+ fn quote_one_quote() {
+ let inp = b("\"");
+ let out = &mut [0; 1024];
+
+ assert_quote!(inp, out, 1, 2, InputEmpty, r#""""#);
+ }
+
+ #[test]
+ fn quote_two_quotes() {
+ let inp = b("\"\"");
+ let out = &mut [0; 1024];
+
+ assert_quote!(inp, out, 2, 4, InputEmpty, r#""""""#);
+ }
+
+ #[test]
+ fn quote_escaped_one() {
+ let inp = b("\"");
+ let out = &mut [0; 1024];
+
+ assert_quote!(inp, out, 1, 2, InputEmpty, r#"\""#, false);
+ }
+
+ #[test]
+ fn quote_escaped_two() {
+ let inp = b("\"\"");
+ let out = &mut [0; 1024];
+
+ assert_quote!(inp, out, 2, 4, InputEmpty, r#"\"\""#, false);
+ }
+
+ #[test]
+ fn quote_misc() {
+ let inp = b(r#"foo "bar" baz "quux"?"#);
+ let out = &mut [0; 1024];
+
+ assert_quote!(
+ inp,
+ out,
+ 21,
+ 25,
+ InputEmpty,
+ r#"foo ""bar"" baz ""quux""?"#
+ );
+ }
+
+ #[test]
+ fn quote_stream_no_quotes() {
+ let mut inp = b("fooba");
+ let out = &mut [0; 2];
+
+ assert_quote!(inp, out, 2, 2, OutputFull, "fo");
+ inp = &inp[2..];
+ assert_quote!(inp, out, 2, 2, OutputFull, "ob");
+ inp = &inp[2..];
+ assert_quote!(inp, out, 1, 1, InputEmpty, "a");
+ }
+
+ #[test]
+ fn quote_stream_quotes() {
+ let mut inp = b(r#"a"bc"d""#);
+ let out = &mut [0; 2];
+
+ assert_quote!(inp, out, 1, 1, OutputFull, "a");
+ inp = &inp[1..];
+ assert_quote!(inp, out, 1, 2, OutputFull, r#""""#);
+ inp = &inp[1..];
+ assert_quote!(inp, out, 2, 2, OutputFull, "bc");
+ inp = &inp[2..];
+ assert_quote!(inp, out, 1, 2, OutputFull, r#""""#);
+ inp = &inp[1..];
+ assert_quote!(inp, out, 1, 1, OutputFull, "d");
+ inp = &inp[1..];
+ assert_quote!(inp, out, 1, 2, InputEmpty, r#""""#);
+ }
+}