diff options
author | Matthew Maurer <mmaurer@google.com> | 2023-06-16 21:46:27 +0000 |
---|---|---|
committer | Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> | 2023-06-16 21:46:27 +0000 |
commit | d1ef55ff39217d0a988a8e81107b0983184ab9e5 (patch) | |
tree | d45b1c9a0f6e40a99e34d574e097bfb0acf72c21 | |
parent | 8286ad0160dcea93a9f7a55e8f65c25c2988af48 (diff) | |
parent | f5ad6d6af2280636d59f4f92335c6b59e07b6254 (diff) | |
download | xml-rs-d1ef55ff39217d0a988a8e81107b0983184ab9e5.tar.gz |
Upgrade xml-rs to 0.8.15-cvss-cries-wolf am: bec0e9a523 am: 89fe311d3f am: c1ce2203d0 am: f5ad6d6af2
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/xml-rs/+/2626331
Change-Id: Ic18b42f29853fb8a4e13e9f35d05b80b2c6e0da2
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
57 files changed, 2748 insertions, 2917 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json index 6e0c55d..f0a8a38 100644 --- a/.cargo_vcs_info.json +++ b/.cargo_vcs_info.json @@ -1,5 +1,6 @@ { "git": { - "sha1": "7cd06954fd6e22b7dbf9ea02ff4e22f9ff6309fd" - } -} + "sha1": "c4705ddc172950c28f9b229f368ad8f4cba81e3f" + }, + "path_in_vcs": "" +}
\ No newline at end of file diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml deleted file mode 100644 index daca69f..0000000 --- a/.github/workflows/main.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: CI - -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] - -jobs: - test: - runs-on: ubuntu-latest - strategy: - matrix: - rust: [stable, beta, nightly] - - steps: - - uses: actions/checkout@v2 - - - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: ${{ matrix.rust }} - override: true - - - uses: actions-rs/cargo@v1 - with: - command: build - - - uses: actions-rs/cargo@v1 - with: - command: test @@ -21,13 +21,12 @@ license { rust_library { name: "libxml_rust", stem: "libxml", - // has rustc warnings host_supported: true, crate_name: "xml", cargo_env_compat: true, - cargo_pkg_version: "0.8.4", + cargo_pkg_version: "0.8.15-cvss-cries-wolf", srcs: ["src/lib.rs"], - edition: "2015", + edition: "2021", apex_available: [ "//apex_available:platform", "com.android.virt", @@ -36,63 +35,14 @@ rust_library { vendor_available: true, } -rust_defaults { - name: "xml-rs_test_defaults", - crate_name: "xml_rs", - cargo_env_compat: true, - cargo_pkg_version: "0.8.4", - test_suites: ["general-tests"], - auto_gen_config: true, - edition: "2015", - rustlibs: [ - "liblazy_static", - "libxml_rust", - ], -} - -rust_test { - name: "xml-rs_test_tests_event_reader", - defaults: ["xml-rs_test_defaults"], - // has rustc warnings - host_supported: true, - srcs: ["tests/event_reader.rs"], - test_options: { - unit_test: true, - }, -} - -rust_test { - name: "xml-rs_test_tests_event_writer", - defaults: ["xml-rs_test_defaults"], - // has rustc warnings - host_supported: true, - srcs: ["tests/event_writer.rs"], - test_options: { - unit_test: true, - }, - data: ["tests/documents/*"], -} - -rust_test { - name: "xml-rs_test_tests_streaming", - defaults: ["xml-rs_test_defaults"], - // has rustc warnings - host_supported: true, - srcs: ["tests/streaming.rs"], - test_options: { - unit_test: true, - }, -} - rust_binary { name: "xml_analyze", - // has rustc warnings host_supported: true, crate_name: "xml_analyze", cargo_env_compat: true, - cargo_pkg_version: "0.8.4", + cargo_pkg_version: "0.8.15-cvss-cries-wolf", srcs: ["src/analyze.rs"], - edition: "2015", + edition: "2021", rustlibs: [ "libxml_rust", ], @@ -3,24 +3,44 @@ # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies -# to registry (e.g., crates.io) dependencies +# to registry (e.g., crates.io) dependencies. # -# If you believe there's an error in this file please file an -# issue against the rust-lang/cargo repository. If you're -# editing this file be aware that the upstream Cargo.toml -# will likely look very different (and much more reasonable) +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. [package] +edition = "2021" +rust-version = "1.58" name = "xml-rs" -version = "0.8.4" +version = "0.8.15-cvss-cries-wolf" authors = ["Vladimir Matveev <vmatveev@citrine.cc>"] +include = [ + "src/**", + "LICENSE", + "README.md", +] description = "An XML library in pure Rust" -documentation = "http://docs.rs/xml-rs/" -readme = "Readme.md" -keywords = ["xml", "parsing", "parser"] -categories = ["parsing"] +homepage = "https://lib.rs/crates/xml-rs" +documentation = "https://docs.rs/xml-rs/" +readme = "README.md" +keywords = [ + "xml", + "parser", + "sax", + "parsing", + "writer", +] +categories = ["parser-implementations"] license = "MIT" -repository = "https://github.com/netvl/xml-rs" +repository = "https://github.com/kornelski/xml-rs" + +[package.metadata.docs.rs] +targets = ["x86_64-unknown-linux-gnu"] + +[package.metadata.release] +tag-message = "" +tag-name = "{{version}}" [lib] name = "xml" @@ -29,8 +49,9 @@ path = "src/lib.rs" [[bin]] name = "xml-analyze" path = "src/analyze.rs" + [dev-dependencies.doc-comment] version = "0.3" -[dev-dependencies.lazy_static] -version = "1.2.0" +[badges.maintenance] +status = "actively-developed" diff --git a/Cargo.toml.orig b/Cargo.toml.orig index c8df8e6..0282e7a 100644 --- a/Cargo.toml.orig +++ b/Cargo.toml.orig @@ -1,14 +1,18 @@ [package] name = "xml-rs" -version = "0.8.4" +version = "0.8.15-cvss-cries-wolf" authors = ["Vladimir Matveev <vmatveev@citrine.cc>"] license = "MIT" description = "An XML library in pure Rust" -repository = "https://github.com/netvl/xml-rs" -documentation = "http://docs.rs/xml-rs/" -readme = "Readme.md" -keywords = ["xml", "parsing", "parser"] -categories = ["parsing"] +repository = "https://github.com/kornelski/xml-rs" +homepage = "https://lib.rs/crates/xml-rs" +documentation = "https://docs.rs/xml-rs/" +readme = "README.md" +keywords = ["xml", "parser", "sax", "parsing", "writer"] +categories = ["parser-implementations"] +edition = "2021" +rust-version = "1.58" +include = ["src/**", "LICENSE", "README.md"] [lib] name = "xml" @@ -20,4 +24,13 @@ path = "src/analyze.rs" [dev-dependencies] doc-comment = "0.3" -lazy_static = "1.2.0" + +[badges] +maintenance = { status = "actively-developed" } + +[package.metadata.docs.rs] +targets = ["x86_64-unknown-linux-gnu"] + +[package.metadata.release] +tag-name = "{{version}}" +tag-message = "" diff --git a/Changelog.md b/Changelog.md deleted file mode 100644 index 3cca8b8..0000000 --- a/Changelog.md +++ /dev/null @@ -1,126 +0,0 @@ -## Version 0.8.4 - -* Fixed recognition of `?>`, `]]>` and `/>` tokens as characters. -* Fixed writer output operations to use `write_all` to ensure that the data - is written fully. -* The document declaration is now written before any characters automatically. - -## Version 0.8.3 - -* Added a new parser option, `ignore_root_level_whitespace`, which makes the parser - skip emitting whitespace events outside of the root element when set to `true`. - This helps with certain tasks like canonicalization. - -## Version 0.8.2 - -* Added a new parser option, `replace_unknown_entity_references`, which allows to ignore - invalid Unicode code points and replace them with a Unicode "replacement character" - during parsing. This can be helpful to deal with e.g. UTF-16 surrogate pairs. -* Added a new emitter option, `pad_self_closing`, which determines the style of the self-closing - elements when they are emitted: `<a />` (`true`) vs `<a/>` (`false`). - -## Version 0.8.1 - -* Fixed various issues with tests introduced by updates in Rust. -* Adjusted the lexer to ignore contents of the `<!DOCTYPE>` tag. -* Removed unnecessary unsafety in tests. -* Added tests for doc comments in the readme file. -* Switched to GitHub Actions from Travis CI. - -## Version 0.8.0 - -* Same as 0.7.1, with 0.7.1 being yanked because of the incorrect semver bump. - -## Version 0.7.1 - -* Removed dependency on bitflags. -* Added the `XmlWriter::inner_mut()` method. -* Fixed some rustdoc warnings. - -## Version 0.7.0 - -* Same as 0.6.2, with 0.6.2 being yanked because of the incompatible bump of minimum required version of rustc. - -## Version 0.6.2 - -* Bumped `bitflags` to 1.0. - -## Version 0.6.1 - -* Fixed the writer to escape some special characters when writing attribute values. - -## Version 0.6.0 - -* Changed the target type of extra entities from `char` to `String`. This is an incompatible - change. - -## Version 0.5.0 - -* Added support for ignoring EOF errors in order to read documents from streams incrementally. -* Bumped `bitflags` to 0.9. - -## Version 0.4.1 - -* Added missing `Debug` implementation to `xml::writer::XmlEvent`. - -## Version 0.4.0 - -* Bumped version number, since changes introduced in 0.3.7 break backwards compatibility. - -## Version 0.3.8 - -* Fixed a problem introduced in 0.3.7 with entities in attributes causing parsing errors. - -## Version 0.3.7 - -* Fixed the problem with parsing non-whitespace character entities as whitespace (issue #140). -* Added support for configuring custom entities in the parser configuration. - -## Version 0.3.6 - -* Added an `Error` implementation for `EmitterError`. -* Fixed escaping of strings with multi-byte code points. - -## Version 0.3.5 - -* Added `Debug` implementation for `XmlVersion`. -* Fixed some failing tests. - -## Version 0.3.3 - -* Updated `bitflags` to 0.7. - -## Version 0.3.2 - -* Added `From<io::Error>` for `xml::reader::Error`, which improves usability of working with parsing errors. - -## Version 0.3.1 - -* Bumped `bitflags` dependency to 0.4, some internal warning fixes. - -## Version 0.3.0 - -* Changed error handling in `EventReader` - now I/O errors are properly bubbled up from the lexer. - -## Version 0.2.4 - -* Fixed #112 - incorrect handling of namespace redefinitions when writing a document. - -## Version 0.2.3 - -* Added `into_inner()` methods to `EventReader` and `EventWriter`. - -## Version 0.2.2 - -* Using `join` instead of the deprecated `connect`. -* Added a simple XML analyzer program which demonstrates library usage and can be used to check XML documents for well-formedness. -* Fixed incorrect handling of unqualified attribute names (#107). -* Added this changelog. - -## Version 0.2.1 - -* Fixed #105 - incorrect handling of double dashes. - -## Version 0.2.0 - -* Major update, includes proper document writing support and significant architecture changes. @@ -1,3 +1,7 @@ +# This project was upgraded with external_updater. +# Usage: tools/external_updater/updater.sh update rust/crates/xml-rs +# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md + name: "xml-rs" description: "An XML library in pure Rust" third_party { @@ -7,13 +11,13 @@ third_party { } url { type: ARCHIVE - value: "https://static.crates.io/crates/xml-rs/xml-rs-0.8.4.crate" + value: "https://static.crates.io/crates/xml-rs/xml-rs-0.8.15-cvss-cries-wolf.crate" } - version: "0.8.4" + version: "0.8.15-cvss-cries-wolf" license_type: NOTICE last_upgrade_date { - year: 2021 - month: 8 - day: 9 + year: 2023 + month: 6 + day: 14 } } diff --git a/README.md b/README.md new file mode 100644 index 0000000..fa4ba7f --- /dev/null +++ b/README.md @@ -0,0 +1,209 @@ +xml-rs, an XML library for Rust +=============================== + +[![CI](https://github.com/kornelski/xml-rs/actions/workflows/main.yml/badge.svg)](https://github.com/kornelski/xml-rs/actions/workflows/main.yml) +[![crates.io][crates-io-img]](https://lib.rs/crates/xml-rs) +[![docs][docs-img]](https://docs.rs/xml-rs/) + +[Documentation](https://docs.rs/xml-rs/) + + [crates-io-img]: https://img.shields.io/crates/v/xml-rs.svg + [docs-img]: https://img.shields.io/badge/docs-latest%20release-6495ed.svg + +xml-rs is an XML library for the [Rust](https://www.rust-lang.org/) programming language. +It supports reading and writing of XML documents in a streaming fashion (without DOM). + +### Features + +* API based on `Iterator`s and regular `String`s without tricky lifetimes. + +* XML spec conformance better than other pure-Rust libraries. + +* Support for UTF-16, UTF-8, ISO-8859-1, and ASCII encodings. + +* Written entirely in the safe Rust subset. + + +The API is heavily inspired by Java Streaming API for XML ([StAX][stax]). It contains a pull parser much like StAX event reader. It provides an iterator API, so you can leverage Rust's existing iterators library features. + + [stax]: https://en.wikipedia.org/wiki/StAX + +It also provides a streaming document writer much like StAX event writer. +This writer consumes its own set of events, but reader events can be converted to +writer events easily, and so it is possible to write XML transformation chains in a pretty +clean manner. + +This parser is mostly full-featured, however, there are limitations: +* Legacy code pages and non-Unicode encodings are not supported; +* DTD validation is not supported (but entities defined in the internal subset are supported); +* attribute value normalization is not performed, and end-of-line characters are not normalized either. + +Other than that the parser tries to be mostly XML-1.1-compliant. + +Writer is also mostly full-featured with the following limitations: +* no support for encodings other than UTF-8, +* no support for emitting `<!DOCTYPE>` declarations; +* more validations of input are needed, for example, checking that namespace prefixes are bounded + or comments are well-formed. + +Building and using +------------------ + +xml-rs uses [Cargo](https://crates.io), so add it with `cargo add xml` or modify `Cargo.toml`: + +```toml +[dependencies] +xml = "0.8" +``` + +The package exposes a single crate called `xml`. + +Reading XML documents +--------------------- + +[`xml::reader::EventReader`](EventReader) requires a [`Read`](stdread) instance to read from. It can be a `File` wrapped in `BufReader`, or a `Vec<u8>`, or a `&[u8]` slice. + +[EventReader]: https://docs.rs/xml-rs/latest/xml/reader/struct.EventReader.html +[stdread]: https://doc.rust-lang.org/stable/std/io/trait.Read.html + +`EventReader` implements `IntoIterator` trait, so you can use it in a `for` loop directly: + +```rust,no_run +use std::fs::File; +use std::io::BufReader; + +use xml::reader::{EventReader, XmlEvent}; + +fn main() -> std::io::Result<()> { + let file = File::open("file.xml")?; + let file = BufReader::new(file); // Buffering is important for performance + + let parser = EventReader::new(file); + let mut depth = 0; + for e in parser { + match e { + Ok(XmlEvent::StartElement { name, .. }) => { + println!("{:spaces$}+{name}", "", spaces = depth * 2); + depth += 1; + } + Ok(XmlEvent::EndElement { name }) => { + depth -= 1; + println!("{:spaces$}-{name}", "", spaces = depth * 2); + } + Err(e) => { + eprintln!("Error: {e}"); + break; + } + // There's more: https://docs.rs/xml-rs/latest/xml/reader/enum.XmlEvent.html + _ => {} + } + } + + Ok(()) +} +``` + +Document parsing can end normally or with an error. Regardless of exact cause, the parsing +process will be stopped, and the iterator will terminate normally. + +You can also have finer control over when to pull the next event from the parser using its own +`next()` method: + +```rust,ignore +match parser.next() { + ... +} +``` + +Upon the end of the document or an error, the parser will remember the last event and will always +return it in the result of `next()` call afterwards. If iterator is used, then it will yield +error or end-of-document event once and will produce `None` afterwards. + +It is also possible to tweak parsing process a little using [`xml::reader::ParserConfig`][ParserConfig] structure. +See its documentation for more information and examples. + +[ParserConfig]: https://docs.rs/xml-rs/latest/xml/reader/struct.ParserConfig.html + +You can find a more extensive example of using `EventReader` in `src/analyze.rs`, which is a +small program (BTW, it is built with `cargo build` and can be run after that) which shows various +statistics about specified XML document. It can also be used to check for well-formedness of +XML documents - if a document is not well-formed, this program will exit with an error. + +Writing XML documents +--------------------- + +xml-rs also provides a streaming writer much like StAX event writer. With it you can write an +XML document to any `Write` implementor. + +```rust,no_run +use std::io; +use xml::writer::{EmitterConfig, XmlEvent}; + +/// A simple demo syntax where "+foo" makes `<foo>`, "-foo" makes `</foo>` +fn make_event_from_line(line: &str) -> XmlEvent { + let line = line.trim(); + if let Some(name) = line.strip_prefix("+") { + XmlEvent::start_element(name).into() + } else if line.starts_with("-") { + XmlEvent::end_element().into() + } else { + XmlEvent::characters(line).into() + } +} + +fn main() -> io::Result<()> { + let input = io::stdin(); + let output = io::stdout(); + let mut writer = EmitterConfig::new() + .perform_indent(true) + .create_writer(output); + + let mut line = String::new(); + loop { + line.clear(); + let bytes_read = input.read_line(&mut line)?; + if bytes_read == 0 { + break; // EOF + } + + let event = make_event_from_line(&line); + if let Err(e) = writer.write(event) { + panic!("Write error: {e}") + } + } + Ok(()) +} +``` + +The code example above also demonstrates how to create a writer out of its configuration. +Similar thing also works with `EventReader`. + +The library provides an XML event building DSL which helps to construct complex events, +e.g. ones having namespace definitions. Some examples: + +```rust,ignore +// <a:hello a:param="value" xmlns:a="urn:some:document"> +XmlEvent::start_element("a:hello").attr("a:param", "value").ns("a", "urn:some:document") + +// <hello b:config="name" xmlns="urn:default:uri"> +XmlEvent::start_element("hello").attr("b:config", "value").default_ns("urn:defaul:uri") + +// <![CDATA[some unescaped text]]> +XmlEvent::cdata("some unescaped text") +``` + +Of course, one can create `XmlEvent` enum variants directly instead of using the builder DSL. +There are more examples in [`xml::writer::XmlEvent`][XmlEvent] documentation. + +[XmlEvent]: https://docs.rs/xml-rs/latest/xml/reader/enum.XmlEvent.html + +The writer has multiple configuration options; see `EmitterConfig` documentation for more +information. + +[EmitterConfig]: https://docs.rs/xml-rs/latest/xml/writer/struct.EmitterConfig.html + +Bug reports +------------ + +Please report issues at: <https://github.com/kornelski/xml-rs/issues>. + diff --git a/Readme.md b/Readme.md deleted file mode 100644 index 5ab88f8..0000000 --- a/Readme.md +++ /dev/null @@ -1,236 +0,0 @@ -xml-rs, an XML library for Rust -=============================== - -[![Build Status][build-status-img]](https://github.com/netvl/xml-rs/actions?query=workflow%3ACI) -[![crates.io][crates-io-img]](https://crates.io/crates/xml-rs) -[![docs][docs-img]](https://docs.rs/xml-rs/) - -[Documentation](https://docs.rs/xml-rs/) - - [build-status-img]: https://img.shields.io/github/workflow/status/netvl/xml-rs/CI/master?style=flat-square - [crates-io-img]: https://img.shields.io/crates/v/xml-rs.svg?style=flat-square - [docs-img]: https://img.shields.io/badge/docs-latest%20release-6495ed.svg?style=flat-square - -xml-rs is an XML library for [Rust](http://www.rust-lang.org/) programming language. -It is heavily inspired by Java [Streaming API for XML (StAX)][stax]. - - [stax]: https://en.wikipedia.org/wiki/StAX - -This library currently contains pull parser much like [StAX event reader][stax-reader]. -It provides iterator API, so you can leverage Rust's existing iterators library features. - - [stax-reader]: http://docs.oracle.com/javase/8/docs/api/javax/xml/stream/XMLEventReader.html - -It also provides a streaming document writer much like [StAX event writer][stax-writer]. -This writer consumes its own set of events, but reader events can be converted to -writer events easily, and so it is possible to write XML transformation chains in a pretty -clean manner. - - [stax-writer]: http://docs.oracle.com/javase/8/docs/api/javax/xml/stream/XMLEventWriter.html - -This parser is mostly full-featured, however, there are limitations: -* no other encodings but UTF-8 are supported yet, because no stream-based encoding library - is available now; when (or if) one will be available, I'll try to make use of it; -* DTD validation is not supported, `<!DOCTYPE>` declarations are completely ignored; thus no - support for custom entities too; internal DTD declarations are likely to cause parsing errors; -* attribute value normalization is not performed, and end-of-line characters are not normalized too. - -Other than that the parser tries to be mostly XML-1.0-compliant. - -Writer is also mostly full-featured with the following limitations: -* no support for encodings other than UTF-8, for the same reason as above; -* no support for emitting `<!DOCTYPE>` declarations; -* more validations of input are needed, for example, checking that namespace prefixes are bounded - or comments are well-formed. - -What is planned (highest priority first, approximately): - -0. missing features required by XML standard (e.g. aforementioned normalization and - proper DTD parsing); -1. miscellaneous features of the writer; -2. parsing into a DOM tree and its serialization back to XML text; -3. SAX-like callback-based parser (fairly easy to implement over pull parser); -4. DTD validation; -5. (let's dream a bit) XML Schema validation. - -Building and using ------------------- - -xml-rs uses [Cargo](http://crates.io), so just add a dependency section in your project's manifest: - -```toml -[dependencies] -xml-rs = "0.8" -``` - -The package exposes a single crate called `xml`: - -```rust -extern crate xml; -``` - -Reading XML documents ---------------------- - -`xml::reader::EventReader` requires a `Read` instance to read from. When a proper stream-based encoding -library is available, it is likely that xml-rs will be switched to use whatever character stream structure -this library would provide, but currently it is a `Read`. - -Using `EventReader` is very straightforward. Just provide a `Read` instance to obtain an iterator -over events: - -```rust,no_run -extern crate xml; - -use std::fs::File; -use std::io::BufReader; - -use xml::reader::{EventReader, XmlEvent}; - -fn indent(size: usize) -> String { - const INDENT: &'static str = " "; - (0..size).map(|_| INDENT) - .fold(String::with_capacity(size*INDENT.len()), |r, s| r + s) -} - -fn main() { - let file = File::open("file.xml").unwrap(); - let file = BufReader::new(file); - - let parser = EventReader::new(file); - let mut depth = 0; - for e in parser { - match e { - Ok(XmlEvent::StartElement { name, .. }) => { - println!("{}+{}", indent(depth), name); - depth += 1; - } - Ok(XmlEvent::EndElement { name }) => { - depth -= 1; - println!("{}-{}", indent(depth), name); - } - Err(e) => { - println!("Error: {}", e); - break; - } - _ => {} - } - } -} -``` - -`EventReader` implements `IntoIterator` trait, so you can just use it in a `for` loop directly. -Document parsing can end normally or with an error. Regardless of exact cause, the parsing -process will be stopped, and iterator will terminate normally. - -You can also have finer control over when to pull the next event from the parser using its own -`next()` method: - -```rust,ignore -match parser.next() { - ... -} -``` - -Upon the end of the document or an error the parser will remember that last event and will always -return it in the result of `next()` call afterwards. If iterator is used, then it will yield -error or end-of-document event once and will produce `None` afterwards. - -It is also possible to tweak parsing process a little using `xml::reader::ParserConfig` structure. -See its documentation for more information and examples. - -You can find a more extensive example of using `EventReader` in `src/analyze.rs`, which is a -small program (BTW, it is built with `cargo build` and can be run after that) which shows various -statistics about specified XML document. It can also be used to check for well-formedness of -XML documents - if a document is not well-formed, this program will exit with an error. - -Writing XML documents ---------------------- - -xml-rs also provides a streaming writer much like StAX event writer. With it you can write an -XML document to any `Write` implementor. - -```rust,no_run -extern crate xml; - -use std::fs::File; -use std::io::{self, Write}; - -use xml::writer::{EventWriter, EmitterConfig, XmlEvent, Result}; - -fn handle_event<W: Write>(w: &mut EventWriter<W>, line: String) -> Result<()> { - let line = line.trim(); - let event: XmlEvent = if line.starts_with("+") && line.len() > 1 { - XmlEvent::start_element(&line[1..]).into() - } else if line.starts_with("-") { - XmlEvent::end_element().into() - } else { - XmlEvent::characters(&line).into() - }; - w.write(event) -} - -fn main() { - let mut file = File::create("output.xml").unwrap(); - - let mut input = io::stdin(); - let mut output = io::stdout(); - let mut writer = EmitterConfig::new().perform_indent(true).create_writer(&mut file); - loop { - print!("> "); output.flush().unwrap(); - let mut line = String::new(); - match input.read_line(&mut line) { - Ok(0) => break, - Ok(_) => match handle_event(&mut writer, line) { - Ok(_) => {} - Err(e) => panic!("Write error: {}", e) - }, - Err(e) => panic!("Input error: {}", e) - } - } -} -``` - -The code example above also demonstrates how to create a writer out of its configuration. -Similar thing also works with `EventReader`. - -The library provides an XML event building DSL which helps to construct complex events, -e.g. ones having namespace definitions. Some examples: - -```rust,ignore -// <a:hello a:param="value" xmlns:a="urn:some:document"> -XmlEvent::start_element("a:hello").attr("a:param", "value").ns("a", "urn:some:document") - -// <hello b:config="name" xmlns="urn:default:uri"> -XmlEvent::start_element("hello").attr("b:config", "value").default_ns("urn:defaul:uri") - -// <![CDATA[some unescaped text]]> -XmlEvent::cdata("some unescaped text") -``` - -Of course, one can create `XmlEvent` enum variants directly instead of using the builder DSL. -There are more examples in `xml::writer::XmlEvent` documentation. - -The writer has multiple configuration options; see `EmitterConfig` documentation for more -information. - -Other things ------------- - -No performance tests or measurements are done. The implementation is rather naive, and no specific -optimizations are made. Hopefully the library is sufficiently fast to process documents of common size. -I intend to add benchmarks in future, but not until more important features are added. - -Known issues ------------- - -All known issues are present on GitHub issue tracker: <http://github.com/netvl/xml-rs/issues>. -Feel free to post any found problems there. - -License -------- - -This library is licensed under MIT license. - ---- -Copyright (C) Vladimir Matveev, 2014-2020 diff --git a/design.md b/design.md deleted file mode 100644 index da67c7b..0000000 --- a/design.md +++ /dev/null @@ -1,37 +0,0 @@ -# Reader - -Basic features: - * [x] Parsing XML 1.0 documents and returning a stream of events - - [ ] Support reading embedded DTD schemas - - [ ] Support for embedded entities - * [x] Support for namespaces and emitting namespace information in events - * [ ] \[maybe\] push-based wrapper - * Missing XML features - - [ ] Support for different encodings - - [ ] Attribute values normalization - - [ ] EOL characters normalization - -Advanced features: - * [ ] DTD schema validation - * [ ] XSD schema validation - -# Writer - -Basic features: - * [x] Writing basic XML 1.0 documents in UTF-8 - * [x] Writing XML 1.0 documents with namespace support - * [x] Support for writing elements with empty body as empty elements - * [x] Pretty-printed and compact output - * [ ] Writing XML document with embedded DTDs and DTD references - * Misc features: - - [ ] Support for different encodings - - [x] Support for writing CDATA as characters - - [ ] Checking events for invalid characters (e.g. `--` in comments) - - [ ] Check for namespaces more correctly, i.e. check both for prefix and namespace URI - - [ ] Support checking namespace prefix presence in the current namespace for events with prefix but without namespace - - [ ] Support checking namespace prefix for events with both prefix and namespace URI - -# Other - -DOM-based API: - * [ ] Basic support for DOM-based API diff --git a/src/analyze.rs b/src/analyze.rs index d369d2f..d50b2d9 100644 --- a/src/analyze.rs +++ b/src/analyze.rs @@ -1,37 +1,23 @@ #![forbid(unsafe_code)] -extern crate xml; - use std::cmp; +use std::collections::HashSet; use std::env; -use std::io::{self, Read, Write, BufReader}; use std::fs::File; -use std::collections::HashSet; +use std::io::{self, BufReader, Read}; -use xml::ParserConfig; use xml::reader::XmlEvent; +use xml::ParserConfig; -macro_rules! abort { - ($code:expr) => {::std::process::exit($code)}; - ($code:expr, $($args:tt)+) => {{ - writeln!(&mut ::std::io::stderr(), $($args)+).unwrap(); - ::std::process::exit($code); - }} -} - -fn main() { +fn main() -> Result<(), Box<dyn std::error::Error>> { let mut file; let mut stdin; - let source: &mut Read = match env::args().nth(1) { - Some(file_name) => { - file = File::open(file_name) - .unwrap_or_else(|e| abort!(1, "Cannot open input file: {}", e)); - &mut file - } - None => { - stdin = io::stdin(); - &mut stdin - } + let source: &mut dyn Read = if let Some(file_name) = env::args().nth(1) { + file = File::open(file_name).map_err(|e| format!("Cannot open input file: {e}"))?; + &mut file + } else { + stdin = io::stdin(); + &mut stdin }; let reader = ParserConfig::new() @@ -51,49 +37,49 @@ fn main() { let mut max_depth = 0; for e in reader { + let e = e.map_err(|e| format!("Error parsing XML document: {e}"))?; match e { - Ok(e) => match e { - XmlEvent::StartDocument { version, encoding, standalone } => - println!( - "XML document version {}, encoded in {}, {}standalone", - version, encoding, if standalone.unwrap_or(false) { "" } else { "not " } - ), - XmlEvent::EndDocument => println!("Document finished"), - XmlEvent::ProcessingInstruction { .. } => processing_instructions += 1, - XmlEvent::Whitespace(_) => {} // can't happen due to configuration - XmlEvent::Characters(s) => { - character_blocks += 1; - characters += s.len(); - } - XmlEvent::CData(s) => { - cdata_blocks += 1; - characters += s.len(); - } - XmlEvent::Comment(s) => { - comment_blocks += 1; - comment_characters += s.len(); - } - XmlEvent::StartElement { namespace, .. } => { - depth += 1; - max_depth = cmp::max(max_depth, depth); - elements += 1; - namespaces.extend(namespace.0.into_iter().map(|(_, ns_uri)| ns_uri)); - } - XmlEvent::EndElement { .. } => { - depth -= 1; - } - }, - Err(e) => abort!(1, "Error parsing XML document: {}", e) - } + XmlEvent::StartDocument { version, encoding, standalone } => + println!( + "XML document version {}, encoded in {}, {}standalone", + version, encoding, if standalone.unwrap_or(false) { "" } else { "not " } + ), + XmlEvent::EndDocument => println!("Document finished"), + XmlEvent::ProcessingInstruction { .. } => processing_instructions += 1, + XmlEvent::Whitespace(_) => {} // can't happen due to configuration + XmlEvent::Characters(s) => { + character_blocks += 1; + characters += s.len(); + } + XmlEvent::CData(s) => { + cdata_blocks += 1; + characters += s.len(); + } + XmlEvent::Comment(s) => { + comment_blocks += 1; + comment_characters += s.len(); + } + XmlEvent::StartElement { namespace, .. } => { + depth += 1; + max_depth = cmp::max(max_depth, depth); + elements += 1; + namespaces.extend(namespace.0.into_values()); + } + XmlEvent::EndElement { .. } => { + depth -= 1; + } + }; } + namespaces.remove(xml::namespace::NS_EMPTY_URI); namespaces.remove(xml::namespace::NS_XMLNS_URI); namespaces.remove(xml::namespace::NS_XML_URI); - println!("Elements: {}, maximum depth: {}", elements, max_depth); + println!("Elements: {elements}, maximum depth: {max_depth}"); println!("Namespaces (excluding built-in): {}", namespaces.len()); - println!("Characters: {}, characters blocks: {}, CDATA blocks: {}", - characters, character_blocks, cdata_blocks); - println!("Comment blocks: {}, comment characters: {}", comment_blocks, comment_characters); - println!("Processing instructions (excluding built-in): {}", processing_instructions); + println!("Characters: {characters}, characters blocks: {character_blocks}, CDATA blocks: {cdata_blocks}"); + println!("Comment blocks: {comment_blocks}, comment characters: {comment_characters}"); + println!("Processing instructions (excluding built-in): {processing_instructions}"); + + Ok(()) } diff --git a/src/attribute.rs b/src/attribute.rs index 8728f49..112bf24 100644 --- a/src/attribute.rs +++ b/src/attribute.rs @@ -3,8 +3,8 @@ use std::fmt; -use name::{Name, OwnedName}; -use escape::escape_str_attribute; +use crate::escape::{Escaped, AttributeEscapes}; +use crate::name::{Name, OwnedName}; /// A borrowed version of an XML attribute. /// @@ -15,18 +15,19 @@ pub struct Attribute<'a> { pub name: Name<'a>, /// Attribute value. - pub value: &'a str + pub value: &'a str, } impl<'a> fmt::Display for Attribute<'a> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}=\"{}\"", self.name, escape_str_attribute(self.value)) + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}=\"{}\"", self.name, Escaped::<AttributeEscapes>::new(self.value)) } } impl<'a> Attribute<'a> { /// Creates an owned attribute out of this borrowed one. #[inline] + #[must_use] pub fn to_owned(&self) -> OwnedAttribute { OwnedAttribute { name: self.name.into(), @@ -36,8 +37,9 @@ impl<'a> Attribute<'a> { /// Creates a borrowed attribute using the provided borrowed name and a borrowed string value. #[inline] + #[must_use] pub fn new(name: Name<'a>, value: &'a str) -> Attribute<'a> { - Attribute { name, value, } + Attribute { name, value } } } @@ -50,15 +52,17 @@ pub struct OwnedAttribute { pub name: OwnedName, /// Attribute value. - pub value: String + pub value: String, } impl OwnedAttribute { /// Returns a borrowed `Attribute` out of this owned one. - pub fn borrow(&self) -> Attribute { + #[must_use] + #[inline] + pub fn borrow(&self) -> Attribute<'_> { Attribute { name: self.name.borrow(), - value: &*self.value, + value: &self.value, } } @@ -73,27 +77,27 @@ impl OwnedAttribute { } impl fmt::Display for OwnedAttribute { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}=\"{}\"", self.name, escape_str_attribute(&*self.value)) + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}=\"{}\"", self.name, Escaped::<AttributeEscapes>::new(&self.value)) } } #[cfg(test)] mod tests { - use super::{Attribute}; + use super::Attribute; - use name::Name; + use crate::name::Name; #[test] fn attribute_display() { let attr = Attribute::new( Name::qualified("attribute", "urn:namespace", Some("n")), - "its value with > & \" ' < weird symbols" + "its value with > & \" ' < weird symbols", ); assert_eq!( &*attr.to_string(), "{urn:namespace}n:attribute=\"its value with > & " ' < weird symbols\"" - ) + ); } } diff --git a/src/common.rs b/src/common.rs index 029e851..a1bf3ac 100644 --- a/src/common.rs +++ b/src/common.rs @@ -14,6 +14,7 @@ pub struct TextPosition { impl TextPosition { /// Creates a new position initialized to the beginning of the document #[inline] + #[must_use] pub fn new() -> TextPosition { TextPosition { row: 0, column: 0 } } @@ -21,14 +22,14 @@ impl TextPosition { /// Advances the position in a line #[inline] pub fn advance(&mut self, count: u8) { - self.column += count as u64; + self.column += u64::from(count); } /// Advances the position in a line to the next tab position #[inline] pub fn advance_to_tab(&mut self, width: u8) { - let width = width as u64; - self.column += width - self.column % width + let width = u64::from(width); + self.column += width - self.column % width; } /// Advances the position to the beginning of the next line @@ -40,15 +41,15 @@ impl TextPosition { } impl fmt::Debug for TextPosition { - #[inline] - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + #[cold] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}:{}", self.row + 1, self.column + 1) } } impl fmt::Display for TextPosition { #[inline] - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}:{}", self.row + 1, self.column + 1) } } @@ -69,26 +70,27 @@ impl Position for TextPosition { } /// XML version enumeration. -#[derive(Copy, Clone, PartialEq, Eq)] +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] pub enum XmlVersion { /// XML version 1.0. Version10, /// XML version 1.1. - Version11 + Version11, } impl fmt::Display for XmlVersion { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { - XmlVersion::Version10 => write!(f, "1.0"), - XmlVersion::Version11 => write!(f, "1.1") - } + XmlVersion::Version10 => "1.0", + XmlVersion::Version11 => "1.1", + }.fmt(f) } } impl fmt::Debug for XmlVersion { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + #[cold] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(self, f) } } @@ -97,33 +99,45 @@ impl fmt::Debug for XmlVersion { /// as is defined by XML 1.1 specification, [section 2.3][1]. /// /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn +#[must_use] +#[inline] pub fn is_whitespace_char(c: char) -> bool { - match c { - '\x20' | '\x09' | '\x0d' | '\x0a' => true, - _ => false - } + matches!(c, '\x20' | '\x0a' | '\x09' | '\x0d') } /// Checks whether the given string is compound only by white space -/// characters (`S`) using the previous is_whitespace_char to check +/// characters (`S`) using the previous `is_whitespace_char` to check /// all characters of this string pub fn is_whitespace_str(s: &str) -> bool { s.chars().all(is_whitespace_char) } +pub fn is_xml10_char(c: char) -> bool { + matches!(c, '\u{09}' | '\u{0A}' | '\u{0D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..) +} + +pub fn is_xml11_char(c: char) -> bool { + matches!(c, '\u{01}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..) +} + +pub fn is_xml11_char_not_restricted(c: char) -> bool { + is_xml11_char(c) && !matches!(c, '\u{01}'..='\u{08}' | '\u{0B}'..='\u{0C}' | '\u{0E}'..='\u{1F}' | '\u{7F}'..='\u{84}' | '\u{86}'..='\u{9F}') +} + /// Checks whether the given character is a name start character (`NameStartChar`) /// as is defined by XML 1.1 specification, [section 2.3][1]. /// /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn +#[must_use] pub fn is_name_start_char(c: char) -> bool { match c { - ':' | 'A'...'Z' | '_' | 'a'...'z' | - '\u{C0}'...'\u{D6}' | '\u{D8}'...'\u{F6}' | '\u{F8}'...'\u{2FF}' | - '\u{370}'...'\u{37D}' | '\u{37F}'...'\u{1FFF}' | - '\u{200C}'...'\u{200D}' | '\u{2070}'...'\u{218F}' | - '\u{2C00}'...'\u{2FEF}' | '\u{3001}'...'\u{D7FF}' | - '\u{F900}'...'\u{FDCF}' | '\u{FDF0}'...'\u{FFFD}' | - '\u{10000}'...'\u{EFFFF}' => true, + ':' | 'A'..='Z' | '_' | 'a'..='z' | + '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' | + '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' | + '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | + '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' | + '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' | + '\u{10000}'..='\u{EFFFF}' => true, _ => false } } @@ -132,11 +146,12 @@ pub fn is_name_start_char(c: char) -> bool { /// as is defined by XML 1.1 specification, [section 2.3][1]. /// /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn +#[must_use] pub fn is_name_char(c: char) -> bool { match c { _ if is_name_start_char(c) => true, - '-' | '.' | '0'...'9' | '\u{B7}' | - '\u{300}'...'\u{36F}' | '\u{203F}'...'\u{2040}' => true, + '-' | '.' | '0'..='9' | '\u{B7}' | + '\u{300}'..='\u{36F}' | '\u{203F}'..='\u{2040}' => true, _ => false } } diff --git a/src/escape.rs b/src/escape.rs index 18298b9..1fcfd06 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -1,81 +1,102 @@ //! Contains functions for performing XML special characters escaping. -use std::borrow::Cow; +use std::{borrow::Cow, marker::PhantomData, fmt::{Display, Result, Formatter}}; -enum Value { - Char(char), - Str(&'static str) -} +pub(crate) trait Escapes { + fn escape(c: u8) -> Option<&'static str>; -impl Value { - fn dispatch_for_attribute(c: char) -> Value { - match c { - '<' => Value::Str("<"), - '>' => Value::Str(">"), - '"' => Value::Str("""), - '\'' => Value::Str("'"), - '&' => Value::Str("&"), - '\n' => Value::Str("
"), - '\r' => Value::Str("
"), - _ => Value::Char(c) - } + fn byte_needs_escaping(c: u8) -> bool{ + Self::escape(c).is_some() } - fn dispatch_for_pcdata(c: char) -> Value { - match c { - '<' => Value::Str("<"), - '&' => Value::Str("&"), - _ => Value::Char(c) - } + fn str_needs_escaping(s: &str) -> bool{ + s.bytes().any(|c| Self::escape(c).is_some()) } } -enum Process<'a> { - Borrowed(&'a str), - Owned(String) +pub(crate) struct Escaped<'a, E: Escapes> { + _escape_phantom: PhantomData<E>, + to_escape: &'a str, } -impl<'a> Process<'a> { - fn process(&mut self, (i, next): (usize, Value)) { - match next { - Value::Str(s) => match *self { - Process::Owned(ref mut o) => o.push_str(s), - Process::Borrowed(b) => { - let mut r = String::with_capacity(b.len() + s.len()); - r.push_str(&b[..i]); - r.push_str(s); - *self = Process::Owned(r); - } - }, - Value::Char(c) => match *self { - Process::Borrowed(_) => {} - Process::Owned(ref mut o) => o.push(c) - } +impl<'a, E: Escapes> Escaped<'a, E> { + pub fn new(s: &'a str) -> Self { + Escaped { + _escape_phantom: PhantomData, + to_escape: s, } } +} - fn into_result(self) -> Cow<'a, str> { - match self { - Process::Borrowed(b) => Cow::Borrowed(b), - Process::Owned(o) => Cow::Owned(o) + +impl<'a, E: Escapes> Display for Escaped<'a, E> { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + let mut total_remaining = self.to_escape; + + // find the next occurence + while let Some(n) = total_remaining + .bytes() + .position(E::byte_needs_escaping) + { + let (start, remaining) = total_remaining.split_at(n); + + f.write_str(start)?; + + // unwrap is safe because we checked is_some for position n earlier + let next_byte = remaining.bytes().next().unwrap(); + let replacement = E::escape(next_byte).unwrap(); + f.write_str(replacement)?; + + total_remaining = &remaining[1..]; } + + f.write_str(total_remaining) } } -impl<'a> Extend<(usize, Value)> for Process<'a> { - fn extend<I: IntoIterator<Item=(usize, Value)>>(&mut self, it: I) { - for v in it.into_iter() { - self.process(v); - } +fn escape_str<E: Escapes>(s: &str) -> Cow<'_, str> { + if E::str_needs_escaping(s) { + Cow::Owned(format!("{}", Escaped::<E>::new(s))) + } else { + Cow::Borrowed(s) } } -fn escape_str(s: &str, dispatch: fn(char) -> Value) -> Cow<str> { - let mut p = Process::Borrowed(s); - p.extend(s.char_indices().map(|(ind, c)| (ind, dispatch(c)))); - p.into_result() +macro_rules! escapes { + { + $name: ident, + $($k: expr => $v: expr),* $(,)? + } => { + pub(crate) struct $name; + + impl Escapes for $name { + fn escape(c: u8) -> Option<&'static str> { + match c { + $( $k => Some($v),)* + _ => None + } + } + } + }; } +escapes!( + AttributeEscapes, + b'<' => "<", + b'>' => ">", + b'"' => """, + b'\'' => "'", + b'&' => "&", + b'\n' => "
", + b'\r' => "
", +); + +escapes!( + PcDataEscapes, + b'<' => "<", + b'&' => "&", +); + /// Performs escaping of common XML characters inside an attribute value. /// /// This function replaces several important markup characters with their @@ -86,13 +107,18 @@ fn escape_str(s: &str, dispatch: fn(char) -> Value) -> Cow<str> { /// * `"` → `"` /// * `'` → `'` /// * `&` → `&` +/// +/// The following characters are escaped so that attributes are printed on +/// a single line: +/// * `\n` → `
` +/// * `\r` → `
` /// /// The resulting string is safe to use inside XML attribute values or in PCDATA sections. /// /// Does not perform allocations if the given string does not contain escapable characters. #[inline] -pub fn escape_str_attribute(s: &str) -> Cow<str> { - escape_str(s, Value::dispatch_for_attribute) +#[must_use] pub fn escape_str_attribute(s: &str) -> Cow<'_, str> { + escape_str::<AttributeEscapes>(s) } /// Performs escaping of common XML characters inside PCDATA. @@ -107,15 +133,25 @@ pub fn escape_str_attribute(s: &str) -> Cow<str> { /// /// Does not perform allocations if the given string does not contain escapable characters. #[inline] -pub fn escape_str_pcdata(s: &str) -> Cow<str> { - escape_str(s, Value::dispatch_for_pcdata) +#[must_use] pub fn escape_str_pcdata(s: &str) -> Cow<'_, str> { + escape_str::<PcDataEscapes>(s) } #[cfg(test)] mod tests { - use super::{escape_str_pcdata, escape_str_attribute}; + use super::{escape_str_attribute, escape_str_pcdata}; - // TODO: add more tests + #[test] + fn test_escape_str_attribute() { + assert_eq!(escape_str_attribute("<>'\"&\n\r"), "<>'"&

"); + assert_eq!(escape_str_attribute("no_escapes"), "no_escapes"); + } + + #[test] + fn test_escape_str_pcdata() { + assert_eq!(escape_str_pcdata("<&"), "<&"); + assert_eq!(escape_str_pcdata("no_escapes"), "no_escapes"); + } #[test] fn test_escape_multibyte_code_points() { @@ -123,4 +159,3 @@ mod tests { assert_eq!(escape_str_pcdata("☃<"), "☃<"); } } - @@ -1,29 +1,30 @@ //#![warn(missing_doc)] -#![allow(dead_code)] -#![allow(unused_variables)] #![forbid(non_camel_case_types)] #![forbid(unsafe_code)] +#![allow(clippy::redundant_closure_for_method_calls)] +#![allow(clippy::module_name_repetitions)] //! This crate currently provides an almost XML 1.0/1.1-compliant pull parser. +//! +//! Please note that functions of this parser may panic. +//! If a panic could cause a Denial Of Service in your codebase, *you're* responsible for wrapping access to this library in `catch_unwind`. #[cfg(doctest)] -#[macro_use] -extern crate doc_comment; +doc_comment::doctest!("../README.md"); -#[cfg(doctest)] -doctest!("../Readme.md"); - -pub use reader::EventReader; -pub use reader::ParserConfig; -pub use writer::EventWriter; -pub use writer::EmitterConfig; +pub use crate::reader::EventReader; +pub use crate::reader::ParserConfig; +pub use crate::util::Encoding; +pub use crate::writer::EmitterConfig; +pub use crate::writer::EventWriter; -pub mod macros; -pub mod name; pub mod attribute; pub mod common; pub mod escape; +#[doc(hidden)] // FIXME: not supposed to be public +pub mod macros; +pub mod name; pub mod namespace; pub mod reader; -pub mod writer; mod util; +pub mod writer; diff --git a/src/macros.rs b/src/macros.rs index 1cce3d6..25916d3 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -5,7 +5,8 @@ macro_rules! gen_setter { ($target:ty, $field:ident : into $t:ty) => { impl $target { - /// Sets the field to the provided value and returns updated config object. + /// See [`ParserConfig`][crate::ParserConfig] fields docs for details + #[inline] pub fn $field<T: Into<$t>>(mut self, value: T) -> $target { self.$field = value.into(); self @@ -14,13 +15,38 @@ macro_rules! gen_setter { }; ($target:ty, $field:ident : val $t:ty) => { impl $target { - /// Sets the field to the provided value and returns updated config object. + /// See [`ParserConfig`][crate::ParserConfig] fields docs for details + #[inline] pub fn $field(mut self, value: $t) -> $target { self.$field = value; self } } - } + }; + ($target:ty, $field:ident : delegate $t:ty) => { + impl $target { + /// See [`ParserConfig`][crate::ParserConfig] fields docs for details + #[inline] + pub fn $field(mut self, value: $t) -> $target { + self.c.$field = value; + self + } + } + }; + ($target:ty, $field:ident : c2 $t:ty) => { + impl $target { + /// See [`ParserConfig2`][crate::reader::ParserConfig] fields docs for details + #[inline] + #[must_use] + pub fn $field(self, value: $t) -> ParserConfig2 { + ParserConfig2 { + c: self, + ..Default::default() + } + .$field(value) + } + } + }; } macro_rules! gen_setters { diff --git a/src/name.rs b/src/name.rs index a20eae2..fc11981 100644 --- a/src/name.rs +++ b/src/name.rs @@ -4,7 +4,7 @@ use std::fmt; use std::str::FromStr; -use namespace::NS_NO_PREFIX; +use crate::namespace::NS_NO_PREFIX; /// Represents a qualified XML name. /// @@ -53,16 +53,16 @@ pub struct Name<'a> { pub namespace: Option<&'a str>, /// A name prefix, e.g. `xsi` in `xsi:string`. - pub prefix: Option<&'a str> + pub prefix: Option<&'a str>, } impl<'a> From<&'a str> for Name<'a> { fn from(s: &'a str) -> Name<'a> { - let mut parts = s.splitn(2, ":").fuse(); + let mut parts = s.splitn(2, ':').fuse(); match (parts.next(), parts.next()) { (Some(name), None) => Name::local(name), (Some(prefix), Some(name)) => Name::prefixed(name, prefix), - _ => unreachable!() + _ => unreachable!(), } } } @@ -74,52 +74,56 @@ impl<'a> From<(&'a str, &'a str)> for Name<'a> { } impl<'a> fmt::Display for Name<'a> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if let Some(namespace) = self.namespace { - write!(f, "{{{}}}", namespace)?; + write!(f, "{{{namespace}}}")?; } if let Some(prefix) = self.prefix { - write!(f, "{}:", prefix)?; + write!(f, "{prefix}:")?; } - write!(f, "{}", self.local_name) + f.write_str(self.local_name) } } impl<'a> Name<'a> { /// Returns an owned variant of the qualified name. + #[must_use] pub fn to_owned(&self) -> OwnedName { OwnedName { local_name: self.local_name.into(), - namespace: self.namespace.map(|s| s.into()), - prefix: self.prefix.map(|s| s.into()) + namespace: self.namespace.map(std::convert::Into::into), + prefix: self.prefix.map(std::convert::Into::into), } } /// Returns a new `Name` instance representing plain local name. #[inline] - pub fn local(local_name: &str) -> Name { + #[must_use] + pub fn local(local_name: &str) -> Name<'_> { Name { local_name, prefix: None, - namespace: None + namespace: None, } } /// Returns a new `Name` instance with the given local name and prefix. #[inline] + #[must_use] pub fn prefixed(local_name: &'a str, prefix: &'a str) -> Name<'a> { Name { local_name, namespace: None, - prefix: Some(prefix) + prefix: Some(prefix), } } /// Returns a new `Name` instance representing a qualified name with or without a prefix and /// with a namespace URI. #[inline] + #[must_use] pub fn qualified(local_name: &'a str, namespace: &'a str, prefix: Option<&'a str>) -> Name<'a> { Name { local_name, @@ -132,6 +136,7 @@ impl<'a> Name<'a> { /// /// This method is different from the autoimplemented `to_string()` because it does not /// include namespace URI in the result. + #[must_use] pub fn to_repr(&self) -> String { self.repr_display().to_string() } @@ -142,12 +147,14 @@ impl<'a> Name<'a> { /// This method is needed for efficiency purposes in order not to create unnecessary /// allocations. #[inline] - pub fn repr_display(&self) -> ReprDisplay { + #[must_use] + pub fn repr_display(&self) -> ReprDisplay<'_, '_> { ReprDisplay(self) } /// Returns either a prefix of this name or `namespace::NS_NO_PREFIX` constant. #[inline] + #[must_use] pub fn prefix_repr(&self) -> &str { self.prefix.unwrap_or(NS_NO_PREFIX) } @@ -155,13 +162,13 @@ impl<'a> Name<'a> { /// A wrapper around `Name` whose `Display` implementation prints the wrapped name as it is /// displayed in an XML document. -pub struct ReprDisplay<'a, 'b:'a>(&'a Name<'b>); +pub struct ReprDisplay<'a, 'b>(&'a Name<'b>); -impl<'a, 'b:'a> fmt::Display for ReprDisplay<'a, 'b> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +impl<'a, 'b: 'a> fmt::Display for ReprDisplay<'a, 'b> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self.0.prefix { Some(prefix) => write!(f, "{}:{}", prefix, self.0.local_name), - None => write!(f, "{}", self.0.local_name) + None => self.0.local_name.fmt(f), } } } @@ -183,18 +190,20 @@ pub struct OwnedName { impl fmt::Display for OwnedName { #[inline] - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(&self.borrow(), f) } } impl OwnedName { /// Constructs a borrowed `Name` based on this owned name. - pub fn borrow(&self) -> Name { + #[must_use] + #[inline] + pub fn borrow(&self) -> Name<'_> { Name { - local_name: &*self.local_name, - namespace: self.namespace.as_ref().map(|s| &**s), - prefix: self.prefix.as_ref().map(|s| &**s), + local_name: &self.local_name, + namespace: self.namespace.as_deref(), + prefix: self.prefix.as_deref(), } } @@ -217,22 +226,24 @@ impl OwnedName { OwnedName { local_name: local_name.into(), namespace: Some(namespace.into()), - prefix: prefix.map(|v| v.into()) + prefix: prefix.map(std::convert::Into::into), } } /// Returns an optional prefix by reference, equivalent to `self.borrow().prefix` /// but avoids extra work. #[inline] + #[must_use] pub fn prefix_ref(&self) -> Option<&str> { - self.prefix.as_ref().map(|s| &**s) + self.prefix.as_deref() } /// Returns an optional namespace by reference, equivalen to `self.borrow().namespace` /// but avoids extra work. #[inline] + #[must_use] pub fn namespace_ref(&self) -> Option<&str> { - self.namespace.as_ref().map(|s| &**s) + self.namespace.as_deref() } } diff --git a/src/namespace.rs b/src/namespace.rs index 1ab4a5c..216a982 100644 --- a/src/namespace.rs +++ b/src/namespace.rs @@ -1,9 +1,9 @@ //! Contains namespace manipulation types and functions. -use std::iter::{Map, Rev}; -use std::collections::btree_map::{BTreeMap, Entry}; use std::collections::btree_map::Iter as Entries; +use std::collections::btree_map::{BTreeMap, Entry}; use std::collections::HashSet; +use std::iter::{Map, Rev}; use std::slice::Iter; /// Designates prefix for namespace definitions. @@ -11,14 +11,14 @@ use std::slice::Iter; /// See [Namespaces in XML][namespace] spec for more information. /// /// [namespace]: http://www.w3.org/TR/xml-names/#ns-decl -pub const NS_XMLNS_PREFIX: &'static str = "xmlns"; +pub const NS_XMLNS_PREFIX: &str = "xmlns"; /// Designates the standard URI for `xmlns` prefix. /// -/// See [A Namespace Name for xmlns Attributes][1] for more information. +/// See [A Namespace Name for xmlns Attributes][namespace] for more information. /// /// [namespace]: http://www.w3.org/2000/xmlns/ -pub const NS_XMLNS_URI: &'static str = "http://www.w3.org/2000/xmlns/"; +pub const NS_XMLNS_URI: &str = "http://www.w3.org/2000/xmlns/"; /// Designates prefix for a namespace containing several special predefined attributes. /// @@ -29,12 +29,12 @@ pub const NS_XMLNS_URI: &'static str = "http://www.w3.org/2000/xmlns/"; /// [2]: http://www.w3.org/TR/REC-xml/#sec-lang-tag /// [3]: http://www.w3.org/TR/xmlbase/ /// [4]: http://www.w3.org/TR/xml-id/ -pub const NS_XML_PREFIX: &'static str = "xml"; +pub const NS_XML_PREFIX: &str = "xml"; /// Designates the standard URI for `xml` prefix. /// /// See `NS_XML_PREFIX` documentation for more information. -pub const NS_XML_URI: &'static str = "http://www.w3.org/XML/1998/namespace"; +pub const NS_XML_URI: &str = "http://www.w3.org/XML/1998/namespace"; /// Designates the absence of prefix in a qualified name. /// @@ -52,7 +52,7 @@ pub const NS_XML_URI: &'static str = "http://www.w3.org/XML/1998/namespace" /// By default empty prefix corresponds to absence of namespace, but this can change either /// when writing an XML document (manually) or when reading an XML document (based on namespace /// declarations). -pub const NS_NO_PREFIX: &'static str = ""; +pub const NS_NO_PREFIX: &str = ""; /// Designates an empty namespace URI, which is equivalent to absence of namespace. /// @@ -60,7 +60,7 @@ pub const NS_NO_PREFIX: &'static str = ""; /// empty prefix corresponds to absent namespace in `NamespaceStack` instances created with /// `NamespaceStack::default()`. Therefore, it can be used to restore `NS_NO_PREFIX` mapping /// in a namespace back to its default value. -pub const NS_EMPTY_URI: &'static str = ""; +pub const NS_EMPTY_URI: &str = ""; /// Namespace is a map from prefixes to namespace URIs. /// @@ -71,16 +71,21 @@ pub struct Namespace(pub BTreeMap<String, String>); impl Namespace { /// Returns an empty namespace. #[inline] - pub fn empty() -> Namespace { Namespace(BTreeMap::new()) } + #[must_use] + pub fn empty() -> Namespace { + Namespace(BTreeMap::new()) + } /// Checks whether this namespace is empty. #[inline] + #[must_use] pub fn is_empty(&self) -> bool { self.0.is_empty() } /// Checks whether this namespace is essentially empty, that is, it does not contain /// anything but default mappings. + #[must_use] pub fn is_essentially_empty(&self) -> bool { // a shortcut for a namespace which is definitely not empty if self.0.len() > 3 { return false; } @@ -101,7 +106,7 @@ impl Namespace { /// # Return value /// `true` if this namespace contains the given prefix, `false` otherwise. #[inline] - pub fn contains<P: ?Sized+AsRef<str>>(&self, prefix: &P) -> bool { + pub fn contains<P: ?Sized + AsRef<str>>(&self, prefix: &P) -> bool { self.0.contains_key(prefix.as_ref()) } @@ -157,7 +162,7 @@ impl Namespace { /// /// # Return value /// Namespace URI corresponding to the given prefix, if it is present. - pub fn get<'a, P: ?Sized+AsRef<str>>(&'a self, prefix: &P) -> Option<&'a str> { + pub fn get<'a, P: ?Sized + AsRef<str>>(&'a self, prefix: &P) -> Option<&'a str> { self.0.get(prefix.as_ref()).map(|s| &**s) } } @@ -174,7 +179,7 @@ impl<'a> IntoIterator for &'a Namespace { fn into_iter(self) -> Self::IntoIter { fn mapper<'a>((prefix, uri): (&'a String, &'a String)) -> UriMapping<'a> { - (&*prefix, &*uri) + (prefix, uri) } self.0.iter().map(mapper) } @@ -190,7 +195,10 @@ pub struct NamespaceStack(pub Vec<Namespace>); impl NamespaceStack { /// Returns an empty namespace stack. #[inline] - pub fn empty() -> NamespaceStack { NamespaceStack(Vec::with_capacity(2)) } + #[must_use] + pub fn empty() -> NamespaceStack { + NamespaceStack(Vec::with_capacity(2)) + } /// Returns a namespace stack with default items in it. /// @@ -199,6 +207,7 @@ impl NamespaceStack { /// * `xml` → `http://www.w3.org/XML/1998/namespace`; /// * `xmlns` → `http://www.w3.org/2000/xmlns/`. #[inline] + #[must_use] pub fn default() -> NamespaceStack { let mut nst = NamespaceStack::empty(); nst.push_empty(); @@ -246,6 +255,7 @@ impl NamespaceStack { /// /// Panics if the stack is empty. #[inline] + #[must_use] pub fn peek(&self) -> &Namespace { self.0.last().unwrap() } @@ -294,7 +304,11 @@ impl NamespaceStack { pub fn put<P, U>(&mut self, prefix: P, uri: U) -> bool where P: Into<String>, U: Into<String> { - self.0.last_mut().unwrap().put(prefix, uri) + if let Some(ns) = self.0.last_mut() { + ns.put(prefix, uri) + } else { + false + } } /// Performs a search for the given prefix in the whole stack. @@ -306,7 +320,7 @@ impl NamespaceStack { /// # Parameters /// * `prefix` --- namespace prefix. #[inline] - pub fn get<'a, P: ?Sized+AsRef<str>>(&'a self, prefix: &P) -> Option<&'a str> { + pub fn get<'a, P: ?Sized + AsRef<str>>(&'a self, prefix: &P) -> Option<&'a str> { let prefix = prefix.as_ref(); for ns in self.0.iter().rev() { match ns.get(prefix) { @@ -321,9 +335,10 @@ impl NamespaceStack { /// /// Namespaces are combined in left-to-right order, that is, rightmost namespace /// elements take priority over leftmost ones. + #[must_use] pub fn squash(&self) -> Namespace { let mut result = BTreeMap::new(); - for ns in self.0.iter() { + for ns in &self.0 { result.extend(ns.0.iter().map(|(k, v)| (k.clone(), v.clone()))); } Namespace(result) @@ -333,13 +348,14 @@ impl NamespaceStack { /// /// See `CheckedTarget` for more information. #[inline] - pub fn checked_target(&mut self) -> CheckedTarget { + pub fn checked_target(&mut self) -> CheckedTarget<'_> { CheckedTarget(self) } /// Returns an iterator over all mappings in this namespace stack. #[inline] - pub fn iter(&self) -> NamespaceStackMappings { + #[must_use] + pub fn iter(&self) -> NamespaceStackMappings<'_> { self.into_iter() } } @@ -361,7 +377,7 @@ impl NamespaceStack { pub struct NamespaceStackMappings<'a> { namespaces: Rev<Iter<'a, Namespace>>, current_namespace: Option<NamespaceMappings<'a>>, - used_keys: HashSet<&'a str> + used_keys: HashSet<&'a str>, } impl<'a> NamespaceStackMappings<'a> { @@ -379,7 +395,7 @@ impl<'a> Iterator for NamespaceStackMappings<'a> { if self.current_namespace.is_none() && !self.go_to_next_namespace() { return None; } - let next_item = self.current_namespace.as_mut().unwrap().next(); + let next_item = self.current_namespace.as_mut()?.next(); match next_item { // There is an element in the current namespace @@ -412,7 +428,7 @@ impl<'a> IntoIterator for &'a NamespaceStack { NamespaceStackMappings { namespaces: self.0.iter().rev(), current_namespace: None, - used_keys: HashSet::new() + used_keys: HashSet::new(), } } } diff --git a/src/reader/mod.rs b/src/reader.rs index 90f5b52..71ea79b 100644 --- a/src/reader/mod.rs +++ b/src/reader.rs @@ -3,44 +3,46 @@ //! The most important type in this module is `EventReader`, which provides an iterator //! view for events in XML document. -use std::io::{Read}; +use std::io::Read; +use std::iter::FusedIterator; use std::result; -use common::{Position, TextPosition}; +use crate::common::{Position, TextPosition}; pub use self::config::ParserConfig; -pub use self::events::XmlEvent; +pub use self::config::ParserConfig2; +pub use self::events::XmlEvent; use self::parser::PullParser; -mod lexer; -mod parser; mod config; mod events; +mod lexer; +mod parser; mod error; pub use self::error::{Error, ErrorKind}; /// A result type yielded by `XmlReader`. -pub type Result<T> = result::Result<T, Error>; +pub type Result<T, E = Error> = result::Result<T, E>; /// A wrapper around an `std::io::Read` instance which provides pull-based XML parsing. pub struct EventReader<R: Read> { source: R, - parser: PullParser + parser: PullParser, } impl<R: Read> EventReader<R> { /// Creates a new reader, consuming the given stream. #[inline] pub fn new(source: R) -> EventReader<R> { - EventReader::new_with_config(source, ParserConfig::new()) + EventReader::new_with_config(source, ParserConfig2::new()) } /// Creates a new reader with the provded configuration, consuming the given stream. #[inline] - pub fn new_with_config(source: R, config: ParserConfig) -> EventReader<R> { - EventReader { source: source, parser: PullParser::new(config) } + pub fn new_with_config(source: R, config: impl Into<ParserConfig2>) -> EventReader<R> { + EventReader { source, parser: PullParser::new(config) } } /// Pulls and returns next XML event from the stream. @@ -52,6 +54,27 @@ impl<R: Read> EventReader<R> { self.parser.next(&mut self.source) } + /// Skips all XML events until the next end tag at the current level. + /// + /// Convenience function that is useful for the case where you have + /// encountered a start tag that is of no interest and want to + /// skip the entire XML subtree until the corresponding end tag. + #[inline] + pub fn skip(&mut self) -> Result<()> { + let mut depth = 1; + + while depth > 0 { + match self.next()? { + XmlEvent::StartElement { .. } => depth += 1, + XmlEvent::EndElement { .. } => depth -= 1, + XmlEvent::EndDocument => unreachable!(), + _ => {} + } + } + + Ok(()) + } + pub fn source(&self) -> &R { &self.source } pub fn source_mut(&mut self) -> &mut R { &mut self.source } @@ -88,7 +111,7 @@ impl<R: Read> IntoIterator for EventReader<R> { /// it will be returned by the iterator once, and then it will stop producing events. pub struct Events<R: Read> { reader: EventReader<R>, - finished: bool + finished: bool, } impl<R: Read> Events<R> { @@ -103,17 +126,20 @@ impl<R: Read> Events<R> { } +impl<R: Read> FusedIterator for Events<R> { +} + impl<R: Read> Iterator for Events<R> { type Item = Result<XmlEvent>; #[inline] fn next(&mut self) -> Option<Result<XmlEvent>> { - if self.finished && !self.reader.parser.is_ignoring_end_of_stream() { None } - else { + if self.finished && !self.reader.parser.is_ignoring_end_of_stream() { + None + } else { let ev = self.reader.next(); - match ev { - Ok(XmlEvent::EndDocument) | Err(_) => self.finished = true, - _ => {} + if let Ok(XmlEvent::EndDocument) | Err(_) = ev { + self.finished = true; } Some(ev) } @@ -123,6 +149,7 @@ impl<R: Read> Iterator for Events<R> { impl<'r> EventReader<&'r [u8]> { /// A convenience method to create an `XmlReader` from a string slice. #[inline] + #[must_use] pub fn from_str(source: &'r str) -> EventReader<&'r [u8]> { EventReader::new(source.as_bytes()) } diff --git a/src/reader/config.rs b/src/reader/config.rs index 0abb165..3351997 100644 --- a/src/reader/config.rs +++ b/src/reader/config.rs @@ -1,8 +1,9 @@ //! Contains parser configuration structure. -use std::io::Read; use std::collections::HashMap; +use std::io::Read; -use reader::EventReader; +use crate::reader::EventReader; +use crate::util::Encoding; /// Parser configuration structure. /// @@ -103,6 +104,8 @@ impl ParserConfig { /// .ignore_comments(true) /// .coalesce_characters(false); /// ``` + #[must_use] + #[inline] pub fn new() -> ParserConfig { ParserConfig { trim_whitespace: false, @@ -179,3 +182,125 @@ gen_setters! { ParserConfig, replace_unknown_entity_references: val bool, ignore_root_level_whitespace: val bool } + +/// Backwards-compatible extension of `ParserConfig`, which will eventually be merged into the original `ParserConfig` struct +#[derive(Clone, PartialEq, Eq, Debug)] +#[non_exhaustive] +pub struct ParserConfig2 { + pub(crate) c: ParserConfig, + + /// Use this encoding as the default. Necessary for UTF-16 files without BOM. + pub override_encoding: Option<Encoding>, + + /// Allow `<?xml encoding="…">` to contain unsupported encoding names, + /// and interpret them as Latin1 instead. This will mangle non-ASCII characters, but usually it won't fail parsing. + pub ignore_invalid_encoding_declarations: bool, + + /// Documents with multiple root elements are ill-formed + pub allow_multiple_root_elements: bool, +} + +impl Default for ParserConfig2 { + fn default() -> Self { + ParserConfig2 { + c: Default::default(), + override_encoding: None, + ignore_invalid_encoding_declarations: false, + allow_multiple_root_elements: true, + } + } +} + +impl ParserConfig2 { + #[inline] + #[must_use] + pub fn new() -> Self { + Self::default() + } + + /// Read character encoding from `Content-Type` header. + /// Set this when parsing XML documents fetched over HTTP. + /// + /// `text/*` MIME types do *not* imply latin1. UTF-8 is always the default fallback. + #[must_use] pub fn content_type(mut self, mime_type: &str) -> Self { + let charset = mime_type.split_once(';') + .and_then(|(_, args)| args.split_once("charset")) + .and_then(|(_, args)| args.split_once('=')); + if let Some((_, charset)) = charset { + let name = charset.trim().trim_matches('"'); + match name.parse() { + Ok(enc) => { + self.override_encoding = Some(enc); + }, + Err(_) => {}, + } + } + self + } + + /// Creates an XML reader with this configuration. + /// + /// This is a convenience method for configuring and creating a reader at the same time: + /// + /// ```rust + /// use xml::reader::ParserConfig; + /// + /// let mut source: &[u8] = b"..."; + /// + /// let reader = ParserConfig::new() + /// .trim_whitespace(true) + /// .ignore_comments(true) + /// .coalesce_characters(false) + /// .create_reader(&mut source); + /// ``` + /// + /// This method is exactly equivalent to calling `EventReader::new_with_config()` with + /// this configuration object. + #[inline] + pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> { + EventReader::new_with_config(source, self) + } +} + +impl From<ParserConfig> for ParserConfig2 { + #[inline] + fn from(c: ParserConfig) -> Self { + Self { + c, + ..Default::default() + } + } +} + +gen_setters! { ParserConfig2, + override_encoding: val Option<Encoding>, + allow_multiple_root_elements: val bool, + ignore_invalid_encoding_declarations: val bool +} + +gen_setters! { ParserConfig, + override_encoding: c2 Option<Encoding>, + ignore_invalid_encoding_declarations: c2 bool, + allow_multiple_root_elements: c2 bool, + content_type: c2 &str +} + +gen_setters! { ParserConfig2, + trim_whitespace: delegate bool, + whitespace_to_characters: delegate bool, + cdata_to_characters: delegate bool, + ignore_comments: delegate bool, + coalesce_characters: delegate bool, + ignore_end_of_stream: delegate bool, + replace_unknown_entity_references: delegate bool, + ignore_root_level_whitespace: delegate bool +} + +#[test] +fn mime_parse() { + let c = ParserConfig2::new().content_type("text/xml;charset=Us-AScii"); + assert_eq!(c.override_encoding, Some(Encoding::Ascii)); + + let c = ParserConfig2::new().content_type("text/xml;charset = \"UTF-16\""); + assert_eq!(c.override_encoding, Some(Encoding::Utf16)); +} diff --git a/src/reader/error.rs b/src/reader/error.rs index 92378e6..8af35ae 100644 --- a/src/reader/error.rs +++ b/src/reader/error.rs @@ -1,12 +1,15 @@ +use crate::Encoding; +use crate::reader::lexer::Token; -use std::io; use std::borrow::Cow; -use std::fmt; use std::error; +use std::error::Error as _; +use std::fmt; +use std::io; use std::str; -use util; -use common::{Position, TextPosition}; +use crate::common::{Position, TextPosition}; +use crate::util; #[derive(Debug)] pub enum ErrorKind { @@ -16,18 +19,127 @@ pub enum ErrorKind { UnexpectedEof, } +#[derive(Debug, Clone, PartialEq)] +#[non_exhaustive] +pub(crate) enum SyntaxError { + CannotRedefineXmlnsPrefix, + CannotRedefineXmlPrefix, + /// Recursive custom entity expanded to too many chars, it could be DoS + EntityTooBig, + EmptyEntity, + NoRootElement, + ProcessingInstructionWithoutName, + UnbalancedRootElement, + UnexpectedEof, + UnexpectedOpeningTag, + /// Missing `]]>` + UnclosedCdata, + UnexpectedQualifiedName(Token), + UnexpectedTokenOutsideRoot(Token), + UnexpectedToken(Token), + UnexpectedTokenInEntity(Token), + UnexpectedTokenInClosingTag(Token), + UnexpectedTokenInOpeningTag(Token), + InvalidQualifiedName(Box<str>), + UnboundAttribute(Box<str>), + UnboundElementPrefix(Box<str>), + UnexpectedClosingTag(Box<str>), + UnexpectedName(Box<str>), + /// Found <?xml-like PI not at the beginning of a document, + /// which is an error, see section 2.6 of XML 1.1 spec + UnexpectedProcessingInstruction(Box<str>, Token), + CannotUndefinePrefix(Box<str>), + InvalidCharacterEntity(u32), + InvalidDefaultNamespace(Box<str>), + InvalidNamePrefix(Box<str>), + InvalidNumericEntity(Box<str>), + InvalidStandaloneDeclaration(Box<str>), + InvalidXmlProcessingInstruction(Box<str>), + RedefinedAttribute(Box<str>), + UndefinedEntity(Box<str>), + UnexpectedEntity(Box<str>), + UnexpectedNameInsideXml(Box<str>), + UnsupportedEncoding(Box<str>), + /// In DTD + UnknownMarkupDeclaration(Box<str>), + UnexpectedXmlVersion(Box<str>), + ConflictingEncoding(Encoding, Encoding), + UnexpectedTokenBefore(&'static str, char), +} + +impl fmt::Display for SyntaxError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.to_cow().fmt(f) + } +} + +impl SyntaxError { + #[inline(never)] + #[cold] + pub(crate) fn to_cow(&self) -> Cow<'static, str> { + match *self { + Self::CannotRedefineXmlnsPrefix => "Cannot redefine XMLNS prefix".into(), + Self::CannotRedefineXmlPrefix => "Default XMLNS prefix cannot be rebound to another value".into(), + Self::EmptyEntity => "Encountered empty entity".into(), + Self::EntityTooBig => "Entity too big".into(), + Self::NoRootElement => "Unexpected end of stream: no root element found".into(), + Self::ProcessingInstructionWithoutName => "Encountered processing instruction without a name".into(), + Self::UnbalancedRootElement => "Unexpected end of stream: still inside the root element".into(), + Self::UnclosedCdata => "Unclosed <![CDATA[".into(), + Self::UnexpectedEof => "Unexpected end of stream".into(), + Self::UnexpectedOpeningTag => "'<' is not allowed in attributes".into(), + Self::CannotUndefinePrefix(ref ln) => format!("Cannot undefine prefix '{ln}'").into(), + Self::ConflictingEncoding(a, b) => format!("Declared encoding {a}, but uses {b}").into(), + Self::InvalidCharacterEntity(num) => format!("Invalid character U+{num:04X}").into(), + Self::InvalidDefaultNamespace(ref name) => format!( "Namespace '{name}' cannot be default").into(), + Self::InvalidNamePrefix(ref prefix) => format!("'{prefix}' cannot be an element name prefix").into(), + Self::InvalidNumericEntity(ref v) => format!("Invalid numeric entity: {v}").into(), + Self::InvalidQualifiedName(ref e) => format!("Qualified name is invalid: {e}").into(), + Self::InvalidStandaloneDeclaration(ref value) => format!("Invalid standalone declaration value: {value}").into(), + Self::InvalidXmlProcessingInstruction(ref name) => format!("Invalid processing instruction: <?{name} - \"<?xml\"-like PI is only valid at the beginning of the document").into(), + Self::RedefinedAttribute(ref name) => format!("Attribute '{name}' is redefined").into(), + Self::UnboundAttribute(ref name) => format!("Attribute {name} prefix is unbound").into(), + Self::UnboundElementPrefix(ref name) => format!("Element {name} prefix is unbound").into(), + Self::UndefinedEntity(ref v) => format!("Undefined entity: {v}").into(), + Self::UnexpectedClosingTag(ref expected_got) => format!("Unexpected closing tag: {expected_got}").into(), + Self::UnexpectedEntity(ref name) => format!("Unexpected entity: {name}").into(), + Self::UnexpectedName(ref name) => format!("Unexpected name: {name}").into(), + Self::UnexpectedNameInsideXml(ref name) => format!("Unexpected name inside XML declaration: {name}").into(), + Self::UnexpectedProcessingInstruction(ref buf, token) => format!("Unexpected token inside processing instruction: <?{buf}{token}").into(), + Self::UnexpectedQualifiedName(e) => format!("Unexpected token inside qualified name: {e}").into(), + Self::UnexpectedToken(token) => format!("Unexpected token: {token}").into(), + Self::UnexpectedTokenBefore(before, c) => format!("Unexpected token '{before}' before '{c}'").into(), + Self::UnexpectedTokenInClosingTag(token) => format!("Unexpected token inside closing tag: {token}").into(), + Self::UnexpectedTokenInEntity(token) => format!("Unexpected token inside entity: {token}").into(), + Self::UnexpectedTokenInOpeningTag(token) => format!("Unexpected token inside opening tag: {token}").into(), + Self::UnexpectedTokenOutsideRoot(token) => format!("Unexpected characters outside the root element: {token}").into(), + Self::UnexpectedXmlVersion(ref version) => format!("Invalid XML version: {version}").into(), + Self::UnknownMarkupDeclaration(ref v) => format!("Unknown markup declaration: {v}").into(), + Self::UnsupportedEncoding(ref v) => format!("Unsupported encoding: {v}").into(), + } + } +} + /// An XML parsing error. /// /// Consists of a 2D position in a document and a textual message describing the error. #[derive(Clone, PartialEq, Eq, Debug)] pub struct Error { - pos: TextPosition, - kind: ErrorKind, + pub(crate) pos: TextPosition, + pub(crate) kind: ErrorKind, } impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{} {}", self.pos, self.msg()) + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8}; + + write!(f, "{} ", self.pos)?; + match &self.kind { + Io(io_error) => io_error.fmt(f), + Utf8(reason) => reason.fmt(f), + Syntax(msg) => f.write_str(msg), + UnexpectedEof => f.write_str("Unexpected EOF"), + } } } @@ -38,49 +150,59 @@ impl Position for Error { impl Error { /// Returns a reference to a message which is contained inside this error. - #[inline] - pub fn msg(&self) -> &str { - use self::ErrorKind::*; - match self.kind { - UnexpectedEof => &"Unexpected EOF", - Utf8(ref reason) => error_description(reason), - Io(ref io_error) => error_description(io_error), - Syntax(ref msg) => msg.as_ref(), + #[cold] + #[doc(hidden)] + #[allow(deprecated)] + #[must_use] pub fn msg(&self) -> &str { + use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8}; + match &self.kind { + Io(io_error) => io_error.description(), + Utf8(reason) => reason.description(), + Syntax(msg) => msg.as_ref(), + UnexpectedEof => "Unexpected EOF", } } - pub fn kind(&self) -> &ErrorKind { &self.kind } + #[must_use] + #[inline] + pub fn kind(&self) -> &ErrorKind { + &self.kind + } } impl error::Error for Error { - #[inline] + #[allow(deprecated)] + #[cold] fn description(&self) -> &str { self.msg() } } impl<'a, P, M> From<(&'a P, M)> for Error where P: Position, M: Into<Cow<'static, str>> { + #[cold] fn from(orig: (&'a P, M)) -> Self { - Error{ + Error { pos: orig.0.position(), - kind: ErrorKind::Syntax(orig.1.into()) + kind: ErrorKind::Syntax(orig.1.into()), } } } impl From<util::CharReadError> for Error { + #[cold] fn from(e: util::CharReadError) -> Self { - use util::CharReadError::*; - Error{ + use crate::util::CharReadError::{Io, UnexpectedEof, Utf8}; + Error { pos: TextPosition::new(), kind: match e { UnexpectedEof => ErrorKind::UnexpectedEof, Utf8(reason) => ErrorKind::Utf8(reason), Io(io_error) => ErrorKind::Io(io_error), - } + }, } } } impl From<io::Error> for Error { + #[cold] fn from(e: io::Error) -> Self { Error { pos: TextPosition::new(), @@ -90,26 +212,28 @@ impl From<io::Error> for Error { } impl Clone for ErrorKind { + #[cold] fn clone(&self) -> Self { - use self::ErrorKind::*; - match *self { + use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8}; + match self { UnexpectedEof => UnexpectedEof, - Utf8(ref reason) => Utf8(reason.clone()), - Io(ref io_error) => Io(io::Error::new(io_error.kind(), error_description(io_error))), - Syntax(ref msg) => Syntax(msg.clone()), + Utf8(reason) => Utf8(*reason), + Io(io_error) => Io(io::Error::new(io_error.kind(), io_error.to_string())), + Syntax(msg) => Syntax(msg.clone()), } } } impl PartialEq for ErrorKind { + #[allow(deprecated)] fn eq(&self, other: &ErrorKind) -> bool { - use self::ErrorKind::*; + use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8}; match (self, other) { - (&UnexpectedEof, &UnexpectedEof) => true, - (&Utf8(ref left), &Utf8(ref right)) => left == right, - (&Io(ref left), &Io(ref right)) => + (UnexpectedEof, UnexpectedEof) => true, + (Utf8(left), Utf8(right)) => left == right, + (Io(left), Io(right)) => left.kind() == right.kind() && - error_description(left) == error_description(right), - (&Syntax(ref left), &Syntax(ref right)) => + left.description() == right.description(), + (Syntax(left), Syntax(right)) => left == right, (_, _) => false, @@ -118,4 +242,7 @@ impl PartialEq for ErrorKind { } impl Eq for ErrorKind {} -fn error_description(e: &error::Error) -> &str { e.description() } +#[test] +fn err_size() { + assert!(std::mem::size_of::<SyntaxError>() <= 24); +} diff --git a/src/reader/events.rs b/src/reader/events.rs index 46d7621..de2b930 100644 --- a/src/reader/events.rs +++ b/src/reader/events.rs @@ -1,12 +1,12 @@ //! Contains `XmlEvent` datatype, instances of which are emitted by the parser. -use std::fmt; use std::borrow::Cow; +use std::fmt; -use name::OwnedName; -use attribute::OwnedAttribute; -use common::XmlVersion; -use namespace::Namespace; +use crate::attribute::OwnedAttribute; +use crate::common::XmlVersion; +use crate::name::OwnedName; +use crate::namespace::Namespace; /// An element of an XML input stream. /// @@ -36,7 +36,7 @@ pub enum XmlEvent { /// If XML document is not present or does not contain `standalone` attribute, /// defaults to `None`. This field is currently used for no other purpose than /// informational. - standalone: Option<bool> + standalone: Option<bool>, }, /// Denotes to the end of the document stream. @@ -54,7 +54,7 @@ pub enum XmlEvent { name: String, /// Processing instruction content. - data: Option<String> + data: Option<String>, }, /// Denotes a beginning of an XML element. @@ -80,7 +80,7 @@ pub enum XmlEvent { /// latter case it is emitted immediately after corresponding `StartElement` event. EndElement { /// Qualified name of the element. - name: OwnedName + name: OwnedName, }, /// Denotes CDATA content. @@ -111,19 +111,20 @@ pub enum XmlEvent { /// It is possible to configure a parser to emit `Characters` event instead of `Whitespace`. /// See `pull::ParserConfiguration` structure for more information. When combined with whitespace /// trimming, it will eliminate standalone whitespace from the event stream completely. - Whitespace(String) + Whitespace(String), } impl fmt::Debug for XmlEvent { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + #[cold] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { - XmlEvent::StartDocument { ref version, ref encoding, ref standalone } => - write!(f, "StartDocument({}, {}, {:?})", version, *encoding, *standalone), + XmlEvent::StartDocument { ref version, ref encoding, standalone } => + write!(f, "StartDocument({}, {}, {:?})", version, *encoding, standalone), XmlEvent::EndDocument => write!(f, "EndDocument"), XmlEvent::ProcessingInstruction { ref name, ref data } => write!(f, "ProcessingInstruction({}{})", *name, match *data { - Some(ref data) => format!(", {}", data), + Some(ref data) => format!(", {data}"), None => String::new() }), XmlEvent::StartElement { ref name, ref attributes, namespace: Namespace(ref namespace) } => @@ -136,15 +137,15 @@ impl fmt::Debug for XmlEvent { format!(", [{}]", attributes.join(", ")) }), XmlEvent::EndElement { ref name } => - write!(f, "EndElement({})", name), + write!(f, "EndElement({name})"), XmlEvent::Comment(ref data) => - write!(f, "Comment({})", data), + write!(f, "Comment({data})"), XmlEvent::CData(ref data) => - write!(f, "CData({})", data), + write!(f, "CData({data})"), XmlEvent::Characters(ref data) => - write!(f, "Characters({})", data), + write!(f, "Characters({data})"), XmlEvent::Whitespace(ref data) => - write!(f, "Whitespace({})", data) + write!(f, "Whitespace({data})") } } } @@ -188,32 +189,33 @@ impl XmlEvent { /// ``` /// /// Note that this API may change or get additions in future to improve its ergonomics. - pub fn as_writer_event<'a>(&'a self) -> Option<::writer::events::XmlEvent<'a>> { + #[must_use] + pub fn as_writer_event(&self) -> Option<crate::writer::events::XmlEvent<'_>> { match *self { XmlEvent::StartDocument { version, ref encoding, standalone } => - Some(::writer::events::XmlEvent::StartDocument { - version: version, + Some(crate::writer::events::XmlEvent::StartDocument { + version, encoding: Some(encoding), - standalone: standalone + standalone }), XmlEvent::ProcessingInstruction { ref name, ref data } => - Some(::writer::events::XmlEvent::ProcessingInstruction { - name: name, - data: data.as_ref().map(|s| &s[..]) + Some(crate::writer::events::XmlEvent::ProcessingInstruction { + name, + data: data.as_ref().map(|s| &**s) }), XmlEvent::StartElement { ref name, ref attributes, ref namespace } => - Some(::writer::events::XmlEvent::StartElement { + Some(crate::writer::events::XmlEvent::StartElement { name: name.borrow(), attributes: attributes.iter().map(|a| a.borrow()).collect(), namespace: Cow::Borrowed(namespace) }), XmlEvent::EndElement { ref name } => - Some(::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }), - XmlEvent::Comment(ref data) => Some(::writer::events::XmlEvent::Comment(data)), - XmlEvent::CData(ref data) => Some(::writer::events::XmlEvent::CData(data)), - XmlEvent::Characters(ref data) => Some(::writer::events::XmlEvent::Characters(data)), - XmlEvent::Whitespace(ref data) => Some(::writer::events::XmlEvent::Characters(data)), - _ => None + Some(crate::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }), + XmlEvent::Comment(ref data) => Some(crate::writer::events::XmlEvent::Comment(data)), + XmlEvent::CData(ref data) => Some(crate::writer::events::XmlEvent::CData(data)), + XmlEvent::Characters(ref data) | + XmlEvent::Whitespace(ref data) => Some(crate::writer::events::XmlEvent::Characters(data)), + XmlEvent::EndDocument => None, } } } diff --git a/src/reader/lexer.rs b/src/reader/lexer.rs index c466db9..a8345ba 100644 --- a/src/reader/lexer.rs +++ b/src/reader/lexer.rs @@ -2,20 +2,25 @@ //! //! This module is for internal use. Use `xml::pull` module to do parsing. -use std::fmt; + +use crate::reader::ErrorKind; +use crate::reader::error::SyntaxError; use std::collections::VecDeque; +use std::fmt; use std::io::Read; use std::result; -use std::borrow::Cow; +use crate::common::{is_name_char, is_whitespace_char, Position, TextPosition, is_xml10_char, is_xml11_char}; +use crate::reader::Error; +use crate::util::{CharReader, Encoding}; -use common::{Position, TextPosition, is_whitespace_char, is_name_char}; -use reader::Error; -use util; +/// Limits to defend from billion laughs attack +const MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000; +const MAX_ENTITY_EXPANSION_DEPTH: u8 = 10; /// `Token` represents a single lexeme of an XML document. These lexemes /// are used to perform actual parsing. #[derive(Copy, Clone, PartialEq, Eq, Debug)] -pub enum Token { +pub(crate) enum Token { /// `<?` ProcessingInstructionStart, /// `?>` @@ -34,12 +39,8 @@ pub enum Token { CommentStart, /// `-->` CommentEnd, - /// A chunk of characters, used for errors recovery. - Chunk(&'static str), /// Any non-special character except whitespace. Character(char), - /// Whitespace character. - Whitespace(char), /// `=` EqualsSign, /// `'` @@ -54,14 +55,16 @@ pub enum Token { ReferenceStart, /// `;` ReferenceEnd, + /// `<!` of `ENTITY` + MarkupDeclarationStart, } impl fmt::Display for Token { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + #[cold] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { - Token::Chunk(s) => write!(f, "{}", s), - Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c), - other => write!(f, "{}", match other { + Token::Character(c) => c.fmt(f), + other => match other { Token::OpeningTagStart => "<", Token::ProcessingInstructionStart => "<?", Token::DoctypeStart => "<!DOCTYPE", @@ -78,8 +81,9 @@ impl fmt::Display for Token { Token::EqualsSign => "=", Token::SingleQuote => "'", Token::DoubleQuote => "\"", + Token::MarkupDeclarationStart => "<!", _ => unreachable!() - }) + }.fmt(f), } } } @@ -103,47 +107,28 @@ impl Token { Token::EqualsSign => Some("="), Token::SingleQuote => Some("'"), Token::DoubleQuote => Some("\""), - Token::Chunk(s) => Some(s), _ => None } } // using String.push_str(token.to_string()) is simply way too slow pub fn push_to_string(&self, target: &mut String) { - match self.as_static_str() { - Some(s) => { target.push_str(s); } - None => { - match *self { - Token::Character(c) | Token::Whitespace(c) => target.push(c), - _ => unreachable!() - } - } - } - } - - /// Returns `true` if this token contains data that can be interpreted - /// as a part of the text. Surprisingly, this also means '>' and '=' and '"' and "'" and '-->'. - #[inline] - pub fn contains_char_data(&self) -> bool { - match *self { - Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd | - Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote | Token::CDataEnd | - Token::ProcessingInstructionEnd | Token::EmptyTagEnd => true, - _ => false - } - } - - /// Returns `true` if this token corresponds to a white space character. - #[inline] - pub fn is_whitespace(&self) -> bool { match *self { - Token::Whitespace(_) => true, - _ => false + Token::Character(c) => { + debug_assert!(is_xml10_char(c) || is_xml11_char(c)); + target.push(c) + }, + _ => if let Some(s) = self.as_static_str() { + target.push_str(s); + } } } } +#[derive(Copy, Clone)] enum State { + /// Default state + Normal, /// Triggered on '<' TagStarted, /// Triggered on '<!' @@ -152,8 +137,10 @@ enum State { CommentStarted, /// Triggered on '<!D' up to '<!DOCTYPE' DoctypeStarted(DoctypeStartedSubstate), + /// Other items like `<!ELEMENT` in DTD + InsideMarkupDeclaration, /// Triggered after DoctypeStarted to handle sub elements - DoctypeFinishing(u8), + InsideDoctype, /// Triggered on '<![' up to '<![CDATA' CDataStarted(CDataStartedSubstate), /// Triggered on '?' @@ -162,10 +149,23 @@ enum State { EmptyTagClosing, /// Triggered on '-' up to '--' CommentClosing(ClosingSubstate), - /// Triggered on ']' up to ']]' + /// Triggered on ']' up to ']]' inside CDATA CDataClosing(ClosingSubstate), - /// Default state - Normal + /// Triggered on ']' up to ']]' outside CDATA + InvalidCDataClosing(ClosingSubstate), + /// After `<!--` + InsideComment, + /// After `<[[` + InsideCdata, + /// After `<?` + InsideProcessingInstruction, + /// `<!ENTITY "here">` + InsideMarkupDeclarationQuotedString(QuoteStyle), +} + +#[derive(Copy, Clone, Eq, PartialEq)] +enum QuoteStyle { + Single, Double } #[derive(Copy, Clone)] @@ -184,7 +184,7 @@ enum CDataStartedSubstate { } /// `Result` represents lexing result. It is either a token or an error message. -pub type Result = result::Result<Option<Token>, Error>; +pub(crate) type Result<T = Option<Token>, E = Error> = result::Result<T, E>; /// Helps to set up a dispatch table for lexing large unambigous tokens like /// `<![CDATA[` or `<!DOCTYPE `. @@ -216,15 +216,19 @@ macro_rules! dispatch_on_enum_state( /// When it is not set, errors will be reported as `Err` objects with a string message. /// By default this flag is not set. Use `enable_errors` and `disable_errors` methods /// to toggle the behavior. -pub struct Lexer { +pub(crate) struct Lexer { + st: State, + reader: CharReader, pos: TextPosition, head_pos: TextPosition, char_queue: VecDeque<char>, - st: State, - skip_errors: bool, - inside_comment: bool, + /// Default state to go back to after a tag end (may be `InsideDoctype`) + normal_state: State, inside_token: bool, - eof_handled: bool + eof_handled: bool, + reparse_depth: u8, + #[cfg(test)] + skip_errors: bool, } impl Position for Lexer { @@ -235,37 +239,33 @@ impl Position for Lexer { impl Lexer { /// Returns a new lexer with default state. - pub fn new() -> Lexer { + pub(crate) fn new() -> Lexer { Lexer { + reader: CharReader::new(), pos: TextPosition::new(), head_pos: TextPosition::new(), char_queue: VecDeque::with_capacity(4), // TODO: check size st: State::Normal, - skip_errors: false, - inside_comment: false, + normal_state: State::Normal, inside_token: false, - eof_handled: false + eof_handled: false, + reparse_depth: 0, + #[cfg(test)] + skip_errors: false, } } - /// Enables error handling so `next_token` will return `Some(Err(..))` - /// upon invalid lexeme. - #[inline] - pub fn enable_errors(&mut self) { self.skip_errors = false; } + pub(crate) fn encoding(&mut self) -> Encoding { + self.reader.encoding + } + + pub(crate) fn set_encoding(&mut self, encoding: Encoding) { + self.reader.encoding = encoding; + } /// Disables error handling so `next_token` will return `Some(Chunk(..))` /// upon invalid lexeme with this lexeme content. - #[inline] - pub fn disable_errors(&mut self) { self.skip_errors = true; } - - /// Enables special handling of some lexemes which should be done when we're parsing comment - /// internals. - #[inline] - pub fn inside_comment(&mut self) { self.inside_comment = true; } - - /// Disables the effect of `inside_comment()` method. - #[inline] - pub fn outside_comment(&mut self) { self.inside_comment = false; } + #[cfg(test)] fn disable_errors(&mut self) { self.skip_errors = true; } /// Reset the eof handled flag of the lexer. #[inline] @@ -293,23 +293,29 @@ impl Lexer { // Check if we have saved a char or two for ourselves while let Some(c) = self.char_queue.pop_front() { - match try!(self.read_next_token(c)) { + match self.dispatch_char(c)? { Some(t) => { self.inside_token = false; return Ok(Some(t)); } - None => {} // continue + None => {} // continue } } - + // if char_queue is empty, all circular reparsing is done + self.reparse_depth = 0; loop { - // TODO: this should handle multiple encodings - let c = match try!(util::next_char_from(b)) { - Some(c) => c, // got next char - None => break, // nothing to read left + let c = match self.reader.next_char_from(b)? { + Some(c) => c, // got next char + None => break, // nothing to read left }; - match try!(self.read_next_token(c)) { + if c == '\n' { + self.head_pos.new_line(); + } else { + self.head_pos.advance(1); + } + + match self.dispatch_char(c)? { Some(t) => { self.inside_token = false; return Ok(Some(t)); @@ -320,61 +326,67 @@ impl Lexer { } } + self.end_of_stream() + } + + #[inline(never)] + fn end_of_stream(&mut self) -> Result { // Handle end of stream self.eof_handled = true; self.pos = self.head_pos; match self.st { + State::InsideCdata | State::CDataClosing(_) => Err(self.error(SyntaxError::UnclosedCdata)), State::TagStarted | State::CommentOrCDataOrDoctypeStarted | State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) | State::CommentClosing(ClosingSubstate::Second) | - State::DoctypeFinishing(_) => - Err(self.error("Unexpected end of stream")), - State::ProcessingInstructionClosing => - Ok(Some(Token::Character('?'))), + State::InsideComment | State::InsideMarkupDeclaration | + State::InsideProcessingInstruction | State::ProcessingInstructionClosing | + State::InsideDoctype | State::InsideMarkupDeclarationQuotedString(_) => + Err(self.error(SyntaxError::UnexpectedEof)), State::EmptyTagClosing => Ok(Some(Token::Character('/'))), State::CommentClosing(ClosingSubstate::First) => Ok(Some(Token::Character('-'))), - State::CDataClosing(ClosingSubstate::First) => + State::InvalidCDataClosing(ClosingSubstate::First) => Ok(Some(Token::Character(']'))), - State::CDataClosing(ClosingSubstate::Second) => - Ok(Some(Token::Chunk("]]"))), + State::InvalidCDataClosing(ClosingSubstate::Second) => { + self.eof_handled = false; + self.move_to_with_unread(State::Normal, &[']'], Token::Character(']')) + }, State::Normal => - Ok(None) + Ok(None), } } - #[inline] - fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error { - (self, msg).into() - } - - #[inline] - fn read_next_token(&mut self, c: char) -> Result { - let res = self.dispatch_char(c); - if self.char_queue.is_empty() { - if c == '\n' { - self.head_pos.new_line(); - } else { - self.head_pos.advance(1); - } + #[cold] + fn error(&self, e: SyntaxError) -> Error { + Error { + pos: self.position(), + kind: ErrorKind::Syntax(e.to_cow()), } - res } + + #[inline(never)] fn dispatch_char(&mut self, c: char) -> Result { match self.st { State::Normal => self.normal(c), State::TagStarted => self.tag_opened(c), + State::EmptyTagClosing => self.empty_element_closing(c), State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c), - State::CommentStarted => self.comment_started(c), + State::InsideCdata => self.inside_cdata(c), State::CDataStarted(s) => self.cdata_started(c, s), - State::DoctypeStarted(s) => self.doctype_started(c, s), - State::DoctypeFinishing(d) => self.doctype_finishing(c, d), + State::InsideComment => self.inside_comment_state(c), + State::CommentStarted => self.comment_started(c), + State::InsideProcessingInstruction => self.inside_processing_instruction(c), State::ProcessingInstructionClosing => self.processing_instruction_closing(c), - State::EmptyTagClosing => self.empty_element_closing(c), State::CommentClosing(s) => self.comment_closing(c, s), - State::CDataClosing(s) => self.cdata_closing(c, s) + State::CDataClosing(s) => self.cdata_closing(c, s), + State::InsideDoctype => self.inside_doctype(c), + State::DoctypeStarted(s) => self.doctype_started(c, s), + State::InvalidCDataClosing(s) => self.invalid_cdata_closing(c, s), + State::InsideMarkupDeclaration => self.markup_declaration(c), + State::InsideMarkupDeclarationQuotedString(q) => self.markup_declaration_string(c, q), } } @@ -391,18 +403,50 @@ impl Lexer { } #[inline] + fn move_to_and_reset_normal(&mut self, st: State, token: Token) -> Result { + self.normal_state = st; + self.st = st; + Ok(Some(token)) + } + fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result { - self.char_queue.extend(cs.iter().cloned()); + for c in cs.iter().rev().copied() { + self.char_queue.push_front(c); + } self.move_to_with(st, token) } + pub(crate) fn reparse(&mut self, markup: &str) -> Result<()> { + if markup.is_empty() { + return Ok(()); + } + + self.reparse_depth += 1; + if self.reparse_depth > MAX_ENTITY_EXPANSION_DEPTH || self.char_queue.len() > MAX_ENTITY_EXPANSION_LENGTH { + return Err(self.error(SyntaxError::EntityTooBig)) + } + + self.eof_handled = false; + self.char_queue.reserve(markup.len()); + for c in markup.chars().rev() { + self.char_queue.push_front(c); + } + + Ok(()) + } + fn handle_error(&mut self, chunk: &'static str, c: char) -> Result { - self.char_queue.push_back(c); - if self.skip_errors || (self.inside_comment && chunk != "--") { // FIXME: looks hacky - self.move_to_with(State::Normal, Token::Chunk(chunk)) - } else { - Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c))) + debug_assert!(!chunk.is_empty()); + + #[cfg(test)] + if self.skip_errors { + let mut chars = chunk.chars(); + let first = chars.next().unwrap_or('\0'); + self.char_queue.extend(chars); + self.char_queue.push_back(c); + return self.move_to_with(State::Normal, Token::Character(first)); } + Err(self.error(SyntaxError::UnexpectedTokenBefore(chunk, c))) } /// Encountered a char @@ -414,12 +458,39 @@ impl Lexer { '=' => Ok(Some(Token::EqualsSign)), '"' => Ok(Some(Token::DoubleQuote)), '\'' => Ok(Some(Token::SingleQuote)), - '?' => self.move_to(State::ProcessingInstructionClosing), - '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)), + ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::First)), + '&' => Ok(Some(Token::ReferenceStart)), + ';' => Ok(Some(Token::ReferenceEnd)), + _ => Ok(Some(Token::Character(c))) + } + } + + fn inside_cdata(&mut self, c: char) -> Result { + match c { ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)), + _ => Ok(Some(Token::Character(c))) + } + } + + fn inside_processing_instruction(&mut self, c: char) -> Result { + // These tokens are used by `<?xml?>` parser + match c { + '?' => self.move_to(State::ProcessingInstructionClosing), + '<' => Ok(Some(Token::OpeningTagStart)), + '>' => Ok(Some(Token::TagEnd)), + '/' => Ok(Some(Token::ClosingTagStart)), + '=' => Ok(Some(Token::EqualsSign)), + '"' => Ok(Some(Token::DoubleQuote)), + '\'' => Ok(Some(Token::SingleQuote)), '&' => Ok(Some(Token::ReferenceStart)), ';' => Ok(Some(Token::ReferenceEnd)), - _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))), + _ => Ok(Some(Token::Character(c))) + } + } + + fn inside_comment_state(&mut self, c: char) -> Result { + match c { + '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)), _ => Ok(Some(Token::Character(c))) } } @@ -427,11 +498,11 @@ impl Lexer { /// Encountered '<' fn tag_opened(&mut self, c: char) -> Result { match c { - '?' => self.move_to_with(State::Normal, Token::ProcessingInstructionStart), - '/' => self.move_to_with(State::Normal, Token::ClosingTagStart), + '?' => self.move_to_with(State::InsideProcessingInstruction, Token::ProcessingInstructionStart), + '/' => self.move_to_with(self.normal_state, Token::ClosingTagStart), '!' => self.move_to(State::CommentOrCDataOrDoctypeStarted), - _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart), - _ if is_name_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart), + _ if is_whitespace_char(c) => self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart), + _ if is_name_char(c) => self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart), _ => self.handle_error("<", c) } } @@ -442,31 +513,55 @@ impl Lexer { '-' => self.move_to(State::CommentStarted), '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)), 'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)), - _ => self.handle_error("<!", c) + 'E' | 'A' | 'N' if matches!(self.normal_state, State::InsideDoctype) => { + self.move_to_with_unread(State::InsideMarkupDeclaration, &[c], Token::MarkupDeclarationStart) + }, + _ => self.handle_error("<!", c), } } /// Encountered '<!-' fn comment_started(&mut self, c: char) -> Result { match c { - '-' => self.move_to_with(State::Normal, Token::CommentStart), - _ => self.handle_error("<!-", c) + '-' => self.move_to_with(State::InsideComment, Token::CommentStart), + _ => self.handle_error("<!-", c), } } /// Encountered '<![' fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result { - use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA}; + use self::CDataStartedSubstate::{C, CD, CDA, CDAT, CDATA, E}; dispatch_on_enum_state!(self, s, c, State::CDataStarted, E ; 'C' ; C ; "<![", C ; 'D' ; CD ; "<![C", CD ; 'A' ; CDA ; "<![CD", CDA ; 'T' ; CDAT ; "<![CDA", CDAT ; 'A' ; CDATA ; "<![CDAT"; - CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::Normal, Token::CDataStart) + CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::InsideCdata, Token::CDataStart) ) } + /// Encountered '<!…' that isn't DOCTYPE or CDATA + fn markup_declaration(&mut self, c: char) -> Result { + match c { + '<' => self.handle_error("<!", c), + '>' => self.move_to_with(self.normal_state, Token::TagEnd), + '&' => Ok(Some(Token::ReferenceStart)), + ';' => Ok(Some(Token::ReferenceEnd)), + '"' => self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Double), Token::DoubleQuote), + '\'' => self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Single), Token::SingleQuote), + _ => Ok(Some(Token::Character(c))), + } + } + + fn markup_declaration_string(&mut self, c: char, q: QuoteStyle) -> Result { + match c { + '"' if q == QuoteStyle::Double => self.move_to_with(State::InsideMarkupDeclaration, Token::DoubleQuote), + '\'' if q == QuoteStyle::Single => self.move_to_with(State::InsideMarkupDeclaration, Token::SingleQuote), + _ => Ok(Some(Token::Character(c))), + } + } + /// Encountered '<!D' fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result { use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP}; @@ -476,33 +571,36 @@ impl Lexer { DOC ; 'T' ; DOCT ; "<!DOC", DOCT ; 'Y' ; DOCTY ; "<!DOCT", DOCTY ; 'P' ; DOCTYP ; "<!DOCTY"; - DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::DoctypeFinishing(1), Token::DoctypeStart) + DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_and_reset_normal(State::InsideDoctype, Token::DoctypeStart) ) } /// State used while awaiting the closing bracket for the <!DOCTYPE tag - fn doctype_finishing(&mut self, c: char, d: u8) -> Result { + fn inside_doctype(&mut self, c: char) -> Result { match c { - '<' => self.move_to(State::DoctypeFinishing(d + 1)), - '>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd), - '>' => self.move_to(State::DoctypeFinishing(d - 1)), - _ => Ok(None), + '>' => self.move_to_and_reset_normal(State::Normal, Token::TagEnd), + '<' => self.move_to(State::TagStarted), + '&' => Ok(Some(Token::ReferenceStart)), + ';' => Ok(Some(Token::ReferenceEnd)), + '"' => Ok(Some(Token::DoubleQuote)), + '\'' => Ok(Some(Token::SingleQuote)), + _ => Ok(Some(Token::Character(c))), } } /// Encountered '?' fn processing_instruction_closing(&mut self, c: char) -> Result { match c { - '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd), - _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')), + '>' => self.move_to_with(self.normal_state, Token::ProcessingInstructionEnd), + _ => self.move_to_with_unread(State::InsideProcessingInstruction, &[c], Token::Character('?')), } } /// Encountered '/' fn empty_element_closing(&mut self, c: char) -> Result { match c { - '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd), - _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')), + '>' => self.move_to_with(self.normal_state, Token::EmptyTagEnd), + _ => self.move_to_with_unread(self.normal_state, &[c], Token::Character('/')), } } @@ -511,18 +609,13 @@ impl Lexer { match s { ClosingSubstate::First => match c { '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)), - _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('-')) + _ => self.move_to_with_unread(State::InsideComment, &[c], Token::Character('-')), }, ClosingSubstate::Second => match c { - '>' => self.move_to_with(State::Normal, Token::CommentEnd), + '>' => self.move_to_with(self.normal_state, Token::CommentEnd), // double dash not followed by a greater-than is a hard error inside comment - _ if self.inside_comment => self.handle_error("--", c), - // nothing else except comment closing starts with a double dash, and comment - // closing can never be after another dash, and also we're outside of a comment, - // therefore it is safe to push only the last read character to the list of unread - // characters and pass the double dash directly to the output - _ => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--")) - } + _ => self.handle_error("--", c), + }, } } @@ -531,19 +624,33 @@ impl Lexer { match s { ClosingSubstate::First => match c { ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)), - _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']')) + _ => self.move_to_with_unread(State::InsideCdata, &[c], Token::Character(']')), }, ClosingSubstate::Second => match c { '>' => self.move_to_with(State::Normal, Token::CDataEnd), - _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']')) - } + _ => self.move_to_with_unread(State::InsideCdata, &[']', c], Token::Character(']')), + }, + } + } + + /// Encountered ']' + fn invalid_cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result { + match s { + ClosingSubstate::First => match c { + ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::Second)), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']')), + }, + ClosingSubstate::Second => match c { + '>' => self.move_to_with(self.normal_state, Token::CDataEnd), + _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']')), + }, } } } #[cfg(test)] mod tests { - use common::{Position}; + use crate::common::Position; use std::io::{BufReader, Cursor}; use super::{Lexer, Token}; @@ -563,13 +670,12 @@ mod tests { let err = err.unwrap_err(); assert_eq!($r as u64, err.position().row); assert_eq!($c as u64, err.position().column); - assert_eq!($s, err.msg()); }) ); macro_rules! assert_none( (for $lex:ident and $buf:ident) => ( - assert_eq!(Ok(None), $lex.next_token(&mut $buf)); + assert_eq!(Ok(None), $lex.next_token(&mut $buf)) ) ); @@ -578,6 +684,47 @@ mod tests { } #[test] + fn tricky_pi() { + let (mut lex, mut buf) = make_lex_and_buf(r#"<?x<!-- &??><x>"#); + + assert_oks!(for lex and buf ; + Token::ProcessingInstructionStart + Token::Character('x') + Token::OpeningTagStart // processing of <?xml?> relies on the extra tokens + Token::Character('!') + Token::Character('-') + Token::Character('-') + Token::Character(' ') + Token::ReferenceStart + Token::Character('?') + Token::ProcessingInstructionEnd + Token::OpeningTagStart + Token::Character('x') + Token::TagEnd + ); + assert_none!(for lex and buf); + } + + #[test] + fn reparser() { + let (mut lex, mut buf) = make_lex_and_buf(r#"&a;"#); + + assert_oks!(for lex and buf ; + Token::ReferenceStart + Token::Character('a') + Token::ReferenceEnd + ); + lex.reparse("<hi/>").unwrap(); + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('h') + Token::Character('i') + Token::EmptyTagEnd + ); + assert_none!(for lex and buf); + } + + #[test] fn simple_lexer_test() { let (mut lex, mut buf) = make_lex_and_buf( r#"<a p='q'> x<b z="y">d </b></a><p/> <?nm ?> <!-- a c --> "# @@ -586,18 +733,18 @@ mod tests { assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') - Token::Whitespace(' ') + Token::Character(' ') Token::Character('p') Token::EqualsSign Token::SingleQuote Token::Character('q') Token::SingleQuote Token::TagEnd - Token::Whitespace(' ') + Token::Character(' ') Token::Character('x') Token::OpeningTagStart Token::Character('b') - Token::Whitespace(' ') + Token::Character(' ') Token::Character('z') Token::EqualsSign Token::DoubleQuote @@ -605,7 +752,7 @@ mod tests { Token::DoubleQuote Token::TagEnd Token::Character('d') - Token::Whitespace('\t') + Token::Character('\t') Token::ClosingTagStart Token::Character('b') Token::TagEnd @@ -615,21 +762,21 @@ mod tests { Token::OpeningTagStart Token::Character('p') Token::EmptyTagEnd - Token::Whitespace(' ') + Token::Character(' ') Token::ProcessingInstructionStart Token::Character('n') Token::Character('m') - Token::Whitespace(' ') + Token::Character(' ') Token::ProcessingInstructionEnd - Token::Whitespace(' ') + Token::Character(' ') Token::CommentStart - Token::Whitespace(' ') + Token::Character(' ') Token::Character('a') - Token::Whitespace(' ') + Token::Character(' ') Token::Character('c') - Token::Whitespace(' ') + Token::Character(' ') Token::CommentEnd - Token::Whitespace(' ') + Token::Character(' ') Token::ReferenceStart Token::Character('n') Token::Character('b') @@ -651,16 +798,17 @@ mod tests { Token::Character('x') Token::Character('!') Token::Character('+') - Token::Whitespace(' ') + Token::Character(' ') Token::Character('/') Token::Character('/') - Token::Whitespace(' ') + Token::Character(' ') Token::Character('-') Token::Character('|') - Token::Whitespace(' ') + Token::Character(' ') Token::Character(']') Token::Character('z') - Token::Chunk("]]") + Token::Character(']') + Token::Character(']') ); assert_none!(for lex and buf); } @@ -677,12 +825,12 @@ mod tests { Token::TagEnd Token::CDataStart Token::Character('x') - Token::Whitespace(' ') + Token::Character(' ') Token::Character('y') - Token::Whitespace(' ') + Token::Character(' ') Token::Character('?') Token::CDataEnd - Token::Whitespace(' ') + Token::Character(' ') Token::ClosingTagStart Token::Character('a') Token::TagEnd @@ -691,6 +839,33 @@ mod tests { } #[test] + fn cdata_closers_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#"<![CDATA[] > ]> ]]><!---->]]<a>"# + ); + + assert_oks!(for lex and buf ; + Token::CDataStart + Token::Character(']') + Token::Character(' ') + Token::Character('>') + Token::Character(' ') + Token::Character(']') + Token::Character('>') + Token::Character(' ') + Token::CDataEnd + Token::CommentStart + Token::CommentEnd + Token::Character(']') + Token::Character(']') + Token::OpeningTagStart + Token::Character('a') + Token::TagEnd + ); + assert_none!(for lex and buf); + } + + #[test] fn doctype_test() { let (mut lex, mut buf) = make_lex_and_buf( r#"<a><!DOCTYPE ab xx z> "# @@ -700,26 +875,135 @@ mod tests { Token::Character('a') Token::TagEnd Token::DoctypeStart + Token::Character(' ') + Token::Character('a') + Token::Character('b') + Token::Character(' ') + Token::Character('x') + Token::Character('x') + Token::Character(' ') + Token::Character('z') Token::TagEnd - Token::Whitespace(' ') + Token::Character(' ') ); - assert_none!(for lex and buf) + assert_none!(for lex and buf); + } + + #[test] + fn tricky_comments() { + let (mut lex, mut buf) = make_lex_and_buf( + r#"<a><!-- C ->--></a>"# + ); + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('a') + Token::TagEnd + Token::CommentStart + Token::Character(' ') + Token::Character('C') + Token::Character(' ') + Token::Character('-') + Token::Character('>') + Token::CommentEnd + Token::ClosingTagStart + Token::Character('a') + Token::TagEnd + ); + assert_none!(for lex and buf); } #[test] fn doctype_with_internal_subset_test() { let (mut lex, mut buf) = make_lex_and_buf( - r#"<a><!DOCTYPE ab[<!ELEMENT ba> ]> "# + r#"<a><!DOCTYPE ab[<!ELEMENT ba ">>>"> ]> "# ); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') Token::TagEnd Token::DoctypeStart + Token::Character(' ') + Token::Character('a') + Token::Character('b') + Token::Character('[') + Token::MarkupDeclarationStart + Token::Character('E') + Token::Character('L') + Token::Character('E') + Token::Character('M') + Token::Character('E') + Token::Character('N') + Token::Character('T') + Token::Character(' ') + Token::Character('b') + Token::Character('a') + Token::Character(' ') + Token::DoubleQuote + Token::Character('>') + Token::Character('>') + Token::Character('>') + Token::DoubleQuote + Token::TagEnd + Token::Character(' ') + Token::Character(']') + Token::TagEnd + Token::Character(' ') + ); + assert_none!(for lex and buf); + } + + #[test] + fn doctype_internal_pi_comment() { + let (mut lex, mut buf) = make_lex_and_buf( + "<!DOCTYPE a [\n<!ELEMENT l ANY> <!-- <?non?>--> <?pi > ?> \n]>" + ); + assert_oks!(for lex and buf ; + Token::DoctypeStart + Token::Character(' ') + Token::Character('a') + Token::Character(' ') + Token::Character('[') + Token::Character('\n') + Token::MarkupDeclarationStart + Token::Character('E') + Token::Character('L') + Token::Character('E') + Token::Character('M') + Token::Character('E') + Token::Character('N') + Token::Character('T') + Token::Character(' ') + Token::Character('l') + Token::Character(' ') + Token::Character('A') + Token::Character('N') + Token::Character('Y') Token::TagEnd - Token::Whitespace(' ') + Token::Character(' ') + Token::CommentStart + Token::Character(' ') + Token::Character('<') + Token::Character('?') + Token::Character('n') + Token::Character('o') + Token::Character('n') + Token::Character('?') + Token::Character('>') + Token::CommentEnd + Token::Character(' ') + Token::ProcessingInstructionStart + Token::Character('p') + Token::Character('i') + Token::Character(' ') + Token::TagEnd // not really + Token::Character(' ') + Token::ProcessingInstructionEnd + Token::Character(' ') + Token::Character('\n') + Token::Character(']') + Token::TagEnd // DTD ); - assert_none!(for lex and buf) + assert_none!(for lex and buf); } #[test] @@ -735,7 +1019,8 @@ mod tests { eof_check!("/" ; Token::Character('/')); eof_check!("-" ; Token::Character('-')); eof_check!("]" ; Token::Character(']')); - eof_check!("]]" ; Token::Chunk("]]")); + eof_check!("]" ; Token::Character(']')); + eof_check!("]" ; Token::Character(']')); } #[test] @@ -756,7 +1041,6 @@ mod tests { eof_check!("<![CDA" ; 0, 6); eof_check!("<![CDAT" ; 0, 7); eof_check!("<![CDATA" ; 0, 8); - eof_check!("--" ; 0, 2); } #[test] @@ -769,7 +1053,8 @@ mod tests { let (mut lex, mut buf) = make_lex_and_buf("<!x"); lex.disable_errors(); assert_oks!(for lex and buf ; - Token::Chunk("<!") + Token::Character('<') + Token::Character('!') Token::Character('x') ); assert_none!(for lex and buf); @@ -785,8 +1070,10 @@ mod tests { let (mut lex, mut buf) = make_lex_and_buf("<!-\t"); lex.disable_errors(); assert_oks!(for lex and buf ; - Token::Chunk("<!-") - Token::Whitespace('\t') + Token::Character('<') + Token::Character('!') + Token::Character('-') + Token::Character('\t') ); assert_none!(for lex and buf); } @@ -794,14 +1081,15 @@ mod tests { #[test] fn error_in_comment_two_dashes_not_at_end() { let (mut lex, mut buf) = make_lex_and_buf("--x"); - lex.inside_comment(); + lex.st = super::State::InsideComment; assert_err!(for lex and buf expect row 0; 0, "Unexpected token '--' before 'x'" ); let (mut lex, mut buf) = make_lex_and_buf("--x"); assert_oks!(for lex and buf ; - Token::Chunk("--") + Token::Character('-') + Token::Character('-') Token::Character('x') ); } @@ -813,8 +1101,10 @@ mod tests { let (mut lex, mut buf) = make_lex_and_buf($data); lex.disable_errors(); + for c in $chunk.chars() { + assert_eq!(Ok(Some(Token::Character(c))), lex.next_token(&mut buf)); + } assert_oks!(for lex and buf ; - Token::Chunk($chunk) Token::Character($app) ); assert_none!(for lex and buf); @@ -822,6 +1112,12 @@ mod tests { ); #[test] + fn token_size() { + assert_eq!(4, std::mem::size_of::<Token>()); + assert_eq!(2, std::mem::size_of::<super::State>()); + } + + #[test] fn error_in_cdata_started() { check_case!("<![", '['; "<![[" ; 0, 0, "Unexpected token '<![' before '['"); check_case!("<![C", '['; "<![C[" ; 0, 0, "Unexpected token '<![C' before '['"); @@ -854,7 +1150,7 @@ mod tests { Token::Character('F') Token::Character('o') Token::Character('o') - Token::Whitespace(' ') + Token::Character(' ') Token::Character('[') Token::Character('B') Token::Character('a') diff --git a/src/reader/parser/mod.rs b/src/reader/parser.rs index 58ca3a6..dcdec89 100644 --- a/src/reader/parser/mod.rs +++ b/src/reader/parser.rs @@ -1,29 +1,32 @@ //! Contains an implementation of pull-based XML parser. -use std::mem; -use std::borrow::Cow; + +use crate::common::is_xml11_char; +use crate::common::is_xml10_char; +use crate::common::is_xml11_char_not_restricted; +use crate::reader::error::SyntaxError; +use std::collections::HashMap; use std::io::prelude::*; -use common::{ - self, - XmlVersion, Position, TextPosition, - is_name_start_char, is_name_char, -}; -use name::OwnedName; -use attribute::OwnedAttribute; -use namespace::NamespaceStack; +use crate::attribute::OwnedAttribute; +use crate::common::{self, is_name_char, is_name_start_char, Position, TextPosition, XmlVersion, is_whitespace_char}; +use crate::name::OwnedName; +use crate::namespace::NamespaceStack; + +use crate::reader::config::ParserConfig2; +use crate::reader::events::XmlEvent; +use crate::reader::lexer::{Lexer, Token}; -use reader::events::XmlEvent; -use reader::config::ParserConfig; -use reader::lexer::{Lexer, Token}; +use super::{Error, ErrorKind}; macro_rules! gen_takes( ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => ( $( impl MarkupData { #[inline] + #[allow(clippy::mem_replace_option_with_none)] fn $method(&mut self) -> $t { - mem::replace(&mut self.$field, $def) + std::mem::replace(&mut self.$field, $def) } } )+ @@ -34,9 +37,7 @@ gen_takes!( name -> take_name, String, String::new(); ref_data -> take_ref_data, String, String::new(); - version -> take_version, Option<common::XmlVersion>, None; encoding -> take_encoding, Option<String>, None; - standalone -> take_standalone, Option<bool>, None; element_name -> take_element_name, Option<OwnedName>, None; @@ -44,34 +45,33 @@ gen_takes!( attributes -> take_attributes, Vec<OwnedAttribute>, vec!() ); -macro_rules! self_error( - ($this:ident; $msg:expr) => ($this.error($msg)); - ($this:ident; $fmt:expr, $($arg:expr),+) => ($this.error(format!($fmt, $($arg),+))) -); - -mod outside_tag; -mod inside_processing_instruction; +mod inside_cdata; +mod inside_closing_tag_name; +mod inside_comment; mod inside_declaration; mod inside_doctype; mod inside_opening_tag; -mod inside_closing_tag_name; -mod inside_comment; -mod inside_cdata; +mod inside_processing_instruction; mod inside_reference; +mod outside_tag; -static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10; -static DEFAULT_ENCODING: &'static str = "UTF-8"; +static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10; static DEFAULT_STANDALONE: Option<bool> = None; type ElementStack = Vec<OwnedName>; pub type Result = super::Result<XmlEvent>; /// Pull-based XML parser. -pub struct PullParser { - config: ParserConfig, +pub(crate) struct PullParser { + config: ParserConfig2, lexer: Lexer, st: State, + state_after_reference: State, buf: String, + + /// From DTD internal subset + entities: HashMap<String, String>, + nst: NamespaceStack, data: MarkupData, @@ -80,21 +80,48 @@ pub struct PullParser { est: ElementStack, pos: Vec<TextPosition>, - encountered_element: bool, - parsed_declaration: bool, + encountered: Encountered, inside_whitespace: bool, read_prefix_separator: bool, - pop_namespace: bool + pop_namespace: bool, +} + +// Keeps track when XML declaration can happen +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] +enum Encountered { + None = 0, + AnyChars, // whitespace before <?xml is not allowed + Declaration, + Comment, + Doctype, + Element, } impl PullParser { /// Returns a new parser using the given config. - pub fn new(config: ParserConfig) -> PullParser { + #[inline] + pub fn new(config: impl Into<ParserConfig2>) -> PullParser { + let config = config.into(); + Self::new_with_config2(config) + } + + #[inline] + fn new_with_config2(config: ParserConfig2) -> PullParser { + let mut lexer = Lexer::new(); + if let Some(enc) = config.override_encoding { + lexer.set_encoding(enc); + } + + let mut pos = Vec::with_capacity(16); + pos.push(TextPosition::new()); + PullParser { - config: config, - lexer: Lexer::new(), - st: State::OutsideTag, + config, + lexer, + st: State::DocumentStart, + state_after_reference: State::OutsideTag, buf: String::new(), + entities: HashMap::new(), nst: NamespaceStack::default(), data: MarkupData { @@ -106,23 +133,44 @@ impl PullParser { element_name: None, quote: None, attr_name: None, - attributes: Vec::new() + attributes: Vec::new(), }, final_result: None, next_event: None, est: Vec::new(), - pos: vec![TextPosition::new()], + pos, - encountered_element: false, - parsed_declaration: false, + encountered: Encountered::None, inside_whitespace: true, read_prefix_separator: false, - pop_namespace: false + pop_namespace: false, } } /// Checks if this parser ignores the end of stream errors. - pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.ignore_end_of_stream } + pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.c.ignore_end_of_stream } + + #[inline(never)] + fn set_encountered(&mut self, new_encounter: Encountered) -> Option<Result> { + if new_encounter <= self.encountered { + return None; + } + let prev_enc = self.encountered; + self.encountered = new_encounter; + + // If declaration was not parsed and we have encountered an element, + // emit this declaration as the next event. + if prev_enc == Encountered::None { + self.push_pos(); + Some(Ok(XmlEvent::StartDocument { + version: DEFAULT_VERSION, + encoding: self.lexer.encoding().to_string(), + standalone: DEFAULT_STANDALONE, + })) + } else { + None + } + } } impl Position for PullParser { @@ -133,7 +181,7 @@ impl Position for PullParser { } } -#[derive(Clone, PartialEq)] +#[derive(Copy, Clone, PartialEq)] pub enum State { OutsideTag, InsideOpeningTag(OpeningTagSubstate), @@ -142,11 +190,33 @@ pub enum State { InsideComment, InsideCData, InsideDeclaration(DeclarationSubstate), - InsideDoctype, - InsideReference(Box<State>) + InsideDoctype(DoctypeSubstate), + InsideReference, + DocumentStart, } -#[derive(Clone, PartialEq)] +#[derive(Copy, Clone, PartialEq)] +pub enum DoctypeSubstate { + Outside, + String, + InsideName, + BeforeEntityName, + EntityName, + BeforeEntityValue, + EntityValue, + NumericReferenceStart, + NumericReference, + /// expansion + PEReferenceInValue, + PEReferenceInDtd, + /// name definition + PEReferenceDefinitionStart, + PEReferenceDefinition, + SkipDeclaration, + Comment, +} + +#[derive(Copy, Clone, PartialEq)] pub enum OpeningTagSubstate { InsideName, @@ -156,21 +226,22 @@ pub enum OpeningTagSubstate { AfterAttributeName, InsideAttributeValue, + AfterAttributeValue, } -#[derive(Clone, PartialEq)] +#[derive(Copy, Clone, PartialEq)] pub enum ClosingTagSubstate { CTInsideName, - CTAfterName + CTAfterName, } -#[derive(Clone, PartialEq)] +#[derive(Copy, Clone, PartialEq)] pub enum ProcessingInstructionSubstate { PIInsideName, - PIInsideData + PIInsideData, } -#[derive(Clone, PartialEq)] +#[derive(Copy, Clone, PartialEq)] pub enum DeclarationSubstate { BeforeVersion, InsideVersion, @@ -179,30 +250,32 @@ pub enum DeclarationSubstate { InsideVersionValue, AfterVersionValue, + BeforeEncoding, InsideEncoding, AfterEncoding, InsideEncodingValue, + AfterEncodingValue, BeforeStandaloneDecl, InsideStandaloneDecl, AfterStandaloneDecl, InsideStandaloneDeclValue, - AfterStandaloneDeclValue + AfterStandaloneDeclValue, } #[derive(PartialEq)] enum QualifiedNameTarget { AttributeNameTarget, OpeningTagNameTarget, - ClosingTagNameTarget + ClosingTagNameTarget, } #[derive(Copy, Clone, PartialEq, Eq)] enum QuoteToken { SingleQuoteToken, - DoubleQuoteToken + DoubleQuoteToken, } impl QuoteToken { @@ -210,14 +283,14 @@ impl QuoteToken { match *t { Token::SingleQuote => QuoteToken::SingleQuoteToken, Token::DoubleQuote => QuoteToken::DoubleQuoteToken, - _ => panic!("Unexpected token: {}", t) + _ => panic!("Unexpected token: {t}"), } } fn as_token(self) -> Token { match self { QuoteToken::SingleQuoteToken => Token::SingleQuote, - QuoteToken::DoubleQuoteToken => Token::DoubleQuote + QuoteToken::DoubleQuoteToken => Token::DoubleQuote, } } } @@ -257,97 +330,114 @@ impl PullParser { } loop { + debug_assert!(self.next_event.is_none()); + debug_assert!(!self.pop_namespace); + // While lexer gives us Ok(maybe_token) -- we loop. // Upon having a complete XML-event -- we return from the whole function. match self.lexer.next_token(r) { - Ok(maybe_token) => - match maybe_token { - None => break, - Some(token) => - match self.dispatch_token(token) { - None => {} // continue - Some(Ok(XmlEvent::EndDocument)) => - return { - self.next_pos(); - self.set_final_result(Ok(XmlEvent::EndDocument)) - }, - Some(Ok(xml_event)) => - return { - self.next_pos(); - Ok(xml_event) - }, - Some(Err(xml_error)) => - return { - self.next_pos(); - self.set_final_result(Err(xml_error)) - }, - } - }, - Err(lexer_error) => - return self.set_final_result(Err(lexer_error)), + Ok(Some(token)) => { + match self.dispatch_token(token) { + None => {} // continue + Some(Ok(xml_event)) => { + self.next_pos(); + return Ok(xml_event) + }, + Some(Err(xml_error)) => { + self.next_pos(); + return self.set_final_result(Err(xml_error)) + }, + } + }, + Ok(None) => break, + Err(lexer_error) => { + return self.set_final_result(Err(lexer_error)) + }, } } - // Handle end of stream + self.handle_eof() + } + + /// Handle end of stream + fn handle_eof(&mut self) -> std::result::Result<XmlEvent, super::Error> { // Forward pos to the lexer head self.next_pos(); let ev = if self.depth() == 0 { - if self.encountered_element && self.st == State::OutsideTag { // all is ok + if self.encountered == Encountered::Element && self.st == State::OutsideTag { // all is ok Ok(XmlEvent::EndDocument) - } else if !self.encountered_element { - self_error!(self; "Unexpected end of stream: no root element found") + } else if self.encountered < Encountered::Element { + self.error(SyntaxError::NoRootElement) } else { // self.st != State::OutsideTag - self_error!(self; "Unexpected end of stream") // TODO: add expected hint? + self.error(SyntaxError::UnexpectedEof) // TODO: add expected hint? } + } else if self.config.c.ignore_end_of_stream { + self.final_result = None; + self.lexer.reset_eof_handled(); + return self.error(SyntaxError::UnbalancedRootElement); } else { - if self.config.ignore_end_of_stream { - self.final_result = None; - self.lexer.reset_eof_handled(); - return self_error!(self; "Unexpected end of stream: still inside the root element"); - } else { - self_error!(self; "Unexpected end of stream: still inside the root element") - } + self.error(SyntaxError::UnbalancedRootElement) }; self.set_final_result(ev) } // This function is to be called when a terminal event is reached. // The function sets up the `self.final_result` into `Some(result)` and return `result`. + #[inline] fn set_final_result(&mut self, result: Result) -> Result { self.final_result = Some(result.clone()); result } - #[inline] - fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Result { - Err((&self.lexer, msg).into()) + #[cold] + fn error(&self, e: SyntaxError) -> Result { + Err(Error { + pos: self.lexer.position(), + kind: ErrorKind::Syntax(e.to_cow()), + }) } #[inline] fn next_pos(&mut self) { - if self.pos.len() > 1 { - self.pos.remove(0); - } else { - self.pos[0] = self.lexer.position(); + // unfortunately calls to next_pos will never be perfectly balanced with push_pos, + // at very least because parse errors and EOF can happen unexpectedly without a prior push. + if self.pos.len() > 0 { + if self.pos.len() > 1 { + self.pos.remove(0); + } else { + self.pos[0] = self.lexer.position(); + } } } #[inline] + #[track_caller] fn push_pos(&mut self) { - self.pos.push(self.lexer.position()); + debug_assert!(self.pos.len() != self.pos.capacity(), "You've found a bug in xml-rs, caused by calls to push_pos() in states that don't end up emitting events. + This case is ignored in release mode, and merely causes document positions to be out of sync. + Please file a bug and include the XML document that triggers this assert."); + + // it has capacity preallocated for more than it ever needs, so this reduces code size + if self.pos.len() != self.pos.capacity() { + self.pos.push(self.lexer.position()); + } else if self.pos.len() > 1 { + self.pos.remove(0); // this mitigates the excessive push_pos() call + } } + #[inline(never)] fn dispatch_token(&mut self, t: Token) -> Option<Result> { - match self.st.clone() { + match self.st { State::OutsideTag => self.outside_tag(t), - State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s), - State::InsideDeclaration(s) => self.inside_declaration(t, s), - State::InsideDoctype => self.inside_doctype(t), State::InsideOpeningTag(s) => self.inside_opening_tag(t, s), State::InsideClosingTag(s) => self.inside_closing_tag_name(t, s), + State::InsideReference => self.inside_reference(t), State::InsideComment => self.inside_comment(t), State::InsideCData => self.inside_cdata(t), - State::InsideReference(s) => self.inside_reference(t, *s) + State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s), + State::InsideDoctype(s) => self.inside_doctype(t, s), + State::InsideDeclaration(s) => self.inside_declaration(t, s), + State::DocumentStart => self.document_start(t), } } @@ -358,18 +448,12 @@ impl PullParser { #[inline] fn buf_has_data(&self) -> bool { - self.buf.len() > 0 + !self.buf.is_empty() } #[inline] fn take_buf(&mut self) -> String { - mem::replace(&mut self.buf, String::new()) - } - - #[inline] - fn append_char_continue(&mut self, c: char) -> Option<Result> { - self.buf.push(c); - None + std::mem::take(&mut self.buf) } #[inline] @@ -402,11 +486,11 @@ impl PullParser { self.read_prefix_separator = false; } - let invoke_callback = |this: &mut PullParser, t| { + let invoke_callback = move |this: &mut PullParser, t| { let name = this.take_buf(); match name.parse() { Ok(name) => on_name(this, t, name), - Err(_) => Some(self_error!(this; "Qualified name is invalid: {}", name)) + Err(_) => Some(this.error(SyntaxError::InvalidQualifiedName(name.into()))) } }; @@ -418,9 +502,11 @@ impl PullParser { None } - Token::Character(c) if c != ':' && (!self.buf_has_data() && is_name_start_char(c) || - self.buf_has_data() && is_name_char(c)) => - self.append_char_continue(c), + Token::Character(c) if c != ':' && (self.buf.is_empty() && is_name_start_char(c) || + self.buf_has_data() && is_name_char(c)) => { + self.buf.push(c); + None + }, Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t), @@ -429,9 +515,9 @@ impl PullParser { Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget || target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t), - Token::Whitespace(_) => invoke_callback(self, t), + Token::Character(c) if is_whitespace_char(c) => invoke_callback(self, t), - _ => Some(self_error!(self; "Unexpected token inside qualified name: {}", t)) + _ => Some(self.error(SyntaxError::UnexpectedQualifiedName(t))) } } @@ -443,7 +529,7 @@ impl PullParser { fn read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result> where F: Fn(&mut PullParser, String) -> Option<Result> { match t { - Token::Whitespace(_) if self.data.quote.is_none() => None, // skip leading whitespace + Token::Character(c) if self.data.quote.is_none() && is_whitespace_char(c) => None, // skip leading whitespace Token::DoubleQuote | Token::SingleQuote => match self.data.quote { None => { // Entered attribute value @@ -456,45 +542,56 @@ impl PullParser { on_value(self, value) } _ => { + if let Token::Character(c) = t { + if !self.is_valid_xml_char_not_restricted(c) { + return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))); + } + } t.push_to_string(&mut self.buf); None } }, - Token::ReferenceStart => { - let st = Box::new(self.st.clone()); - self.into_state_continue(State::InsideReference(st)) - } + Token::ReferenceStart if self.data.quote.is_some() => { + self.state_after_reference = self.st; + self.into_state_continue(State::InsideReference) + }, Token::OpeningTagStart => - Some(self_error!(self; "Unexpected token inside attribute value: <")), + Some(self.error(SyntaxError::UnexpectedOpeningTag)), + + Token::Character(c) if !self.is_valid_xml_char_not_restricted(c) => { + Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) + }, // Every character except " and ' and < is okay - _ => { + _ if self.data.quote.is_some() => { t.push_to_string(&mut self.buf); None } + + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), } } fn emit_start_element(&mut self, emit_end_element: bool) -> Option<Result> { - let mut name = self.data.take_element_name().unwrap(); + let mut name = self.data.take_element_name()?; let mut attributes = self.data.take_attributes(); // check whether the name prefix is bound and fix its namespace match self.nst.get(name.borrow().prefix_repr()) { Some("") => name.namespace = None, // default namespace Some(ns) => name.namespace = Some(ns.into()), - None => return Some(self_error!(self; "Element {} prefix is unbound", name)) + None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into()))) } // check and fix accumulated attributes prefixes - for attr in attributes.iter_mut() { + for attr in &mut attributes { if let Some(ref pfx) = attr.name.prefix { let new_ns = match self.nst.get(pfx) { - Some("") => None, // default namespace + Some("") => None, // default namespace Some(ns) => Some(ns.into()), - None => return Some(self_error!(self; "Attribute {} prefix is unbound", attr.name)) + None => return Some(self.error(SyntaxError::UnboundAttribute(attr.name.to_string().into()))) }; attr.name.namespace = new_ns; } @@ -510,44 +607,60 @@ impl PullParser { } let namespace = self.nst.squash(); self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement { - name: name, - attributes: attributes, - namespace: namespace + name, + attributes, + namespace })) } fn emit_end_element(&mut self) -> Option<Result> { - let mut name = self.data.take_element_name().unwrap(); + let mut name = self.data.take_element_name()?; // check whether the name prefix is bound and fix its namespace match self.nst.get(name.borrow().prefix_repr()) { - Some("") => name.namespace = None, // default namespace + Some("") => name.namespace = None, // default namespace Some(ns) => name.namespace = Some(ns.into()), - None => return Some(self_error!(self; "Element {} prefix is unbound", name)) + None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into()))) } - let op_name = self.est.pop().unwrap(); + let op_name = self.est.pop()?; if name == op_name { self.pop_namespace = true; - self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name: name })) + self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name })) } else { - Some(self_error!(self; "Unexpected closing tag: {}, expected {}", name, op_name)) + Some(self.error(SyntaxError::UnexpectedClosingTag(format!("{name} != {op_name}").into()))) } } + #[inline] + fn is_valid_xml_char(&self, c: char) -> bool { + if Some(XmlVersion::Version11) == self.data.version { + is_xml11_char(c) + } else { + is_xml10_char(c) + } + } + + #[inline] + fn is_valid_xml_char_not_restricted(&self, c: char) -> bool { + if Some(XmlVersion::Version11) == self.data.version { + is_xml11_char_not_restricted(c) + } else { + is_xml10_char(c) + } + } } #[cfg(test)] mod tests { use std::io::BufReader; - - use common::{Position, TextPosition}; - use name::OwnedName; - use attribute::OwnedAttribute; - use reader::parser::PullParser; - use reader::ParserConfig; - use reader::events::XmlEvent; + use crate::attribute::OwnedAttribute; + use crate::common::TextPosition; + use crate::name::OwnedName; + use crate::reader::events::XmlEvent; + use crate::reader::parser::PullParser; + use crate::reader::ParserConfig; fn new_parser() -> PullParser { PullParser::new(ParserConfig::new()) @@ -557,13 +670,13 @@ mod tests { ($r:expr, $p:expr, $t:pat) => ( match $p.next(&mut $r) { $t => {} - e => panic!("Unexpected event: {:?}", e) + e => panic!("Unexpected event: {e:?}\nExpected: {}", stringify!($t)) } ); ($r:expr, $p:expr, $t:pat => $c:expr ) => ( match $p.next(&mut $r) { $t if $c => {} - e => panic!("Unexpected event: {:?}", e) + e => panic!("Unexpected event: {e:?}\nExpected: {} if {}", stringify!($t), stringify!($c)) } ) ); @@ -608,15 +721,76 @@ mod tests { } #[test] + fn issue_220_comment() { + let (mut r, mut p) = test_data!(r#"<x><!-- <!--></x>"#); + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); + expect_event!(r, p, Ok(XmlEvent::EndElement { .. })); + expect_event!(r, p, Ok(XmlEvent::EndDocument)); + + let (mut r, mut p) = test_data!(r#"<x><!-- <!---></x>"#); + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); + expect_event!(r, p, Err(_)); // ---> is forbidden in comments + + let (mut r, mut p) = test_data!(r#"<x><!--<text&x;> <!--></x>"#); + p.config.c.ignore_comments = false; + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); + expect_event!(r, p, Ok(XmlEvent::Comment(s)) => s == "<text&x;> <!"); + expect_event!(r, p, Ok(XmlEvent::EndElement { .. })); + expect_event!(r, p, Ok(XmlEvent::EndDocument)); + } + + #[test] + fn malformed_declaration_attrs() { + let (mut r, mut p) = test_data!(r#"<?xml version x="1.0"?>"#); + expect_event!(r, p, Err(_)); + + let (mut r, mut p) = test_data!(r#"<?xml version="1.0" version="1.0"?>"#); + expect_event!(r, p, Err(_)); + + let (mut r, mut p) = test_data!(r#"<?xml version="1.0"encoding="utf-8"?>"#); + expect_event!(r, p, Err(_)); + + let (mut r, mut p) = test_data!(r#"<?xml version="1.0"standalone="yes"?>"#); + expect_event!(r, p, Err(_)); + + let (mut r, mut p) = test_data!(r#"<?xml version="1.0" encoding="utf-8"standalone="yes"?>"#); + expect_event!(r, p, Err(_)); + } + + #[test] fn opening_tag_in_attribute_value() { + use crate::reader::error::{SyntaxError, Error, ErrorKind}; + let (mut r, mut p) = test_data!(r#" <a attr="zzz<zzz" /> "#); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Err(ref e) => - e.msg() == "Unexpected token inside attribute value: <" && - e.position() == TextPosition { row: 1, column: 24 } + *e == Error { + kind: ErrorKind::Syntax(SyntaxError::UnexpectedOpeningTag.to_cow()), + pos: TextPosition { row: 1, column: 24 } + } ); } + + #[test] + fn reference_err() { + let (mut r, mut p) = test_data!(r#" + <a>&&</a> + "#); + + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); + expect_event!(r, p, Err(_)); + } + + #[test] + fn state_size() { + assert_eq!(2, std::mem::size_of::<super::State>()); + assert_eq!(1, std::mem::size_of::<super::DoctypeSubstate>()); + } } diff --git a/src/reader/parser/inside_cdata.rs b/src/reader/parser/inside_cdata.rs index 3269fb4..4f46f06 100644 --- a/src/reader/parser/inside_cdata.rs +++ b/src/reader/parser/inside_cdata.rs @@ -1,14 +1,14 @@ -use reader::events::XmlEvent; -use reader::lexer::Token; +use crate::reader::error::SyntaxError; +use crate::reader::lexer::Token; +use crate::{common::is_whitespace_char, reader::events::XmlEvent}; -use super::{Result, PullParser, State}; +use super::{PullParser, Result, State}; impl PullParser { pub fn inside_cdata(&mut self, t: Token) -> Option<Result> { match t { Token::CDataEnd => { - self.lexer.enable_errors(); - let event = if self.config.cdata_to_characters { + let event = if self.config.c.cdata_to_characters { None } else { let data = self.take_buf(); @@ -17,16 +17,18 @@ impl PullParser { self.into_state(State::OutsideTag, event) } - Token::Whitespace(_) => { - t.push_to_string(&mut self.buf); + Token::Character(c) if !self.is_valid_xml_char(c) => { + Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) + }, + Token::Character(c) => { + if !is_whitespace_char(c) { + self.inside_whitespace = false; + } + self.buf.push(c); None } - _ => { - self.inside_whitespace = false; - t.push_to_string(&mut self.buf); - None - } + _ => unreachable!(), } } } diff --git a/src/reader/parser/inside_closing_tag_name.rs b/src/reader/parser/inside_closing_tag_name.rs index 1d8074a..6d86808 100644 --- a/src/reader/parser/inside_closing_tag_name.rs +++ b/src/reader/parser/inside_closing_tag_name.rs @@ -1,8 +1,7 @@ -use namespace; - -use reader::lexer::Token; - -use super::{Result, PullParser, State, QualifiedNameTarget, ClosingTagSubstate}; +use crate::reader::error::SyntaxError; +use crate::{common::is_whitespace_char, namespace}; +use crate::reader::lexer::Token; +use super::{ClosingTagSubstate, PullParser, QualifiedNameTarget, Result, State}; impl PullParser { pub fn inside_closing_tag_name(&mut self, t: Token, s: ClosingTagSubstate) -> Option<Result> { @@ -11,24 +10,22 @@ impl PullParser { match name.prefix_ref() { Some(prefix) if prefix == namespace::NS_XML_PREFIX || prefix == namespace::NS_XMLNS_PREFIX => - // TODO: {:?} is bad, need something better - Some(self_error!(this; "'{:?}' cannot be an element name prefix", name.prefix)), + Some(this.error(SyntaxError::InvalidNamePrefix(prefix.into()))), _ => { this.data.element_name = Some(name.clone()); match token { - Token::Whitespace(_) => this.into_state_continue(State::InsideClosingTag(ClosingTagSubstate::CTAfterName)), Token::TagEnd => this.emit_end_element(), - _ => Some(self_error!(this; "Unexpected token inside closing tag: {}", token)) + Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideClosingTag(ClosingTagSubstate::CTAfterName)), + _ => Some(this.error(SyntaxError::UnexpectedTokenInClosingTag(token))) } } } }), ClosingTagSubstate::CTAfterName => match t { - Token::Whitespace(_) => None, // Skip whitespace Token::TagEnd => self.emit_end_element(), - _ => Some(self_error!(self; "Unexpected token inside closing tag: {}", t)) + Token::Character(c) if is_whitespace_char(c) => None, // Skip whitespace + _ => Some(self.error(SyntaxError::UnexpectedTokenInClosingTag(t))) } } } - } diff --git a/src/reader/parser/inside_comment.rs b/src/reader/parser/inside_comment.rs index fc98320..e4132c5 100644 --- a/src/reader/parser/inside_comment.rs +++ b/src/reader/parser/inside_comment.rs @@ -1,26 +1,26 @@ -use reader::events::XmlEvent; -use reader::lexer::Token; +use crate::reader::error::SyntaxError; +use crate::reader::events::XmlEvent; +use crate::reader::lexer::Token; -use super::{Result, PullParser, State}; +use super::{PullParser, Result, State}; impl PullParser { pub fn inside_comment(&mut self, t: Token) -> Option<Result> { match t { - // Double dash is illegal inside a comment - Token::Chunk(ref s) if &s[..] == "--" => Some(self_error!(self; "Unexpected token inside a comment: --")), - - Token::CommentEnd if self.config.ignore_comments => { - self.lexer.outside_comment(); + Token::CommentEnd if self.config.c.ignore_comments => { self.into_state_continue(State::OutsideTag) } Token::CommentEnd => { - self.lexer.outside_comment(); let data = self.take_buf(); self.into_state_emit(State::OutsideTag, Ok(XmlEvent::Comment(data))) } - _ if self.config.ignore_comments => None, // Do not modify buffer if ignoring the comment + Token::Character(c) if !self.is_valid_xml_char(c) => { + Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) + }, + + _ if self.config.c.ignore_comments => None, // Do not modify buffer if ignoring the comment _ => { t.push_to_string(&mut self.buf); @@ -28,5 +28,4 @@ impl PullParser { } } } - } diff --git a/src/reader/parser/inside_declaration.rs b/src/reader/parser/inside_declaration.rs index af39d10..4ff1427 100644 --- a/src/reader/parser/inside_declaration.rs +++ b/src/reader/parser/inside_declaration.rs @@ -1,44 +1,62 @@ - -use common::XmlVersion; - -use reader::events::XmlEvent; -use reader::lexer::Token; +use crate::common::{is_whitespace_char, XmlVersion}; +use crate::reader::error::SyntaxError; +use crate::reader::events::XmlEvent; +use crate::reader::lexer::Token; +use crate::util::Encoding; use super::{ - Result, PullParser, State, DeclarationSubstate, QualifiedNameTarget, - DEFAULT_VERSION, DEFAULT_ENCODING + DeclarationSubstate, Encountered, PullParser, QualifiedNameTarget, Result, State, + DEFAULT_VERSION, }; impl PullParser { + #[inline(never)] + fn emit_start_document(&mut self) -> Option<Result> { + debug_assert!(self.encountered == Encountered::None); + self.encountered = Encountered::Declaration; + + let version = self.data.version; + let encoding = self.data.take_encoding(); + let standalone = self.data.standalone; + + if let Some(new_encoding) = encoding.as_deref() { + let new_encoding = match new_encoding.parse() { + Ok(e) => e, + Err(_) if self.config.ignore_invalid_encoding_declarations => Encoding::Latin1, + Err(_) => return Some(self.error(SyntaxError::UnsupportedEncoding(new_encoding.into()))), + }; + let current_encoding = self.lexer.encoding(); + if current_encoding != new_encoding { + let set = match (current_encoding, new_encoding) { + (Encoding::Unknown | Encoding::Default, new) if new != Encoding::Utf16 => new, + (Encoding::Utf16Be | Encoding::Utf16Le, Encoding::Utf16) => current_encoding, + _ if self.config.ignore_invalid_encoding_declarations => current_encoding, + _ => return Some(self.error(SyntaxError::ConflictingEncoding(new_encoding, current_encoding))), + }; + self.lexer.set_encoding(set); + } + } + + let current_encoding = self.lexer.encoding(); + self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartDocument { + version: version.unwrap_or(DEFAULT_VERSION), + encoding: encoding.unwrap_or_else(move || current_encoding.to_string()), + standalone + })) + } + // TODO: remove redundancy via macros or extra methods pub fn inside_declaration(&mut self, t: Token, s: DeclarationSubstate) -> Option<Result> { - macro_rules! unexpected_token( - ($this:expr; $t:expr) => (Some($this.error(format!("Unexpected token inside XML declaration: {}", $t)))); - ($t:expr) => (unexpected_token!(self; $t)); - ); - - #[inline] - fn emit_start_document(this: &mut PullParser) -> Option<Result> { - this.parsed_declaration = true; - let version = this.data.take_version(); - let encoding = this.data.take_encoding(); - let standalone = this.data.take_standalone(); - this.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartDocument { - version: version.unwrap_or(DEFAULT_VERSION), - encoding: encoding.unwrap_or(DEFAULT_ENCODING.into()), - standalone: standalone - })) - } match s { DeclarationSubstate::BeforeVersion => match t { - Token::Whitespace(_) => None, // continue Token::Character('v') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersion)), - _ => unexpected_token!(t) + Token::Character(c) if is_whitespace_char(c) => None, // continue + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideVersion => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { - match &name.local_name[..] { + match &*name.local_name { "ersion" if name.namespace.is_none() => this.into_state_continue(State::InsideDeclaration( if token == Token::EqualsSign { @@ -47,18 +65,18 @@ impl PullParser { DeclarationSubstate::AfterVersion } )), - _ => unexpected_token!(this; name) + _ => Some(this.error(SyntaxError::UnexpectedNameInsideXml(name.to_string().into()))), } }), DeclarationSubstate::AfterVersion => match t { - Token::Whitespace(_) => None, Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersionValue)), - _ => unexpected_token!(t) + Token::Character(c) if is_whitespace_char(c) => None, + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideVersionValue => self.read_attribute_value(t, |this, value| { - this.data.version = match &value[..] { + this.data.version = match &*value { "1.0" => Some(XmlVersion::Version10), "1.1" => Some(XmlVersion::Version11), _ => None @@ -66,48 +84,60 @@ impl PullParser { if this.data.version.is_some() { this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterVersionValue)) } else { - Some(self_error!(this; "Unexpected XML version value: {}", value)) + Some(this.error(SyntaxError::UnexpectedXmlVersion(value.into()))) } }), DeclarationSubstate::AfterVersionValue => match t { - Token::Whitespace(_) => None, // skip whitespace + Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeEncoding)), + Token::ProcessingInstructionEnd => self.emit_start_document(), + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), + }, + + DeclarationSubstate::BeforeEncoding => match t { Token::Character('e') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncoding)), Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)), - Token::ProcessingInstructionEnd => emit_start_document(self), - _ => unexpected_token!(t) + Token::ProcessingInstructionEnd => self.emit_start_document(), + Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideEncoding => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { - match &name.local_name[..] { + match &*name.local_name { "ncoding" if name.namespace.is_none() => this.into_state_continue(State::InsideDeclaration( if token == Token::EqualsSign { DeclarationSubstate::InsideEncodingValue } else { DeclarationSubstate::AfterEncoding } )), - _ => unexpected_token!(this; name) + _ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into()))) } }), DeclarationSubstate::AfterEncoding => match t { - Token::Whitespace(_) => None, Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncodingValue)), - _ => unexpected_token!(t) + Token::Character(c) if is_whitespace_char(c) => None, + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideEncodingValue => self.read_attribute_value(t, |this, value| { this.data.encoding = Some(value); - this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeStandaloneDecl)) + this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterEncodingValue)) }), + DeclarationSubstate::AfterEncodingValue => match t { + Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeStandaloneDecl)), + Token::ProcessingInstructionEnd => self.emit_start_document(), + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), + }, + DeclarationSubstate::BeforeStandaloneDecl => match t { - Token::Whitespace(_) => None, // skip whitespace Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)), - Token::ProcessingInstructionEnd => emit_start_document(self), - _ => unexpected_token!(t) + Token::ProcessingInstructionEnd => self.emit_start_document(), + Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideStandaloneDecl => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { - match &name.local_name[..] { + match &*name.local_name { "tandalone" if name.namespace.is_none() => this.into_state_continue(State::InsideDeclaration( if token == Token::EqualsSign { @@ -116,18 +146,18 @@ impl PullParser { DeclarationSubstate::AfterStandaloneDecl } )), - _ => unexpected_token!(this; name) + _ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into()))), } }), DeclarationSubstate::AfterStandaloneDecl => match t { - Token::Whitespace(_) => None, Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDeclValue)), - _ => unexpected_token!(t) + Token::Character(c) if is_whitespace_char(c) => None, + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideStandaloneDeclValue => self.read_attribute_value(t, |this, value| { - let standalone = match &value[..] { + let standalone = match &*value { "yes" => Some(true), "no" => Some(false), _ => None @@ -136,16 +166,15 @@ impl PullParser { this.data.standalone = standalone; this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterStandaloneDeclValue)) } else { - Some(self_error!(this; "Invalid standalone declaration value: {}", value)) + Some(this.error(SyntaxError::InvalidStandaloneDeclaration(value.into()))) } }), DeclarationSubstate::AfterStandaloneDeclValue => match t { - Token::Whitespace(_) => None, // skip whitespace - Token::ProcessingInstructionEnd => emit_start_document(self), - _ => unexpected_token!(t) - } + Token::ProcessingInstructionEnd => self.emit_start_document(), + Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), + }, } } - } diff --git a/src/reader/parser/inside_doctype.rs b/src/reader/parser/inside_doctype.rs index 8dcf367..93ea470 100644 --- a/src/reader/parser/inside_doctype.rs +++ b/src/reader/parser/inside_doctype.rs @@ -1,16 +1,235 @@ -use reader::lexer::Token; +use crate::reader::error::SyntaxError; +use crate::common::{is_name_char, is_name_start_char, is_whitespace_char}; +use crate::reader::lexer::Token; -use super::{Result, PullParser, State}; +use super::{DoctypeSubstate, PullParser, QuoteToken, Result, State}; impl PullParser { - pub fn inside_doctype(&mut self, t: Token) -> Option<Result> { - match t { - Token::TagEnd => { - self.lexer.enable_errors(); - self.into_state_continue(State::OutsideTag) - } + pub fn inside_doctype(&mut self, t: Token, substate: DoctypeSubstate) -> Option<Result> { + match substate { + DoctypeSubstate::Outside => match t { + Token::TagEnd => self.into_state_continue(State::OutsideTag), + Token::MarkupDeclarationStart => { + self.buf.clear(); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::InsideName)) + }, + Token::Character('%') => { + self.data.ref_data.clear(); + self.data.ref_data.push('%'); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInDtd)) + }, + Token::CommentStart => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Comment)) + }, + Token::SingleQuote | Token::DoubleQuote => { + // just discard string literals + self.data.quote = Some(super::QuoteToken::from_token(&t)); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::String)) + }, + Token::CDataEnd | Token::CDataStart => Some(self.error(SyntaxError::UnexpectedToken(t))), + // TODO: parse SYSTEM, and [ + _ => None, + }, + DoctypeSubstate::String => match t { + Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { None }, + Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { None }, + Token::SingleQuote | Token::DoubleQuote => { + self.data.quote = None; + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) + }, + _ => None, + }, + DoctypeSubstate::Comment => match t { + Token::CommentEnd => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) + }, + _ => None, + }, + DoctypeSubstate::InsideName => match t { + Token::Character(c @ 'A'..='Z') => { + self.buf.push(c); + None + }, + Token::Character(c) if is_whitespace_char(c) => { + match self.buf.as_str() { + "ENTITY" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityName)), + "NOTATION" | "ELEMENT" | "ATTLIST" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)), + s => Some(self.error(SyntaxError::UnknownMarkupDeclaration(s.into()))), + } - _ => None + }, + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), + }, + DoctypeSubstate::BeforeEntityName => { + self.data.name.clear(); + match t { + Token::Character(c) if is_whitespace_char(c) => None, + Token::Character('%') => { // % is for PEDecl + self.data.name.push('%'); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinitionStart)) + }, + Token::Character(c) if is_name_start_char(c) => { + self.data.name.push(c); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityName)) + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + } + }, + DoctypeSubstate::EntityName => match t { + Token::Character(c) if is_whitespace_char(c) => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue)) + }, + Token::Character(c) if is_name_char(c) => { + self.data.name.push(c); + None + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::BeforeEntityValue => { + self.buf.clear(); + match t { + Token::Character(c) if is_whitespace_char(c) => None, + // SYSTEM/PUBLIC not supported + Token::Character('S' | 'P') => { + let name = self.data.take_name(); + self.entities.entry(name).or_insert_with(String::new); // Dummy value, but at least the name is recognized + + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)) + }, + Token::SingleQuote | Token::DoubleQuote => { + self.data.quote = Some(super::QuoteToken::from_token(&t)); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + } + }, + DoctypeSubstate::EntityValue => match t { + Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { self.buf.push('\''); None }, + Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { self.buf.push('"'); None }, + Token::SingleQuote | Token::DoubleQuote => { + self.data.quote = None; + let name = self.data.take_name(); + let val = self.take_buf(); + self.entities.entry(name).or_insert(val); // First wins + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)) // FIXME + }, + Token::ReferenceStart | Token::Character('&') => { + self.data.ref_data.clear(); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReferenceStart)) + }, + Token::Character('%') => { + self.data.ref_data.clear(); + self.data.ref_data.push('%'); // include literal % in the name to distinguish from regular entities + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInValue)) + }, + Token::Character(c) if !self.is_valid_xml_char(c) => { + Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) + }, + Token::Character(c) => { + self.buf.push(c); + None + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::PEReferenceDefinitionStart => match t { + Token::Character(c) if is_whitespace_char(c) => { + None + }, + Token::Character(c) if is_name_start_char(c) => { + debug_assert_eq!(self.data.name, "%"); + self.data.name.push(c); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinition)) + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::PEReferenceDefinition => match t { + Token::Character(c) if is_name_char(c) => { + self.data.name.push(c); + None + }, + Token::Character(c) if is_whitespace_char(c) => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue)) + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::PEReferenceInDtd => match t { + Token::Character(c) if is_name_char(c) => { + self.data.ref_data.push(c); + None + }, + Token::ReferenceEnd | Token::Character(';') => { + let name = self.data.take_ref_data(); + match self.entities.get(&name) { + Some(ent) => { + if let Err(e) = self.lexer.reparse(ent) { + return Some(Err(e)); + } + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) + }, + None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))), + } + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::PEReferenceInValue => match t { + Token::Character(c) if is_name_char(c) => { + self.data.ref_data.push(c); + None + }, + Token::ReferenceEnd | Token::Character(';') => { + let name = self.data.take_ref_data(); + match self.entities.get(&name) { + Some(ent) => { + self.buf.push_str(ent); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) + }, + None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))), + } + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::NumericReferenceStart => match t { + Token::Character('#') => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReference)) + }, + Token::Character(c) if !self.is_valid_xml_char(c) => { + Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) + }, + Token::Character(c) => { + self.buf.push('&'); + self.buf.push(c); + // named entities are not expanded inside doctype + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::NumericReference => match t { + Token::ReferenceEnd | Token::Character(';') => { + let r = self.data.take_ref_data(); + // https://www.w3.org/TR/xml/#sec-entexpand + match self.numeric_reference_from_str(&r) { + Ok(c) => { + self.buf.push(c); + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) + } + Err(e) => Some(self.error(e)), + } + }, + Token::Character(c) if !self.is_valid_xml_char(c) => { + Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) + }, + Token::Character(c) => { + self.data.ref_data.push(c); + None + }, + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + }, + DoctypeSubstate::SkipDeclaration => match t { + Token::TagEnd => { + self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) + }, + _ => None, + }, } } } diff --git a/src/reader/parser/inside_opening_tag.rs b/src/reader/parser/inside_opening_tag.rs index 533874f..b7f185a 100644 --- a/src/reader/parser/inside_opening_tag.rs +++ b/src/reader/parser/inside_opening_tag.rs @@ -1,26 +1,26 @@ -use common::is_name_start_char; -use attribute::OwnedAttribute; -use namespace; +use crate::reader::error::SyntaxError; +use crate::common::is_name_start_char; +use crate::namespace; +use crate::{attribute::OwnedAttribute, common::is_whitespace_char}; -use reader::lexer::Token; +use crate::reader::lexer::Token; -use super::{Result, PullParser, State, OpeningTagSubstate, QualifiedNameTarget}; +use super::{OpeningTagSubstate, PullParser, QualifiedNameTarget, Result, State}; impl PullParser { pub fn inside_opening_tag(&mut self, t: Token, s: OpeningTagSubstate) -> Option<Result> { - macro_rules! unexpected_token(($t:expr) => (Some(self_error!(self; "Unexpected token inside opening tag: {}", $t)))); match s { OpeningTagSubstate::InsideName => self.read_qualified_name(t, QualifiedNameTarget::OpeningTagNameTarget, |this, token, name| { match name.prefix_ref() { Some(prefix) if prefix == namespace::NS_XML_PREFIX || prefix == namespace::NS_XMLNS_PREFIX => - Some(self_error!(this; "'{:?}' cannot be an element name prefix", name.prefix)), + Some(this.error(SyntaxError::InvalidNamePrefix(prefix.into()))), _ => { this.data.element_name = Some(name.clone()); match token { Token::TagEnd => this.emit_start_element(false), Token::EmptyTagEnd => this.emit_start_element(true), - Token::Whitespace(_) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)), + Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)), _ => unreachable!() } } @@ -28,66 +28,65 @@ impl PullParser { }), OpeningTagSubstate::InsideTag => match t { - Token::Whitespace(_) => None, // skip whitespace + Token::TagEnd => self.emit_start_element(false), + Token::EmptyTagEnd => self.emit_start_element(true), + Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace Token::Character(c) if is_name_start_char(c) => { self.buf.push(c); self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeName)) } - Token::TagEnd => self.emit_start_element(false), - Token::EmptyTagEnd => self.emit_start_element(true), - _ => unexpected_token!(t) + _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) }, OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { this.data.attr_name = Some(name); match token { - Token::Whitespace(_) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)), Token::EqualsSign => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)), + Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)), _ => unreachable!() } }), OpeningTagSubstate::AfterAttributeName => match t { - Token::Whitespace(_) => None, Token::EqualsSign => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)), - _ => unexpected_token!(t) + Token::Character(c) if is_whitespace_char(c) => None, + _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) }, OpeningTagSubstate::InsideAttributeValue => self.read_attribute_value(t, |this, value| { - let name = this.data.take_attr_name().unwrap(); // unwrap() will always succeed here - + let name = this.data.take_attr_name()?; // will always succeed here // check that no attribute with such name is already present // if there is one, XML is not well-formed - if this.data.attributes.iter().find(|a| a.name == name).is_some() { // TODO: looks bad + if this.data.attributes.iter().any(|a| a.name == name) { // TODO: looks bad // TODO: ideally this error should point to the beginning of the attribute, // TODO: not the end of its value - Some(self_error!(this; "Attribute '{}' is redefined", name)) + Some(this.error(SyntaxError::RedefinedAttribute(name.to_string().into()))) } else { match name.prefix_ref() { // declaring a new prefix; it is sufficient to check prefix only // because "xmlns" prefix is reserved Some(namespace::NS_XMLNS_PREFIX) => { - let ln = &name.local_name[..]; + let ln = &*name.local_name; if ln == namespace::NS_XMLNS_PREFIX { - Some(self_error!(this; "Cannot redefine prefix '{}'", namespace::NS_XMLNS_PREFIX)) - } else if ln == namespace::NS_XML_PREFIX && &value[..] != namespace::NS_XML_URI { - Some(self_error!(this; "Prefix '{}' cannot be rebound to another value", namespace::NS_XML_PREFIX)) + Some(this.error(SyntaxError::CannotRedefineXmlnsPrefix)) + } else if ln == namespace::NS_XML_PREFIX && &*value != namespace::NS_XML_URI { + Some(this.error(SyntaxError::CannotRedefineXmlPrefix)) } else if value.is_empty() { - Some(self_error!(this; "Cannot undefine prefix '{}'", ln)) + Some(this.error(SyntaxError::CannotUndefinePrefix(ln.into()))) } else { this.nst.put(name.local_name.clone(), value); - this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue)) } } // declaring default namespace - None if &name.local_name[..] == namespace::NS_XMLNS_PREFIX => - match &value[..] { - namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX => - Some(self_error!(this; "Namespace '{}' cannot be default", value)), + None if &*name.local_name == namespace::NS_XMLNS_PREFIX => + match &*value { + namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX | namespace::NS_XML_URI | namespace::NS_XMLNS_URI => + Some(this.error(SyntaxError::InvalidDefaultNamespace(value.into()))), _ => { this.nst.put(namespace::NS_NO_PREFIX, value.clone()); - this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue)) } }, @@ -95,14 +94,20 @@ impl PullParser { _ => { this.data.attributes.push(OwnedAttribute { name: name.clone(), - value: value + value }); - this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue)) } } } - }) + }), + + OpeningTagSubstate::AfterAttributeValue => match t { + Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)), + Token::TagEnd => self.emit_start_element(false), + Token::EmptyTagEnd => self.emit_start_element(true), + _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) + }, } } - } diff --git a/src/reader/parser/inside_processing_instruction.rs b/src/reader/parser/inside_processing_instruction.rs index 8ddf6b8..96f6753 100644 --- a/src/reader/parser/inside_processing_instruction.rs +++ b/src/reader/parser/inside_processing_instruction.rs @@ -1,18 +1,20 @@ -use common::{ - is_name_start_char, is_name_char, -}; +use crate::reader::error::SyntaxError; +use crate::common::{is_name_char, is_name_start_char, is_whitespace_char}; -use reader::events::XmlEvent; -use reader::lexer::Token; +use crate::reader::events::XmlEvent; +use crate::reader::lexer::Token; -use super::{Result, PullParser, State, ProcessingInstructionSubstate, DeclarationSubstate}; +use super::{DeclarationSubstate, ProcessingInstructionSubstate, PullParser, Result, State, Encountered}; impl PullParser { pub fn inside_processing_instruction(&mut self, t: Token, s: ProcessingInstructionSubstate) -> Option<Result> { match s { ProcessingInstructionSubstate::PIInsideName => match t { - Token::Character(c) if !self.buf_has_data() && is_name_start_char(c) || - self.buf_has_data() && is_name_char(c) => self.append_char_continue(c), + Token::Character(c) if self.buf.is_empty() && is_name_start_char(c) || + self.buf_has_data() && is_name_char(c) => { + self.buf.push(c); + None + }, Token::ProcessingInstructionEnd => { // self.buf contains PI name @@ -20,70 +22,83 @@ impl PullParser { // Don't need to check for declaration because it has mandatory attributes // but there is none - match &name[..] { + match &*name { // Name is empty, it is an error - "" => Some(self_error!(self; "Encountered processing instruction without name")), + "" => Some(self.error(SyntaxError::ProcessingInstructionWithoutName)), // Found <?xml-like PI not at the beginning of a document, // it is an error - see section 2.6 of XML 1.1 spec - "xml"|"xmL"|"xMl"|"xML"|"Xml"|"XmL"|"XMl"|"XML" => - Some(self_error!(self; "Invalid processing instruction: <?{}", name)), + n if "xml".eq_ignore_ascii_case(n) => + Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))), // All is ok, emitting event _ => { - self.into_state_emit( - State::OutsideTag, - Ok(XmlEvent::ProcessingInstruction { - name: name, - data: None - }) - ) + debug_assert!(self.next_event.is_none(), "{:?}", self.next_event); + // can't have a PI before `<?xml` + let event1 = self.set_encountered(Encountered::Declaration); + let event2 = Some(Ok(XmlEvent::ProcessingInstruction { + name, + data: None + })); + // emitting two events at once is cumbersome + let event1 = if event1.is_some() { + self.next_event = event2; + event1 + } else { + event2 + }; + self.into_state(State::OutsideTag, event1) } } } - Token::Whitespace(_) => { + Token::Character(c) if is_whitespace_char(c) => { // self.buf contains PI name let name = self.take_buf(); - match &name[..] { + match &*name { // We have not ever encountered an element and have not parsed XML declaration - "xml" if !self.encountered_element && !self.parsed_declaration => + "xml" if self.encountered == Encountered::None => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeVersion)), // Found <?xml-like PI after the beginning of a document, // it is an error - see section 2.6 of XML 1.1 spec - "xml"|"xmL"|"xMl"|"xML"|"Xml"|"XmL"|"XMl"|"XML" - if self.encountered_element || self.parsed_declaration => - Some(self_error!(self; "Invalid processing instruction: <?{}", name)), + n if "xml".eq_ignore_ascii_case(n) => + Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))), // All is ok, starting parsing PI data _ => { - self.lexer.disable_errors(); // data is arbitrary, so disable errors self.data.name = name; - self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideData)) + // can't have a PI before `<?xml` + let next_event = self.set_encountered(Encountered::Declaration); + self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideData), next_event) } - } } - _ => Some(self_error!(self; "Unexpected token: <?{}{}", self.buf, t)) + _ => { + let buf = self.take_buf(); + Some(self.error(SyntaxError::UnexpectedProcessingInstruction(buf.into(), t))) + } }, ProcessingInstructionSubstate::PIInsideData => match t { Token::ProcessingInstructionEnd => { - self.lexer.enable_errors(); let name = self.data.take_name(); let data = self.take_buf(); self.into_state_emit( State::OutsideTag, Ok(XmlEvent::ProcessingInstruction { - name: name, - data: Some(data) - }) + name, + data: Some(data), + }), ) }, + Token::Character(c) if !self.is_valid_xml_char(c) => { + Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) + }, + // Any other token should be treated as plain characters _ => { t.push_to_string(&mut self.buf); @@ -92,5 +107,4 @@ impl PullParser { }, } } - } diff --git a/src/reader/parser/inside_reference.rs b/src/reader/parser/inside_reference.rs index 60026d5..9a15e09 100644 --- a/src/reader/parser/inside_reference.rs +++ b/src/reader/parser/inside_reference.rs @@ -1,13 +1,11 @@ +use crate::reader::error::SyntaxError; use std::char; - -use common::{is_name_start_char, is_name_char, is_whitespace_str}; - -use reader::lexer::Token; - -use super::{Result, PullParser, State}; +use crate::common::{is_name_char, is_name_start_char, is_whitespace_char}; +use crate::reader::lexer::Token; +use super::{PullParser, Result, State}; impl PullParser { - pub fn inside_reference(&mut self, t: Token, prev_st: State) -> Option<Result> { + pub fn inside_reference(&mut self, t: Token) -> Option<Result> { match t { Token::Character(c) if !self.data.ref_data.is_empty() && is_name_char(c) || self.data.ref_data.is_empty() && (is_name_start_char(c) || c == '#') => { @@ -16,74 +14,64 @@ impl PullParser { } Token::ReferenceEnd => { - // TODO: check for unicode correctness let name = self.data.take_ref_data(); - let name_len = name.len(); // compute once - let c = match &name[..] { - "lt" => Ok('<'.to_string()), - "gt" => Ok('>'.to_string()), - "amp" => Ok('&'.to_string()), - "apos" => Ok('\''.to_string()), - "quot" => Ok('"'.to_string()), - "" => Err(self_error!(self; "Encountered empty entity")), - _ if name_len > 2 && name.starts_with("#x") => { - let num_str = &name[2..name_len]; - if num_str == "0" { - Err(self_error!(self; "Null character entity is not allowed")) - } else { - if self.config.replace_unknown_entity_references { - match u32::from_str_radix(num_str, 16).ok().map(|i| char::from_u32(i).unwrap_or('\u{fffd}')) { - Some(c) => Ok(c.to_string()), - None => Err(self_error!(self; "Invalid hexadecimal character number in an entity: {}", name)) - } - } else { - match u32::from_str_radix(num_str, 16).ok().and_then(char::from_u32) { - Some(c) => Ok(c.to_string()), - None => Err(self_error!(self; "Invalid hexadecimal character number in an entity: {}", name)) - } - } - } - } - _ if name_len > 1 && name.starts_with('#') => { - let num_str = &name[1..name_len]; - if num_str == "0" { - Err(self_error!(self; "Null character entity is not allowed")) - } else { - if self.config.replace_unknown_entity_references { - match u32::from_str_radix(num_str, 10).ok().map(|i| char::from_u32(i).unwrap_or('\u{fffd}')) { - Some(c) => Ok(c.to_string()), - None => Err(self_error!(self; "Invalid decimal character number in an entity: {}", name)) - } - } - else { - match u32::from_str_radix(num_str, 10).ok().and_then(char::from_u32) { - Some(c) => Ok(c.to_string()), - None => Err(self_error!(self; "Invalid decimal character number in an entity: {}", name)) - } - } - } + if name.is_empty() { + return Some(self.error(SyntaxError::EmptyEntity)); + } + + let c = match &*name { + "lt" => Some('<'), + "gt" => Some('>'), + "amp" => Some('&'), + "apos" => Some('\''), + "quot" => Some('"'), + _ if name.starts_with('#') => match self.numeric_reference_from_str(&name[1..]) { + Ok(c) => Some(c), + Err(e) => return Some(self.error(e)) }, - _ => { - if let Some(v) = self.config.extra_entities.get(&name) { - Ok(v.clone()) - } else { - Err(self_error!(self; "Unexpected entity: {}", name)) - } - } + _ => None, }; - match c { - Ok(c) => { - self.buf.push_str(&c); - if prev_st == State::OutsideTag && !is_whitespace_str(&c) { - self.inside_whitespace = false; + if let Some(c) = c { + self.buf.push(c); + } else if let Some(v) = self.config.c.extra_entities.get(&name) { + self.buf.push_str(v); + } else if let Some(v) = self.entities.get(&name) { + if self.state_after_reference == State::OutsideTag { + // an entity can expand to *elements*, so outside of a tag it needs a full reparse + if let Err(e) = self.lexer.reparse(v) { + return Some(Err(e)); } - self.into_state_continue(prev_st) + } else { + // however, inside attributes it's not allowed to affect attribute quoting, + // so it can't be fed to the lexer + self.buf.push_str(v); } - Err(e) => Some(e) + } else { + return Some(self.error(SyntaxError::UnexpectedEntity(name.into()))); + } + let prev_st = self.state_after_reference; + if prev_st == State::OutsideTag && !is_whitespace_char(self.buf.chars().last().unwrap_or('\0')) { + self.inside_whitespace = false; } + self.into_state_continue(prev_st) } - _ => Some(self_error!(self; "Unexpected token inside an entity: {}", t)) + _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), + } + } + + pub(crate) fn numeric_reference_from_str(&self, num_str: &str) -> std::result::Result<char, SyntaxError> { + let val = if let Some(hex) = num_str.strip_prefix('x') { + u32::from_str_radix(hex, 16).map_err(move |_| SyntaxError::InvalidNumericEntity(num_str.into()))? + } else { + u32::from_str_radix(num_str, 10).map_err(move |_| SyntaxError::InvalidNumericEntity(num_str.into()))? + }; + match char::from_u32(val) { + Some(c) if self.is_valid_xml_char(c) => Ok(c), + None if self.config.c.replace_unknown_entity_references => { + Ok('\u{fffd}') + }, + _ => Err(SyntaxError::InvalidCharacterEntity(val)), } } } diff --git a/src/reader/parser/outside_tag.rs b/src/reader/parser/outside_tag.rs index d3f7598..8104224 100644 --- a/src/reader/parser/outside_tag.rs +++ b/src/reader/parser/outside_tag.rs @@ -1,130 +1,196 @@ -use common::is_whitespace_char; - -use reader::events::XmlEvent; -use reader::lexer::Token; +use crate::reader::error::SyntaxError; +use crate::common::is_whitespace_char; +use crate::reader::events::XmlEvent; +use crate::reader::lexer::Token; use super::{ - Result, PullParser, State, ClosingTagSubstate, OpeningTagSubstate, - ProcessingInstructionSubstate, DEFAULT_VERSION, DEFAULT_ENCODING, DEFAULT_STANDALONE + ClosingTagSubstate, DoctypeSubstate, Encountered, OpeningTagSubstate, + ProcessingInstructionSubstate, PullParser, Result, State, }; impl PullParser { pub fn outside_tag(&mut self, t: Token) -> Option<Result> { match t { - Token::ReferenceStart => - self.into_state_continue(State::InsideReference(Box::new(State::OutsideTag))), - - Token::Whitespace(_) if self.depth() == 0 && self.config.ignore_root_level_whitespace => None, // skip whitespace outside of the root element + Token::Character(c) => { + if is_whitespace_char(c) { + // skip whitespace outside of the root element + if (self.config.c.trim_whitespace && self.buf.is_empty()) || + (self.depth() == 0 && self.config.c.ignore_root_level_whitespace) { + return None; + } + } else { + self.inside_whitespace = false; + if self.depth() == 0 { + return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); + } + } - Token::Whitespace(_) if self.config.trim_whitespace && !self.buf_has_data() => None, + if !self.is_valid_xml_char_not_restricted(c) { + return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))); + } - Token::Whitespace(c) => { - if !self.buf_has_data() { + if self.buf.is_empty() { self.push_pos(); } - self.append_char_continue(c) - } - - _ if t.contains_char_data() && self.depth() == 0 => - Some(self_error!(self; "Unexpected characters outside the root element: {}", t)), + self.buf.push(c); + None + }, - _ if t.contains_char_data() => { // Non-whitespace char data - if !self.buf_has_data() { - self.push_pos(); + Token::CommentEnd | Token::TagEnd | Token::EqualsSign | + Token::DoubleQuote | Token::SingleQuote | + Token::ProcessingInstructionEnd | Token::EmptyTagEnd => { + if self.depth() == 0 { + return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); } self.inside_whitespace = false; - t.push_to_string(&mut self.buf); + + if let Some(s) = t.as_static_str() { + if self.buf.is_empty() { + self.push_pos(); + } + self.buf.push_str(s); + } None - } + }, + + Token::ReferenceStart if self.depth() > 0 => { + self.state_after_reference = State::OutsideTag; + self.into_state_continue(State::InsideReference) + }, - Token::ReferenceEnd => { // Semi-colon in a text outside an entity + Token::ReferenceEnd if self.depth() > 0 => { // Semi-colon in a text outside an entity self.inside_whitespace = false; Token::ReferenceEnd.push_to_string(&mut self.buf); None - } + }, - Token::CommentStart if self.config.coalesce_characters && self.config.ignore_comments => { + Token::CommentStart if self.config.c.coalesce_characters && self.config.c.ignore_comments => { + let next_event = self.set_encountered(Encountered::Comment); // We need to switch the lexer into a comment mode inside comments - self.lexer.inside_comment(); - self.into_state_continue(State::InsideComment) + self.into_state(State::InsideComment, next_event) } - Token::CDataStart if self.config.coalesce_characters && self.config.cdata_to_characters => { - if !self.buf_has_data() { + Token::CDataStart if self.depth() > 0 && self.config.c.coalesce_characters && self.config.c.cdata_to_characters => { + if self.buf.is_empty() { self.push_pos(); } - // We need to disable lexing errors inside CDATA - self.lexer.disable_errors(); self.into_state_continue(State::InsideCData) - } + }, _ => { // Encountered some markup event, flush the buffer as characters // or a whitespace let mut next_event = if self.buf_has_data() { let buf = self.take_buf(); - if self.inside_whitespace && self.config.trim_whitespace { + if self.inside_whitespace && self.config.c.trim_whitespace { None - } else if self.inside_whitespace && !self.config.whitespace_to_characters { + } else if self.inside_whitespace && !self.config.c.whitespace_to_characters { Some(Ok(XmlEvent::Whitespace(buf))) - } else if self.config.trim_whitespace { + } else if self.config.c.trim_whitespace { Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into()))) } else { Some(Ok(XmlEvent::Characters(buf))) } } else { None }; self.inside_whitespace = true; // Reset inside_whitespace flag - self.push_pos(); - match t { - Token::ProcessingInstructionStart => - self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event), - - Token::DoctypeStart if !self.encountered_element => { - // We don't have a doctype event so skip this position - // FIXME: update when we have a doctype event - self.next_pos(); - self.lexer.disable_errors(); - self.into_state(State::InsideDoctype, next_event) - } - Token::OpeningTagStart => { - // If declaration was not parsed and we have encountered an element, - // emit this declaration as the next event. - if !self.parsed_declaration { - self.parsed_declaration = true; - let sd_event = XmlEvent::StartDocument { - version: DEFAULT_VERSION, - encoding: DEFAULT_ENCODING.into(), - standalone: DEFAULT_STANDALONE - }; - // next_event is always none here because we're outside of - // the root element - next_event = Some(Ok(sd_event)); - self.push_pos(); + // pos is popped whenever an event is emitted, so pushes must happen only if there will be an event to balance it + // and ignored comments don't pop + if t != Token::CommentStart || !self.config.c.ignore_comments { + self.push_pos(); + } + match t { + Token::OpeningTagStart if self.depth() > 0 || self.encountered < Encountered::Element || self.config.allow_multiple_root_elements => { + if let Some(e) = self.set_encountered(Encountered::Element) { + next_event = Some(e); } - self.encountered_element = true; self.nst.push_empty(); self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event) - } + }, Token::ClosingTagStart if self.depth() > 0 => self.into_state(State::InsideClosingTag(ClosingTagSubstate::CTInsideName), next_event), Token::CommentStart => { + if let Some(e) = self.set_encountered(Encountered::Comment) { + next_event = Some(e); + } // We need to switch the lexer into a comment mode inside comments - self.lexer.inside_comment(); self.into_state(State::InsideComment, next_event) - } + }, + + Token::DoctypeStart if self.encountered < Encountered::Doctype => { + if let Some(e) = self.set_encountered(Encountered::Doctype) { + next_event = Some(e); + } - Token::CDataStart => { - // We need to disable lexing errors inside CDATA - self.lexer.disable_errors(); + // We don't have a doctype event so skip this position + // FIXME: update when we have a doctype event + self.next_pos(); + self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event) + }, + + Token::ProcessingInstructionStart => + self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event), + + Token::CDataStart if self.depth() > 0 => { self.into_state(State::InsideCData, next_event) - } + }, - _ => Some(self_error!(self; "Unexpected token: {}", t)) + _ => Some(self.error(SyntaxError::UnexpectedToken(t))) } } } } + + pub fn document_start(&mut self, t: Token) -> Option<Result> { + debug_assert!(self.encountered < Encountered::Declaration); + + match t { + Token::Character(c) => { + let next_event = self.set_encountered(Encountered::AnyChars); + + if !is_whitespace_char(c) { + return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); + } + self.inside_whitespace = true; + + // skip whitespace outside of the root element + if (self.config.c.trim_whitespace && self.buf.is_empty()) || + (self.depth() == 0 && self.config.c.ignore_root_level_whitespace) { + return self.into_state(State::OutsideTag, next_event); + } + + self.push_pos(); + self.buf.push(c); + self.into_state(State::OutsideTag, next_event) + }, + + Token::CommentStart => { + let next_event = self.set_encountered(Encountered::Comment); + self.into_state(State::InsideComment, next_event) + } + + Token::OpeningTagStart => { + let next_event = self.set_encountered(Encountered::Element); + self.nst.push_empty(); + self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event) + }, + + Token::DoctypeStart => { + let next_event = self.set_encountered(Encountered::Doctype); + // We don't have a doctype event so skip this position + // FIXME: update when we have a doctype event + self.next_pos(); + self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event) + }, + + Token::ProcessingInstructionStart => { + self.push_pos(); + self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName)) + }, + + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), + } + } } diff --git a/src/util.rs b/src/util.rs index 23fee04..07d0336 100644 --- a/src/util.rs +++ b/src/util.rs @@ -1,107 +1,305 @@ -use std::io::{self, Read}; -use std::str; use std::fmt; +use std::io::{self, Read}; +use std::str::{self, FromStr}; #[derive(Debug)] pub enum CharReadError { UnexpectedEof, Utf8(str::Utf8Error), - Io(io::Error) + Io(io::Error), } impl From<str::Utf8Error> for CharReadError { + #[cold] fn from(e: str::Utf8Error) -> CharReadError { CharReadError::Utf8(e) } } impl From<io::Error> for CharReadError { + #[cold] fn from(e: io::Error) -> CharReadError { CharReadError::Io(e) } } impl fmt::Display for CharReadError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - use self::CharReadError::*; + #[cold] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use self::CharReadError::{Io, UnexpectedEof, Utf8}; match *self { UnexpectedEof => write!(f, "unexpected end of stream"), - Utf8(ref e) => write!(f, "UTF-8 decoding error: {}", e), - Io(ref e) => write!(f, "I/O error: {}", e) + Utf8(ref e) => write!(f, "UTF-8 decoding error: {e}"), + Io(ref e) => write!(f, "I/O error: {e}"), } } } -pub fn next_char_from<R: Read>(source: &mut R) -> Result<Option<char>, CharReadError> { - const MAX_CODEPOINT_LEN: usize = 4; +/// Character encoding used for parsing +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +#[non_exhaustive] +pub enum Encoding { + /// Explicitly UTF-8 only + Utf8, + /// UTF-8 fallback, but can be any 8-bit encoding + Default, + /// ISO-8859-1 + Latin1, + /// US-ASCII + Ascii, + /// Big-Endian + Utf16Be, + /// Little-Endian + Utf16Le, + /// Unknown endianness yet, will be sniffed + Utf16, + /// Not determined yet, may be sniffed to be anything + Unknown, +} - let mut bytes = source.bytes(); - let mut buf = [0u8; MAX_CODEPOINT_LEN]; - let mut pos = 0; +// Rustc inlines eq_ignore_ascii_case and creates kilobytes of code! +#[inline(never)] +fn icmp(lower: &str, varcase: &str) -> bool { + lower.bytes().zip(varcase.bytes()).all(|(l, v)| l == v.to_ascii_lowercase()) +} - loop { - let next = match bytes.next() { - Some(Ok(b)) => b, - Some(Err(e)) => return Err(e.into()), - None if pos == 0 => return Ok(None), - None => return Err(CharReadError::UnexpectedEof) - }; - buf[pos] = next; - pos += 1; +impl FromStr for Encoding { + type Err = &'static str; + + fn from_str(val: &str) -> Result<Self, Self::Err> { + if ["utf-8", "utf8"].into_iter().any(move |label| icmp(label, val)) { + Ok(Encoding::Utf8) + } else if ["iso-8859-1", "latin1"].into_iter().any(move |label| icmp(label, val)) { + Ok(Encoding::Latin1) + } else if ["utf-16", "utf16"].into_iter().any(move |label| icmp(label, val)) { + Ok(Encoding::Utf16) + } else if ["ascii", "us-ascii"].into_iter().any(move |label| icmp(label, val)) { + Ok(Encoding::Ascii) + } else { + Err("unknown encoding name") + } + } +} + +impl fmt::Display for Encoding { + #[cold] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(match self { + Encoding::Utf8 => "UTF-8", + Encoding::Default => "UTF-8", + Encoding::Latin1 => "ISO-8859-1", + Encoding::Ascii => "US-ASCII", + Encoding::Utf16Be => "UTF-16", + Encoding::Utf16Le => "UTF-16", + Encoding::Utf16 => "UTF-16", + Encoding::Unknown => "(unknown)", + }) + } +} + +pub(crate) struct CharReader { + pub encoding: Encoding, +} + +impl CharReader { + pub fn new() -> Self { + Self { + encoding: Encoding::Unknown, + } + } + + pub fn next_char_from<R: Read>(&mut self, source: &mut R) -> Result<Option<char>, CharReadError> { + let mut bytes = source.bytes(); + const MAX_CODEPOINT_LEN: usize = 4; + + let mut buf = [0u8; MAX_CODEPOINT_LEN]; + let mut pos = 0; + loop { + let next = match bytes.next() { + Some(Ok(b)) => b, + Some(Err(e)) => return Err(e.into()), + None if pos == 0 => return Ok(None), + None => return Err(CharReadError::UnexpectedEof), + }; + + match self.encoding { + Encoding::Utf8 | Encoding::Default => { + // fast path for ASCII subset + if pos == 0 && next.is_ascii() { + return Ok(Some(next.into())); + } - match str::from_utf8(&buf[..pos]) { - Ok(s) => return Ok(s.chars().next()), // always Some(..) - Err(_) if pos < MAX_CODEPOINT_LEN => {}, - Err(e) => return Err(e.into()) + buf[pos] = next; + pos += 1; + + match str::from_utf8(&buf[..pos]) { + Ok(s) => return Ok(s.chars().next()), // always Some(..) + Err(_) if pos < MAX_CODEPOINT_LEN => continue, + Err(e) => return Err(e.into()), + } + }, + Encoding::Latin1 => { + return Ok(Some(next.into())); + }, + Encoding::Ascii => { + if next.is_ascii() { + return Ok(Some(next.into())); + } else { + return Err(CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, "char is not ASCII"))); + } + }, + Encoding::Unknown | Encoding::Utf16 => { + buf[pos] = next; + pos += 1; + + // sniff BOM + if pos <= 3 && buf[..pos] == [0xEF, 0xBB, 0xBF][..pos] { + if pos == 3 && self.encoding != Encoding::Utf16 { + pos = 0; + self.encoding = Encoding::Utf8; + } + } else if pos <= 2 && buf[..pos] == [0xFE, 0xFF][..pos] { + if pos == 2 { + pos = 0; + self.encoding = Encoding::Utf16Be; + } + } else if pos <= 2 && buf[..pos] == [0xFF, 0xFE][..pos] { + if pos == 2 { + pos = 0; + self.encoding = Encoding::Utf16Le; + } + } else if pos == 1 && self.encoding == Encoding::Utf16 { + // sniff ASCII char in UTF-16 + self.encoding = if next == 0 { Encoding::Utf16Be } else { Encoding::Utf16Le }; + } else { + // UTF-8 is the default, but XML decl can change it to other 8-bit encoding + self.encoding = Encoding::Default; + if pos == 1 && next.is_ascii() { + return Ok(Some(next.into())); + } + } + }, + Encoding::Utf16Be => { + buf[pos] = next; + pos += 1; + if pos == 2 { + if let Some(Ok(c)) = char::decode_utf16([u16::from_be_bytes(buf[..2].try_into().unwrap())]).next() { + return Ok(Some(c)); + } + } else if pos == 4 { // surrogate + return char::decode_utf16([u16::from_be_bytes(buf[..2].try_into().unwrap()), u16::from_be_bytes(buf[2..4].try_into().unwrap())]) + .next().transpose() + .map_err(|e| CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, e))); + } + }, + Encoding::Utf16Le => { + buf[pos] = next; + pos += 1; + if pos == 2 { + if let Some(Ok(c)) = char::decode_utf16([u16::from_le_bytes(buf[..2].try_into().unwrap())]).next() { + return Ok(Some(c)); + } + } else if pos == 4 { // surrogate + return char::decode_utf16([u16::from_le_bytes(buf[..2].try_into().unwrap()), u16::from_le_bytes(buf[2..4].try_into().unwrap())]) + .next().transpose() + .map_err(|e| CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, e))); + } + }, + } } } } #[cfg(test)] mod tests { + use super::{CharReadError, CharReader, Encoding}; + #[test] fn test_next_char_from() { use std::io; - use std::error::Error; let mut bytes: &[u8] = "correct".as_bytes(); // correct ASCII - assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('c')); + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('c')); + + let mut bytes: &[u8] = b"\xEF\xBB\xBF\xE2\x80\xA2!"; // BOM + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('•')); + + let mut bytes: &[u8] = b"\xEF\xBB\xBFx123"; // BOM + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('x')); + + let mut bytes: &[u8] = b"\xEF\xBB\xBF"; // Nothing after BOM + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None); + + let mut bytes: &[u8] = b"\xEF\xBB"; // Nothing after BO + assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(CharReadError::UnexpectedEof))); + + let mut bytes: &[u8] = b"\xEF\xBB\x42"; // Nothing after BO + assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(_))); + + let mut bytes: &[u8] = b"\xFE\xFF\x00\x42"; // UTF-16 + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('B')); + + let mut bytes: &[u8] = b"\xFF\xFE\x42\x00"; // UTF-16 + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('B')); + + let mut bytes: &[u8] = b"\xFF\xFE"; // UTF-16 + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None); + + let mut bytes: &[u8] = b"\xFF\xFE\x00"; // UTF-16 + assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(CharReadError::UnexpectedEof))); let mut bytes: &[u8] = "правильно".as_bytes(); // correct BMP - assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('п')); + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('п')); + + let mut bytes: &[u8] = "правильно".as_bytes(); + assert_eq!(CharReader { encoding: Encoding::Utf16Be }.next_char_from(&mut bytes).unwrap(), Some('킿')); + + let mut bytes: &[u8] = "правильно".as_bytes(); + assert_eq!(CharReader { encoding: Encoding::Utf16Le }.next_char_from(&mut bytes).unwrap(), Some('뿐')); + + let mut bytes: &[u8] = b"\xD8\xD8\x80"; + assert!(matches!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes), Err(_))); + + let mut bytes: &[u8] = b"\x00\x42"; + assert_eq!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes).unwrap(), Some('B')); + + let mut bytes: &[u8] = b"\x42\x00"; + assert_eq!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes).unwrap(), Some('B')); + + let mut bytes: &[u8] = b"\x00"; + assert!(matches!(CharReader { encoding: Encoding::Utf16Be }.next_char_from(&mut bytes), Err(_))); let mut bytes: &[u8] = "😊".as_bytes(); // correct non-BMP - assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('😊')); + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('😊')); let mut bytes: &[u8] = b""; // empty - assert_eq!(super::next_char_from(&mut bytes).unwrap(), None); + assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None); let mut bytes: &[u8] = b"\xf0\x9f\x98"; // incomplete code point - match super::next_char_from(&mut bytes).unwrap_err() { + match CharReader::new().next_char_from(&mut bytes).unwrap_err() { super::CharReadError::UnexpectedEof => {}, - e => panic!("Unexpected result: {:?}", e) + e => panic!("Unexpected result: {e:?}") }; let mut bytes: &[u8] = b"\xff\x9f\x98\x32"; // invalid code point - match super::next_char_from(&mut bytes).unwrap_err() { + match CharReader::new().next_char_from(&mut bytes).unwrap_err() { super::CharReadError::Utf8(_) => {}, - e => panic!("Unexpected result: {:?}", e) + e => panic!("Unexpected result: {e:?}") }; - // error during read struct ErrorReader; impl io::Read for ErrorReader { - fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { + fn read(&mut self, _: &mut [u8]) -> io::Result<usize> { Err(io::Error::new(io::ErrorKind::Other, "test error")) } } let mut r = ErrorReader; - match super::next_char_from(&mut r).unwrap_err() { + match CharReader::new().next_char_from(&mut r).unwrap_err() { super::CharReadError::Io(ref e) if e.kind() == io::ErrorKind::Other && - e.description() == "test error" => {}, - e => panic!("Unexpected result: {:?}", e) + e.to_string().contains("test error") => {}, + e => panic!("Unexpected result: {e:?}") } } } diff --git a/src/writer/mod.rs b/src/writer.rs index ea1b242..e2b70ec 100644 --- a/src/writer/mod.rs +++ b/src/writer.rs @@ -3,24 +3,24 @@ //! The most important type in this module is `EventWriter` which allows writing an XML document //! to some output stream. -pub use self::emitter::Result; -pub use self::emitter::EmitterError as Error; pub use self::config::EmitterConfig; +pub use self::emitter::EmitterError as Error; +pub use self::emitter::Result; pub use self::events::XmlEvent; use self::emitter::Emitter; use std::io::prelude::*; -mod emitter; mod config; +mod emitter; pub mod events; /// A wrapper around an `std::io::Write` instance which emits XML document according to provided /// events. pub struct EventWriter<W> { sink: W, - emitter: Emitter + emitter: Emitter, } impl<W: Write> EventWriter<W> { @@ -37,7 +37,7 @@ impl<W: Write> EventWriter<W> { pub fn new_with_config(sink: W, config: EmitterConfig) -> EventWriter<W> { EventWriter { sink, - emitter: Emitter::new(config) + emitter: Emitter::new(config), } } @@ -63,12 +63,9 @@ impl<W: Write> EventWriter<W> { self.emitter.namespace_stack_mut().try_pop(); r } - XmlEvent::Comment(content) => - self.emitter.emit_comment(&mut self.sink, content), - XmlEvent::CData(content) => - self.emitter.emit_cdata(&mut self.sink, content), - XmlEvent::Characters(content) => - self.emitter.emit_characters(&mut self.sink, content) + XmlEvent::Comment(content) => self.emitter.emit_comment(&mut self.sink, content), + XmlEvent::CData(content) => self.emitter.emit_cdata(&mut self.sink, content), + XmlEvent::Characters(content) => self.emitter.emit_characters(&mut self.sink, content), } } diff --git a/src/writer/config.rs b/src/writer/config.rs index ebabf18..c7841bc 100644 --- a/src/writer/config.rs +++ b/src/writer/config.rs @@ -1,9 +1,8 @@ //! Contains emitter configuration structure. -use std::io::Write; use std::borrow::Cow; - -use writer::EventWriter; +use std::io::Write; +use crate::writer::EventWriter; /// Emitter configuration structure. /// @@ -98,10 +97,11 @@ impl EmitterConfig { /// .normalize_empty_elements(false); /// ``` #[inline] + #[must_use] pub fn new() -> EmitterConfig { EmitterConfig { line_separator: "\n".into(), - indent_string: " ".into(), // two spaces + indent_string: " ".into(), // two spaces perform_indent: false, perform_escaping: true, write_document_declaration: true, @@ -109,7 +109,7 @@ impl EmitterConfig { cdata_to_characters: false, keep_element_names_stack: true, autopad_comments: true, - pad_self_closing: true + pad_self_closing: true, } } diff --git a/src/writer/emitter.rs b/src/writer/emitter.rs index ba80f66..8e74b5f 100644 --- a/src/writer/emitter.rs +++ b/src/writer/emitter.rs @@ -1,18 +1,17 @@ +use std::error::Error; +use std::fmt; use std::io; use std::io::prelude::*; -use std::fmt; use std::result; -use std::borrow::Cow; -use std::error::Error; -use common; -use name::{Name, OwnedName}; -use attribute::Attribute; -use escape::{escape_str_attribute, escape_str_pcdata}; -use common::XmlVersion; -use namespace::{NamespaceStack, NS_NO_PREFIX, NS_EMPTY_URI, NS_XMLNS_PREFIX, NS_XML_PREFIX}; +use crate::attribute::Attribute; +use crate::common; +use crate::common::XmlVersion; +use crate::escape::{AttributeEscapes, Escaped, PcDataEscapes}; +use crate::name::{Name, OwnedName}; +use crate::namespace::{NamespaceStack, NS_EMPTY_URI, NS_NO_PREFIX, NS_XMLNS_PREFIX, NS_XML_PREFIX}; -use writer::config::EmitterConfig; +use crate::writer::config::EmitterConfig; /// An error which may be returned by `XmlWriter` when writing XML events. #[derive(Debug)] @@ -32,47 +31,35 @@ pub enum EmitterError { /// End element name is not specified when it is needed, for example, when automatic /// closing is not enabled in configuration. - EndElementNameIsNotSpecified + EndElementNameIsNotSpecified, } impl From<io::Error> for EmitterError { + #[cold] fn from(err: io::Error) -> EmitterError { EmitterError::Io(err) } } impl fmt::Display for EmitterError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - - write!(f, "emitter error: ")?; - match *self { - EmitterError::Io(ref e) => - write!(f, "I/O error: {}", e), - ref other => - write!(f, "{}", other.description()), + #[cold] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("emitter error: ")?; + match self { + EmitterError::Io(e) => write!(f, "I/O error: {e}"), + EmitterError::DocumentStartAlreadyEmitted => f.write_str("document start event has already been emitted"), + EmitterError::LastElementNameNotAvailable => f.write_str("last element name is not available"), + EmitterError::EndElementNameIsNotEqualToLastStartElementName => f.write_str("end element name is not equal to last start element name"), + EmitterError::EndElementNameIsNotSpecified => f.write_str("end element name is not specified and can't be inferred"), } } } impl Error for EmitterError { - fn description(&self) -> &str { - match *self { - EmitterError::Io(_) => - "I/O error", - EmitterError::DocumentStartAlreadyEmitted => - "document start event has already been emitted", - EmitterError::LastElementNameNotAvailable => - "last element name is not available", - EmitterError::EndElementNameIsNotEqualToLastStartElementName => - "end element name is not equal to last start element name", - EmitterError::EndElementNameIsNotSpecified => - "end element name is not specified and can't be inferred", - } - } } /// A result type yielded by `XmlWriter`. -pub type Result<T> = result::Result<T, EmitterError>; +pub type Result<T, E = EmitterError> = result::Result<T, E>; // TODO: split into a low-level fast writer without any checks and formatting logic and a // high-level indenting validating writer @@ -87,23 +74,26 @@ pub struct Emitter { element_names: Vec<OwnedName>, start_document_emitted: bool, - just_wrote_start_element: bool + just_wrote_start_element: bool, } impl Emitter { pub fn new(config: EmitterConfig) -> Emitter { + let mut indent_stack = Vec::with_capacity(16); + indent_stack.push(IndentFlags::WroteNothing); + Emitter { config, nst: NamespaceStack::empty(), indent_level: 0, - indent_stack: vec![IndentFlags::WroteNothing], + indent_stack, element_names: Vec::new(), start_document_emitted: false, - just_wrote_start_element: false + just_wrote_start_element: false, } } } @@ -124,27 +114,26 @@ impl Emitter { #[inline] fn wrote_text(&self) -> bool { - *self.indent_stack.last().unwrap() == IndentFlags::WroteText + self.indent_stack.last().map_or(false, |&e| e == IndentFlags::WroteText) } #[inline] fn wrote_markup(&self) -> bool { - *self.indent_stack.last().unwrap() == IndentFlags::WroteMarkup + self.indent_stack.last().map_or(false, |&e| e == IndentFlags::WroteMarkup) } #[inline] fn set_wrote_text(&mut self) { - *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteText; + if let Some(e) = self.indent_stack.last_mut() { + *e = IndentFlags::WroteText; + } } #[inline] fn set_wrote_markup(&mut self) { - *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteMarkup; - } - - #[inline] - fn reset_state(&mut self) { - *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteNothing; + if let Some(e) = self.indent_stack.last_mut() { + *e = IndentFlags::WroteMarkup; + } } fn write_newline<W: Write>(&mut self, target: &mut W, level: usize) -> Result<()> { @@ -216,7 +205,7 @@ impl Emitter { self.before_markup(target)?; let result = { let mut write = move || { - write!(target, "<?xml version=\"{}\" encoding=\"{}\"", version, encoding)?; + write!(target, "<?xml version=\"{version}\" encoding=\"{encoding}\"")?; if let Some(standalone) = standalone { write!(target, " standalone=\"{}\"", if standalone { "yes" } else { "no" })?; @@ -260,11 +249,11 @@ impl Emitter { self.before_markup(target)?; let result = { - let mut write = || { - write!(target, "<?{}", name)?; + let mut write = move || { + write!(target, "<?{name}")?; if let Some(data) = data { - write!(target, " {}", data)?; + write!(target, " {data}")?; } write!(target, "?>")?; @@ -280,8 +269,8 @@ impl Emitter { } fn emit_start_element_initial<W>(&mut self, target: &mut W, - name: Name, - attributes: &[Attribute]) -> Result<()> + name: Name<'_>, + attributes: &[Attribute<'_>]) -> Result<()> where W: Write { self.check_document_started(target)?; @@ -295,8 +284,8 @@ impl Emitter { } pub fn emit_start_element<W>(&mut self, target: &mut W, - name: Name, - attributes: &[Attribute]) -> Result<()> + name: Name<'_>, + attributes: &[Attribute<'_>]) -> Result<()> where W: Write { if self.config.keep_element_names_stack { @@ -324,29 +313,31 @@ impl Emitter { //prefix if self.nst.get(prefix) == Some(uri) => Ok(()), // emit xmlns only if it is overridden NS_NO_PREFIX => if uri != NS_EMPTY_URI { - write!(target, " xmlns=\"{}\"", uri) + write!(target, " xmlns=\"{uri}\"") } else { Ok(()) }, // everything else - prefix => write!(target, " xmlns:{}=\"{}\"", prefix, uri) + prefix => write!(target, " xmlns:{prefix}=\"{uri}\"") }?; } Ok(()) } pub fn emit_attributes<W: Write>(&mut self, target: &mut W, - attributes: &[Attribute]) -> Result<()> { - for attr in attributes.iter() { - write!( - target, " {}=\"{}\"", - attr.name.repr_display(), - if self.config.perform_escaping { escape_str_attribute(attr.value) } else { Cow::Borrowed(attr.value) } - )? + attributes: &[Attribute<'_>]) -> Result<()> { + for attr in attributes.iter() { + write!(target, " {}=\"", attr.name.repr_display())?; + if self.config.perform_escaping { + write!(target, "{}", Escaped::<AttributeEscapes>::new(attr.value))?; + } else { + write!(target, "{}", attr.value)?; + } + write!(target, "\"")?; } Ok(()) } pub fn emit_end_element<W: Write>(&mut self, target: &mut W, - name: Option<Name>) -> Result<()> { + name: Option<Name<'_>>) -> Result<()> { let owned_name = if self.config.keep_element_names_stack { Some(self.element_names.pop().ok_or(EmitterError::LastElementNameNotAvailable)?) } else { @@ -403,13 +394,13 @@ impl Emitter { content: &str) -> Result<()> { self.check_document_started(target)?; self.fix_non_empty_element(target)?; - target.write_all( - (if self.config.perform_escaping { - escape_str_pcdata(content) - } else { - Cow::Borrowed(content) - }).as_bytes() - )?; + + if self.config.perform_escaping { + write!(target, "{}", Escaped::<PcDataEscapes>::new(content))?; + } else { + target.write_all(content.as_bytes())?; + } + self.after_text(); Ok(()) } @@ -420,7 +411,7 @@ impl Emitter { // TODO: add escaping dashes at the end of the comment let autopad_comments = self.config.autopad_comments; - let write = |target: &mut W| -> Result<()> { + let write = move |target: &mut W| -> Result<()> { target.write_all(b"<!--")?; if autopad_comments && !content.starts_with(char::is_whitespace) { diff --git a/src/writer/events.rs b/src/writer/events.rs index 1f7040f..af9f37c 100644 --- a/src/writer/events.rs +++ b/src/writer/events.rs @@ -2,16 +2,16 @@ use std::borrow::Cow; -use name::Name; -use attribute::Attribute; -use common::XmlVersion; -use namespace::{Namespace, NS_NO_PREFIX}; +use crate::attribute::Attribute; +use crate::common::XmlVersion; +use crate::name::Name; +use crate::namespace::{Namespace, NS_NO_PREFIX}; /// A part of an XML output stream. /// /// Objects of this enum are consumed by `EventWriter`. They correspond to different parts of /// an XML document. -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum XmlEvent<'a> { /// Corresponds to XML document declaration. /// @@ -32,7 +32,7 @@ pub enum XmlEvent<'a> { /// XML standalone declaration. /// /// Defaults to `None`. - standalone: Option<bool> + standalone: Option<bool>, }, /// Denotes an XML processing instruction. @@ -41,7 +41,7 @@ pub enum XmlEvent<'a> { name: &'a str, /// Processing instruction content. - data: Option<&'a str> + data: Option<&'a str>, }, /// Denotes a beginning of an XML element. @@ -71,7 +71,7 @@ pub enum XmlEvent<'a> { /// If `None`, then it is assumed that the element name should be the last valid one. /// If `Some` and element names tracking is enabled, then the writer will check it for /// correctness. - name: Option<Name<'a>> + name: Option<Name<'a>>, }, /// Denotes CDATA content. @@ -90,14 +90,15 @@ pub enum XmlEvent<'a> { /// /// Contents of this event will be escaped if `perform_escaping` option is enabled, /// that is, every character invalid for PCDATA will appear as a character entity. - Characters(&'a str) + Characters(&'a str), } impl<'a> XmlEvent<'a> { /// Returns an writer event for a processing instruction. #[inline] + #[must_use] pub fn processing_instruction(name: &'a str, data: Option<&'a str>) -> XmlEvent<'a> { - XmlEvent::ProcessingInstruction { name: name, data: data } + XmlEvent::ProcessingInstruction { name, data } } /// Returns a builder for a starting element. @@ -109,7 +110,7 @@ impl<'a> XmlEvent<'a> { StartElementBuilder { name: name.into(), attributes: Vec::new(), - namespace: Namespace::empty().into() + namespace: Namespace::empty(), } } @@ -119,6 +120,7 @@ impl<'a> XmlEvent<'a> { /// the writer is able to determine it automatically. However, when this functionality /// is disabled, it is possible to specify the name with `name()` method on the builder. #[inline] + #[must_use] pub fn end_element() -> EndElementBuilder<'a> { EndElementBuilder { name: None } } @@ -128,26 +130,37 @@ impl<'a> XmlEvent<'a> { /// Naturally, the provided string won't be escaped, except for closing CDATA token `]]>` /// (depending on the configuration). #[inline] - pub fn cdata(data: &'a str) -> XmlEvent<'a> { XmlEvent::CData(data) } + #[must_use] + pub fn cdata(data: &'a str) -> XmlEvent<'a> { + XmlEvent::CData(data) + } /// Returns a regular characters (PCDATA) event. /// /// All offending symbols, in particular, `&` and `<`, will be escaped by the writer. #[inline] - pub fn characters(data: &'a str) -> XmlEvent<'a> { XmlEvent::Characters(data) } + #[must_use] + pub fn characters(data: &'a str) -> XmlEvent<'a> { + XmlEvent::Characters(data) + } /// Returns a comment event. #[inline] - pub fn comment(data: &'a str) -> XmlEvent<'a> { XmlEvent::Comment(data) } + #[must_use] + pub fn comment(data: &'a str) -> XmlEvent<'a> { + XmlEvent::Comment(data) + } } impl<'a> From<&'a str> for XmlEvent<'a> { #[inline] - fn from(s: &'a str) -> XmlEvent<'a> { XmlEvent::Characters(s) } + fn from(s: &'a str) -> XmlEvent<'a> { + XmlEvent::Characters(s) + } } pub struct EndElementBuilder<'a> { - name: Option<Name<'a>> + name: Option<Name<'a>>, } /// A builder for a closing element event. @@ -175,7 +188,7 @@ impl<'a> From<EndElementBuilder<'a>> for XmlEvent<'a> { pub struct StartElementBuilder<'a> { name: Name<'a>, attributes: Vec<Attribute<'a>>, - namespace: Namespace + namespace: Namespace, } impl<'a> StartElementBuilder<'a> { @@ -210,6 +223,7 @@ impl<'a> StartElementBuilder<'a> { /// then another binding will be added as a part of this element attribute set, shadowing /// the outer binding. #[inline] + #[must_use] pub fn ns<S1, S2>(mut self, prefix: S1, uri: S2) -> StartElementBuilder<'a> where S1: Into<String>, S2: Into<String> { @@ -221,6 +235,7 @@ impl<'a> StartElementBuilder<'a> { /// /// Same rules as for `ns()` are also valid for the default namespace mapping. #[inline] + #[must_use] pub fn default_ns<S>(mut self, uri: S) -> StartElementBuilder<'a> where S: Into<String> { @@ -235,7 +250,7 @@ impl<'a> From<StartElementBuilder<'a>> for XmlEvent<'a> { XmlEvent::StartElement { name: b.name, attributes: Cow::Owned(b.attributes), - namespace: Cow::Owned(b.namespace) + namespace: Cow::Owned(b.namespace), } } } diff --git a/tests/documents/sample_1.xml b/tests/documents/sample_1.xml deleted file mode 100644 index 4d1cbc0..0000000 --- a/tests/documents/sample_1.xml +++ /dev/null @@ -1,34 +0,0 @@ -<?xml version="1.0" encoding="utf-8" standalone="yes"?> -<project name="project-name"> - <libraries> - <library groupId="org.example" artifactId="<name>" version="0.1"/> - <library groupId="com.example" artifactId=""cool-lib&" version="999"/> - </libraries> - <module name="module-1"> - <files> - <file name="somefile.java" type="java"> - Some <java> class - </file> - <file name="another_file.java" type="java"> - Another "java" class - </file> - <file name="config.xml" type="xml"> - Weird 'XML' config - </file> - </files> - <libraries> - <library groupId="junit" artifactId="junit" version="1.9.5"/> - </libraries> - </module> - <module name="module-2"> - <files> - <file name="program.js" type="javascript"> - JavaScript & program - </file> - <file name="style.css" type="css"> - Cascading style sheet: © - ҉ - </file> - </files> - </module> -</project> - diff --git a/tests/documents/sample_1_full.txt b/tests/documents/sample_1_full.txt deleted file mode 100644 index a8d64d0..0000000 --- a/tests/documents/sample_1_full.txt +++ /dev/null @@ -1,58 +0,0 @@ -StartDocument(1.0, utf-8) -StartElement(project [name="project-name"]) -Whitespace("\n ") -StartElement(libraries) -Whitespace("\n ") -StartElement(library [groupId="org.example", artifactId="<name>", version="0.1"]) -EndElement(library) -Whitespace("\n ") -StartElement(library [groupId="com.example", artifactId="\"cool-lib&", version="999"]) -EndElement(library) -Whitespace("\n ") -EndElement(libraries) -Whitespace("\n ") -StartElement(module [name="module-1"]) -Whitespace("\n ") -StartElement(files) -Whitespace("\n ") -StartElement(file [name="somefile.java", type="java"]) -Characters("\n Some <java> class\n ") -EndElement(file) -Whitespace("\n ") -StartElement(file [name="another_file.java", type="java"]) -Characters("\n Another \"java\" class\n ") -EndElement(file) -Whitespace("\n ") -StartElement(file [name="config.xml", type="xml"]) -Characters("\n Weird \'XML\' config\n ") -EndElement(file) -Whitespace("\n ") -EndElement(files) -Whitespace("\n ") -StartElement(libraries) -Whitespace("\n ") -StartElement(library [groupId="junit", artifactId="junit", version="1.9.5"]) -EndElement(library) -Whitespace("\n ") -EndElement(libraries) -Whitespace("\n ") -EndElement(module) -Whitespace("\n ") -StartElement(module [name="module-2"]) -Whitespace("\n ") -StartElement(files) -Whitespace("\n ") -StartElement(file [name="program.js", type="javascript"]) -Characters("\n JavaScript & program\n ") -EndElement(file) -Whitespace("\n ") -StartElement(file [name="style.css", type="css"]) -Characters("\n Cascading style sheet: © - ҉\n ") -EndElement(file) -Whitespace("\n ") -EndElement(files) -Whitespace("\n ") -EndElement(module) -Whitespace("\n") -EndElement(project) -EndDocument diff --git a/tests/documents/sample_1_short.txt b/tests/documents/sample_1_short.txt deleted file mode 100644 index 4dbe285..0000000 --- a/tests/documents/sample_1_short.txt +++ /dev/null @@ -1,37 +0,0 @@ -StartDocument(1.0, utf-8) -StartElement(project [name="project-name"]) -StartElement(libraries) -StartElement(library [groupId="org.example", artifactId="<name>", version="0.1"]) -EndElement(library) -StartElement(library [groupId="com.example", artifactId="\"cool-lib&", version="999"]) -EndElement(library) -EndElement(libraries) -StartElement(module [name="module-1"]) -StartElement(files) -StartElement(file [name="somefile.java", type="java"]) -Characters("Some <java> class") -EndElement(file) -StartElement(file [name="another_file.java", type="java"]) -Characters("Another \"java\" class") -EndElement(file) -StartElement(file [name="config.xml", type="xml"]) -Characters("Weird \'XML\' config") -EndElement(file) -EndElement(files) -StartElement(libraries) -StartElement(library [groupId="junit", artifactId="junit", version="1.9.5"]) -EndElement(library) -EndElement(libraries) -EndElement(module) -StartElement(module [name="module-2"]) -StartElement(files) -StartElement(file [name="program.js", type="javascript"]) -Characters("JavaScript & program") -EndElement(file) -StartElement(file [name="style.css", type="css"]) -Characters("Cascading style sheet: © - ҉") -EndElement(file) -EndElement(files) -EndElement(module) -EndElement(project) -EndDocument diff --git a/tests/documents/sample_2.xml b/tests/documents/sample_2.xml deleted file mode 100644 index f9543ac..0000000 --- a/tests/documents/sample_2.xml +++ /dev/null @@ -1,15 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<p:data xmlns:d="urn:example:double" xmlns:h="urn:example:header" xmlns:p="urn:example:namespace"> - <p:datum id="34"> - <p:name>Name</p:name> - <d:name>Another name</d:name> - <d:arg>0.3</d:arg> - <d:arg>0.2</d:arg> - <p:arg>0.1</p:arg> - <p:arg>0.01</p:arg> - <h:header name="Header-1">header 1 value</h:header> - <h:header name="Header-2"> - Some bigger value - </h:header> - </p:datum> -</p:data> diff --git a/tests/documents/sample_2_full.txt b/tests/documents/sample_2_full.txt deleted file mode 100644 index 75075cd..0000000 --- a/tests/documents/sample_2_full.txt +++ /dev/null @@ -1,41 +0,0 @@ -StartDocument(1.0, utf-8) -StartElement({urn:example:namespace}p:data) -Whitespace("\n ") -StartElement({urn:example:namespace}p:datum [id="34"]) -Whitespace("\n ") -StartElement({urn:example:namespace}p:name) -Characters("Name") -EndElement({urn:example:namespace}p:name) -Whitespace("\n ") -StartElement({urn:example:double}d:name) -Characters("Another name") -EndElement({urn:example:double}d:name) -Whitespace("\n ") -StartElement({urn:example:double}d:arg) -Characters("0.3") -EndElement({urn:example:double}d:arg) -Whitespace("\n ") -StartElement({urn:example:double}d:arg) -Characters("0.2") -EndElement({urn:example:double}d:arg) -Whitespace("\n ") -StartElement({urn:example:namespace}p:arg) -Characters("0.1") -EndElement({urn:example:namespace}p:arg) -Whitespace("\n ") -StartElement({urn:example:namespace}p:arg) -Characters("0.01") -EndElement({urn:example:namespace}p:arg) -Whitespace("\n ") -StartElement({urn:example:header}h:header [name="Header-1"]) -Characters("header 1 value") -EndElement({urn:example:header}h:header) -Whitespace("\n ") -StartElement({urn:example:header}h:header [name="Header-2"]) -Characters("\n Some bigger value\n ") -EndElement({urn:example:header}h:header) -Whitespace("\n ") -EndElement({urn:example:namespace}p:datum) -Whitespace("\n") -EndElement({urn:example:namespace}p:data) -EndDocument diff --git a/tests/documents/sample_2_short.txt b/tests/documents/sample_2_short.txt deleted file mode 100644 index 2368025..0000000 --- a/tests/documents/sample_2_short.txt +++ /dev/null @@ -1,30 +0,0 @@ -StartDocument(1.0, utf-8) -StartElement({urn:example:namespace}p:data) -StartElement({urn:example:namespace}p:datum [id="34"]) -StartElement({urn:example:namespace}p:name) -Characters("Name") -EndElement({urn:example:namespace}p:name) -StartElement({urn:example:double}d:name) -Characters("Another name") -EndElement({urn:example:double}d:name) -StartElement({urn:example:double}d:arg) -Characters("0.3") -EndElement({urn:example:double}d:arg) -StartElement({urn:example:double}d:arg) -Characters("0.2") -EndElement({urn:example:double}d:arg) -StartElement({urn:example:namespace}p:arg) -Characters("0.1") -EndElement({urn:example:namespace}p:arg) -StartElement({urn:example:namespace}p:arg) -Characters("0.01") -EndElement({urn:example:namespace}p:arg) -StartElement({urn:example:header}h:header [name="Header-1"]) -Characters("header 1 value") -EndElement({urn:example:header}h:header) -StartElement({urn:example:header}h:header [name="Header-2"]) -Characters("Some bigger value") -EndElement({urn:example:header}h:header) -EndElement({urn:example:namespace}p:datum) -EndElement({urn:example:namespace}p:data) -EndDocument diff --git a/tests/documents/sample_3.xml b/tests/documents/sample_3.xml deleted file mode 100644 index 657e37d..0000000 --- a/tests/documents/sample_3.xml +++ /dev/null @@ -1,13 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<p:data xmlns:p="urn:x" z=">"> - <!-- abcd < > & --> - <a>test</a> - <b>kkss" = ddd' ></b> - <![CDATA[ - <a>ddddd</b>!e3--><!-- ddckx - ]]> - <c/> - <![CDATA[ - <![CDATA[zzzz]]]]><![CDATA[>]]> -</p:data> - diff --git a/tests/documents/sample_3_full.txt b/tests/documents/sample_3_full.txt deleted file mode 100644 index e9a0f7e..0000000 --- a/tests/documents/sample_3_full.txt +++ /dev/null @@ -1,23 +0,0 @@ -1:1 StartDocument(1.0, utf-8) -2:1 StartElement({urn:x}p:data [z=">"]) -2:31 Whitespace("\n ") -3:5 Comment(" abcd < > & ") -3:34 Whitespace("\n ") -4:5 StartElement(a) -4:8 Characters("test") -4:12 EndElement(a) -4:16 Whitespace("\n ") -5:5 StartElement(b) -5:8 Characters("kkss\" = ddd\' >") -5:22 EndElement(b) -5:26 Whitespace("\n ") -6:5 CData("\n <a>ddddd</b>!e3--><!-- ddckx\n ") -8:8 Characters("\n ") -9:5 StartElement(c) -9:5 EndElement(c) -9:9 Whitespace("\n ") -10:5 CData("\n <![CDATA[zzzz]]") -11:23 CData(">") -11:36 Characters("\n") -12:1 EndElement({urn:x}p:data) -14:1 EndDocument diff --git a/tests/documents/sample_3_short.txt b/tests/documents/sample_3_short.txt deleted file mode 100644 index 2582f33..0000000 --- a/tests/documents/sample_3_short.txt +++ /dev/null @@ -1,14 +0,0 @@ -1:1 StartDocument(1.0, utf-8) -2:1 StartElement({urn:x}p:data [z=">"]) -4:5 StartElement(a) -4:8 Characters("test") -4:12 EndElement(a) -5:5 StartElement(b) -5:8 Characters("kkss\" = ddd\' >") -5:22 EndElement(b) -6:5 Characters("<a>ddddd</b>!e3--><!-- ddckx") -9:5 StartElement(c) -9:5 EndElement(c) -10:5 Characters("<![CDATA[zzzz]]>") -12:1 EndElement({urn:x}p:data) -14:1 EndDocument diff --git a/tests/documents/sample_4.xml b/tests/documents/sample_4.xml deleted file mode 100644 index fb915ff..0000000 --- a/tests/documents/sample_4.xml +++ /dev/null @@ -1,15 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<!DOCTYPE data SYSTEM "abcd.dtd"> -<p:data xmlns:p="urn:x" z=">"> - <!-- abcd < > & --> - <a>test</a> - <b>kkss" = ddd' ></b> - <![CDATA[ - <a>ddddd</b>!e3--><!-- ddckx - ]]> - <c/> - <![CDATA[ - <![CDATA[zzzz]]]]><![CDATA[>]]> -</p:data> - - diff --git a/tests/documents/sample_4_full.txt b/tests/documents/sample_4_full.txt deleted file mode 100644 index 4bdadfb..0000000 --- a/tests/documents/sample_4_full.txt +++ /dev/null @@ -1,23 +0,0 @@ -StartDocument(1.0, utf-8) -StartElement({urn:x}p:data [z=">"]) -Whitespace("\n ") -Comment(" abcd < > & ") -Whitespace("\n ") -StartElement(a) -Characters("test") -EndElement(a) -Whitespace("\n ") -StartElement(b) -Characters("kkss\" = ddd\' >") -EndElement(b) -Whitespace("\n ") -CData("\n <a>ddddd</b>!e3--><!-- ddckx\n ") -Characters("\n ") -StartElement(c) -EndElement(c) -Whitespace("\n ") -CData("\n <![CDATA[zzzz]]") -CData(">") -Characters("\n") -EndElement({urn:x}p:data) -EndDocument diff --git a/tests/documents/sample_4_short.txt b/tests/documents/sample_4_short.txt deleted file mode 100644 index 52e4b83..0000000 --- a/tests/documents/sample_4_short.txt +++ /dev/null @@ -1,14 +0,0 @@ -StartDocument(1.0, utf-8) -StartElement({urn:x}p:data [z=">"]) -StartElement(a) -Characters("test") -EndElement(a) -StartElement(b) -Characters("kkss\" = ddd\' >") -EndElement(b) -Characters("<a>ddddd</b>!e3--><!-- ddckx") -StartElement(c) -EndElement(c) -Characters("<![CDATA[zzzz]]>") -EndElement({urn:x}p:data) -EndDocument diff --git a/tests/documents/sample_5.xml b/tests/documents/sample_5.xml deleted file mode 100644 index 92aa31d..0000000 --- a/tests/documents/sample_5.xml +++ /dev/null @@ -1,7 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<!DOCTYPE data SYSTEM "abcd.dtd"> -<p> - <a>test ©≂̸</a> -</p> - - diff --git a/tests/documents/sample_5_short.txt b/tests/documents/sample_5_short.txt deleted file mode 100644 index 3079811..0000000 --- a/tests/documents/sample_5_short.txt +++ /dev/null @@ -1,7 +0,0 @@ -StartDocument(1.0, utf-8) -StartElement(p) -StartElement(a) -Characters("test ©≂̸") -EndElement(a) -EndElement(p) -EndDocument diff --git a/tests/documents/sample_6.xml b/tests/documents/sample_6.xml deleted file mode 100644 index 943c02d..0000000 --- a/tests/documents/sample_6.xml +++ /dev/null @@ -1,4 +0,0 @@ -<?xml version="1.0"?> -<?xml-stylesheet href="doc.xsl"?> - -<doc>Hello</doc> diff --git a/tests/documents/sample_6_full.txt b/tests/documents/sample_6_full.txt deleted file mode 100644 index debb366..0000000 --- a/tests/documents/sample_6_full.txt +++ /dev/null @@ -1,8 +0,0 @@ -StartDocument(1.0, UTF-8) -Whitespace("\n") -ProcessingInstruction(xml-stylesheet="href=\"doc.xsl\"") -Whitespace("\n\n") -StartElement(doc) -Characters("Hello") -EndElement(doc) -EndDocument diff --git a/tests/event_reader.rs b/tests/event_reader.rs deleted file mode 100644 index 750dcc4..0000000 --- a/tests/event_reader.rs +++ /dev/null @@ -1,587 +0,0 @@ -#![forbid(unsafe_code)] - -extern crate xml; -#[macro_use] -extern crate lazy_static; - -use std::env; -use std::fmt; -use std::fs::File; -use std::io::{BufRead, BufReader, Write, stderr}; -use std::path::Path; - -use xml::name::OwnedName; -use xml::common::Position; -use xml::reader::{Result, XmlEvent, ParserConfig, EventReader}; - -/// Dummy function that opens a file, parses it, and returns a `Result`. -/// There can be IO errors (from `File::open`) and XML errors (from the parser). -/// Having `impl From<std::io::Error> for xml::reader::Error` allows the user to -/// do this without defining their own error type. -#[allow(dead_code)] -fn count_event_in_file(name: &Path) -> Result<usize> { - let mut event_count = 0; - for event in EventReader::new(BufReader::new(try!(File::open(name)))) { - try!(event); - event_count += 1; - } - Ok(event_count) -} - -#[test] -fn sample_1_short() { - test( - include_bytes!("documents/sample_1.xml"), - include_bytes!("documents/sample_1_short.txt"), - ParserConfig::new() - .ignore_comments(true) - .whitespace_to_characters(true) - .cdata_to_characters(true) - .trim_whitespace(true) - .coalesce_characters(true), - false - ); -} - -#[test] -fn sample_1_full() { - test( - include_bytes!("documents/sample_1.xml"), - include_bytes!("documents/sample_1_full.txt"), - ParserConfig::new() - .ignore_comments(false) - .whitespace_to_characters(false) - .cdata_to_characters(false) - .trim_whitespace(false) - .coalesce_characters(false), - false - ); -} - -#[test] -fn sample_2_short() { - test( - include_bytes!("documents/sample_2.xml"), - include_bytes!("documents/sample_2_short.txt"), - ParserConfig::new() - .ignore_comments(true) - .whitespace_to_characters(true) - .cdata_to_characters(true) - .trim_whitespace(true) - .coalesce_characters(true), - false - ); -} - -#[test] -fn sample_2_full() { - test( - include_bytes!("documents/sample_2.xml"), - include_bytes!("documents/sample_2_full.txt"), - ParserConfig::new() - .ignore_comments(false) - .whitespace_to_characters(false) - .cdata_to_characters(false) - .trim_whitespace(false) - .coalesce_characters(false), - false - ); -} - -#[test] -fn sample_3_short() { - test( - include_bytes!("documents/sample_3.xml"), - include_bytes!("documents/sample_3_short.txt"), - ParserConfig::new() - .ignore_comments(true) - .whitespace_to_characters(true) - .cdata_to_characters(true) - .trim_whitespace(true) - .coalesce_characters(true), - true - ); -} - -#[test] -fn sample_3_full() { - test( - include_bytes!("documents/sample_3.xml"), - include_bytes!("documents/sample_3_full.txt"), - ParserConfig::new() - .ignore_comments(false) - .whitespace_to_characters(false) - .cdata_to_characters(false) - .trim_whitespace(false) - .coalesce_characters(false), - true - ); -} - -#[test] -fn sample_4_short() { - test( - include_bytes!("documents/sample_4.xml"), - include_bytes!("documents/sample_4_short.txt"), - ParserConfig::new() - .ignore_comments(true) - .whitespace_to_characters(true) - .cdata_to_characters(true) - .trim_whitespace(true) - .coalesce_characters(true), - false - ); -} - -#[test] -fn sample_4_full() { - test( - include_bytes!("documents/sample_4.xml"), - include_bytes!("documents/sample_4_full.txt"), - ParserConfig::new() - .ignore_comments(false) - .whitespace_to_characters(false) - .cdata_to_characters(false) - .trim_whitespace(false) - .coalesce_characters(false), - false - ); - -} - -#[test] -fn sample_5_short() { - test( - include_bytes!("documents/sample_5.xml"), - include_bytes!("documents/sample_5_short.txt"), - ParserConfig::new() - .ignore_comments(true) - .whitespace_to_characters(true) - .cdata_to_characters(true) - .trim_whitespace(true) - .coalesce_characters(true) - .add_entity("nbsp", " ") - .add_entity("copy", "©") - .add_entity("NotEqualTilde", "≂̸"), - false - ); -} - -#[test] -fn sample_6_full() { - test( - include_bytes!("documents/sample_6.xml"), - include_bytes!("documents/sample_6_full.txt"), - ParserConfig::new() - .ignore_root_level_whitespace(false) - .ignore_comments(false) - .whitespace_to_characters(false) - .cdata_to_characters(false) - .trim_whitespace(false) - .coalesce_characters(false), - false - ); -} - -#[test] -fn eof_1() { - test( - br#"<?xml"#, - br#"1:6 Unexpected end of stream: no root element found"#, - ParserConfig::new(), - false - ); -} - -#[test] -fn bad_1() { - test( - br#"<?xml&.,"#, - br#"1:6 Unexpected token: <?xml&"#, - ParserConfig::new(), - false - ); -} - -#[test] -fn dashes_in_comments() { - test( - br#"<!-- comment -- --><hello/>"#, - br#" - |1:14 Unexpected token '--' before ' ' - "#, - ParserConfig::new(), - false - ); - - test( - br#"<!-- comment ---><hello/>"#, - br#" - |1:14 Unexpected token '--' before '-' - "#, - ParserConfig::new(), - false - ); -} - -#[test] -fn tabs_1() { - test( - b"\t<a>\t<b/></a>", - br#" - |1:2 StartDocument(1.0, UTF-8) - |1:2 StartElement(a) - |1:6 StartElement(b) - |1:6 EndElement(b) - |1:10 EndElement(a) - |1:14 EndDocument - "#, - ParserConfig::new() - .trim_whitespace(true), - true - ); -} - -#[test] -fn issue_32_unescaped_cdata_end() { - test( - br#"<hello>]]></hello>"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |Characters("]]>") - |EndElement(hello) - |EndDocument - "#, - ParserConfig::new(), - false - ); -} - -#[test] -fn issue_unescaped_processing_instruction_end() { - test( - br#"<hello>?></hello>"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |Characters("?>") - |EndElement(hello) - |EndDocument - "#, - ParserConfig::new(), - false - ); -} - -#[test] -fn issue_unescaped_empty_tag_end() { - test( - br#"<hello>/></hello>"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |Characters("/>") - |EndElement(hello) - |EndDocument - "#, - ParserConfig::new(), - false - ); -} - -#[test] -fn issue_83_duplicate_attributes() { - test( - br#"<hello><some-tag a='10' a="20"></hello>"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |1:30 Attribute 'a' is redefined - "#, - ParserConfig::new(), - false - ); -} - -#[test] -fn issue_93_large_characters_in_entity_references() { - test( - r#"<hello>&𤶼;</hello>"#.as_bytes(), - r#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |1:10 Unexpected entity: 𤶼 - "#.as_bytes(), // FIXME: it shouldn't be 10, looks like indices are off slightly - ParserConfig::new(), - false - ) -} - -#[test] -fn issue_98_cdata_ending_with_right_bracket() { - test( - br#"<hello><![CDATA[Foo [Bar]]]></hello>"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |CData("Foo [Bar]") - |EndElement(hello) - |EndDocument - "#, - ParserConfig::new(), - false - ) -} - -#[test] -fn issue_105_unexpected_double_dash() { - test( - br#"<hello>-- </hello>"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |Characters("-- ") - |EndElement(hello) - |EndDocument - "#, - ParserConfig::new(), - false - ); - - test( - br#"<hello>--</hello>"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |Characters("--") - |EndElement(hello) - |EndDocument - "#, - ParserConfig::new(), - false - ); - - test( - br#"<hello>--></hello>"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |Characters("-->") - |EndElement(hello) - |EndDocument - "#, - ParserConfig::new(), - false - ); - - test( - br#"<hello><![CDATA[--]]></hello>"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(hello) - |CData("--") - |EndElement(hello) - |EndDocument - "#, - ParserConfig::new(), - false - ); -} - -#[test] -fn issue_attribues_have_no_default_namespace () { - test( - br#"<hello xmlns="urn:foo" x="y"/>"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement({urn:foo}hello [x="y"]) - |EndElement({urn:foo}hello) - |EndDocument - "#, - ParserConfig::new(), - false - ); -} - -#[test] -fn issue_replacement_character_entity_reference() { - test( - br#"<doc>��</doc>"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(doc) - |1:13 Invalid decimal character number in an entity: #55357 - "#, - ParserConfig::new(), - false, - ); - - test( - br#"<doc>��</doc>"#, - br#" - |StartDocument(1.0, UTF-8) - |StartElement(doc) - |1:13 Invalid hexadecimal character number in an entity: #xd83d - "#, - ParserConfig::new(), - false, - ); - - test( - br#"<doc>��</doc>"#, - format!( - r#" - |StartDocument(1.0, UTF-8) - |StartElement(doc) - |Characters("{replacement_character}{replacement_character}") - |EndElement(doc) - |EndDocument - "#, - replacement_character = "\u{fffd}" - ) - .as_bytes(), - ParserConfig::new() - .replace_unknown_entity_references(true), - false, - ); - - test( - br#"<doc>��</doc>"#, - format!( - r#" - |StartDocument(1.0, UTF-8) - |StartElement(doc) - |Characters("{replacement_character}{replacement_character}") - |EndElement(doc) - |EndDocument - "#, - replacement_character = "\u{fffd}" - ) - .as_bytes(), - ParserConfig::new() - .replace_unknown_entity_references(true), - false, - ); -} - -lazy_static! { - // If PRINT_SPEC env variable is set, print the lines - // to stderr instead of comparing with the output - // it can be used like this: - // PRINT_SPEC=1 cargo test --test event_reader sample_1_full 2> sample_1_full.txt - static ref PRINT: bool = { - for (key, value) in env::vars() { - if key == "PRINT_SPEC" && value == "1" { - return true; - } - } - false - }; -} - -// clones a lot but that's fine -fn trim_until_bar(s: String) -> String { - match s.trim() { - ts if ts.starts_with('|') => return ts[1..].to_owned(), - _ => {} - } - s -} - -fn test(input: &[u8], output: &[u8], config: ParserConfig, test_position: bool) { - let mut reader = config.create_reader(input); - let mut spec_lines = BufReader::new(output).lines() - .map(|line| line.unwrap()) - .enumerate() - .map(|(i, line)| (i, trim_until_bar(line))) - .filter(|&(_, ref line)| !line.trim().is_empty()); - - loop { - let e = reader.next(); - let line = - if test_position { - format!("{} {}", reader.position(), Event(&e)) - } else { - format!("{}", Event(&e)) - }; - - if *PRINT { - writeln!(&mut stderr(), "{}", line).unwrap(); - } else { - if let Some((n, spec)) = spec_lines.next() { - if line != spec { - const SPLITTER: &'static str = "-------------------"; - panic!("\n{}\nUnexpected event at line {}:\nExpected: {}\nFound: {}\n{}\n", - SPLITTER, n + 1, spec, line, std::str::from_utf8(output).unwrap()); - } - } else { - panic!("Unexpected event: {}", line); - } - } - - match e { - Ok(XmlEvent::EndDocument) | Err(_) => break, - _ => {}, - } - } -} - -// Here we define our own string representation of events so we don't depend -// on the specifics of Display implementation for XmlEvent and OwnedName. - -struct Name<'a>(&'a OwnedName); - -impl <'a> fmt::Display for Name<'a> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - if let Some(ref namespace) = self.0.namespace { - try! { write!(f, "{{{}}}", namespace) } - } - - if let Some(ref prefix) = self.0.prefix { - try! { write!(f, "{}:", prefix) } - } - - write!(f, "{}", self.0.local_name) - } -} - -struct Event<'a>(&'a Result<XmlEvent>); - -impl<'a> fmt::Display for Event<'a> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let empty = String::new(); - match *self.0 { - Ok(ref e) => match *e { - XmlEvent::StartDocument { ref version, ref encoding, .. } => - write!(f, "StartDocument({}, {})", version, encoding), - XmlEvent::EndDocument => - write!(f, "EndDocument"), - XmlEvent::ProcessingInstruction { ref name, ref data } => - write!(f, "ProcessingInstruction({}={:?})", name, - data.as_ref().unwrap_or(&empty)), - XmlEvent::StartElement { ref name, ref attributes, .. } => { - if attributes.is_empty() { - write!(f, "StartElement({})", Name(name)) - } - else { - let attrs: Vec<_> = attributes.iter() - .map(|a| format!("{}={:?}", Name(&a.name), a.value)) .collect(); - write!(f, "StartElement({} [{}])", Name(name), attrs.join(", ")) - } - }, - XmlEvent::EndElement { ref name } => - write!(f, "EndElement({})", Name(name)), - XmlEvent::Comment(ref data) => - write!(f, r#"Comment("{}")"#, data.escape_debug()), - XmlEvent::CData(ref data) => - write!(f, r#"CData("{}")"#, data.escape_debug()), - XmlEvent::Characters(ref data) => - write!(f, r#"Characters("{}")"#, data.escape_debug()), - XmlEvent::Whitespace(ref data) => - write!(f, r#"Whitespace("{}")"#, data.escape_debug()), - }, - Err(ref e) => e.fmt(f), - } - } -} diff --git a/tests/event_writer.rs b/tests/event_writer.rs deleted file mode 100644 index dd64a43..0000000 --- a/tests/event_writer.rs +++ /dev/null @@ -1,269 +0,0 @@ -#![forbid(unsafe_code)] - -extern crate xml; - -use std::io::{BufReader, SeekFrom}; -use std::io::prelude::*; -use std::fs::File; -use std::str; - -use xml::reader::EventReader; -use xml::writer::EmitterConfig; - -macro_rules! unwrap_all { - ($($e:expr);+) => {{ - $($e.unwrap();)+ - }} -} - -#[test] -fn reading_writing_equal_with_namespaces() { - let mut f = File::open("tests/documents/sample_2.xml").unwrap(); - let mut b = Vec::new(); - - { - let r = EventReader::new(BufReader::new(&mut f)); - let mut w = EmitterConfig::default().perform_indent(true).create_writer(&mut b); - - for e in r { - match e { - Ok(e) => if let Some(e) = e.as_writer_event() { - match w.write(e) { - Ok(_) => {}, - Err(e) => panic!("Writer error: {:?}", e) - } - }, - Err(e) => panic!("Error: {}", e) - } - } - } - - f.seek(SeekFrom::Start(0)).unwrap(); - let mut fs = String::new(); - f.read_to_string(&mut fs).unwrap(); - - let bs = String::from_utf8(b).unwrap(); - - assert_eq!(fs.trim(), bs.trim()); -} - -#[test] -fn writing_simple() { - use xml::writer::XmlEvent; - - let mut b = Vec::new(); - - { - let mut w = EmitterConfig::new().write_document_declaration(false).create_writer(&mut b); - - w.write(XmlEvent::start_element("h:hello").ns("h", "urn:hello-world")).unwrap(); - w.write("hello world").unwrap(); - w.write(XmlEvent::end_element()).unwrap(); - } - - assert_eq!( - str::from_utf8(&b).unwrap(), - r#"<h:hello xmlns:h="urn:hello-world">hello world</h:hello>"# - ); -} - -#[test] -fn writing_empty_elements_with_normalizing() { - use xml::writer::XmlEvent; - - let mut b = Vec::new(); - - { - let mut w = EmitterConfig::new().write_document_declaration(false).create_writer(&mut b); - - unwrap_all! { - w.write(XmlEvent::start_element("hello")); - w.write(XmlEvent::start_element("world")); - w.write(XmlEvent::end_element()); - w.write(XmlEvent::end_element()) - } - } - - assert_eq!(str::from_utf8(&b).unwrap(), r#"<hello><world /></hello>"#); -} - -#[test] -fn writing_empty_elements_without_normalizing() { - use xml::writer::XmlEvent; - - let mut b = Vec::new(); - - { - let mut w = EmitterConfig::new() - .write_document_declaration(false) - .normalize_empty_elements(false) - .create_writer(&mut b); - - unwrap_all! { - w.write(XmlEvent::start_element("hello")); - w.write(XmlEvent::start_element("world")); - w.write(XmlEvent::end_element()); - w.write(XmlEvent::end_element()) - } - } - - assert_eq!(str::from_utf8(&b).unwrap(), r#"<hello><world></world></hello>"#); -} - -#[test] -fn writing_empty_elements_without_pad_self_closing() { - use xml::writer::XmlEvent; - - let mut b = Vec::new(); - - { - let mut w = EmitterConfig::new() - .write_document_declaration(false) - .pad_self_closing(false) - .create_writer(&mut b); - - unwrap_all! { - w.write(XmlEvent::start_element("hello")); - w.write(XmlEvent::start_element("world")); - w.write(XmlEvent::end_element()); - w.write(XmlEvent::end_element()) - } - } - - assert_eq!(str::from_utf8(&b).unwrap(), r#"<hello><world/></hello>"#); -} -#[test] -fn writing_empty_elements_pad_self_closing_explicit() { - use xml::writer::XmlEvent; - - let mut b = Vec::new(); - - { - let mut w = EmitterConfig::new() - .write_document_declaration(false) - .pad_self_closing(true) - .create_writer(&mut b); - - unwrap_all! { - w.write(XmlEvent::start_element("hello")); - w.write(XmlEvent::start_element("world")); - w.write(XmlEvent::end_element()); - w.write(XmlEvent::end_element()) - } - } - - assert_eq!(str::from_utf8(&b).unwrap(), r#"<hello><world /></hello>"#); -} - -#[test] -fn writing_comments_with_indentation() { - use xml::writer::XmlEvent; - - let mut b = Vec::new(); - - { - let mut w = EmitterConfig::new() - .write_document_declaration(false) - .perform_indent(true) - .create_writer(&mut b); - - unwrap_all! { - w.write(XmlEvent::start_element("hello")); - w.write(XmlEvent::start_element("world")); - w.write(XmlEvent::comment(" this is a manually padded comment\t")); - w.write(XmlEvent::comment("this is an unpadded comment")); - w.write(XmlEvent::end_element()); - w.write(XmlEvent::end_element()) - } - } - - assert_eq!( - str::from_utf8(&b).unwrap(), - "<hello> - <world> - <!-- this is a manually padded comment\t--> - <!-- this is an unpadded comment --> - </world> -</hello>"); -} - -#[test] -fn issue_112_overriding_namepace_prefix() { - use xml::writer::XmlEvent; - - let mut b = Vec::new(); - - { - let mut w = EmitterConfig::new() - .write_document_declaration(false) - .create_writer(&mut b); - - unwrap_all! { - w.write(XmlEvent::start_element("iq").ns("", "jabber:client").ns("a", "urn:A")); - w.write(XmlEvent::start_element("bind").ns("", "urn:ietf:params:xml:ns:xmpp-bind")); - w.write(XmlEvent::end_element()); - w.write(XmlEvent::start_element("whatever").ns("a", "urn:X")); - w.write(XmlEvent::end_element()); - w.write(XmlEvent::end_element()) - } - } - - assert_eq!( - str::from_utf8(&b).unwrap(), - r#"<iq xmlns="jabber:client" xmlns:a="urn:A"><bind xmlns="urn:ietf:params:xml:ns:xmpp-bind" /><whatever xmlns:a="urn:X" /></iq>"# - ) -} - -#[test] -fn attribute_escaping() { - use xml::writer::XmlEvent; - - let mut b = Vec::new(); - - { - let mut w = EmitterConfig::new() - .write_document_declaration(false) - .perform_indent(true) - .create_writer(&mut b); - - unwrap_all! { - w.write( - XmlEvent::start_element("hello") - .attr("testLt", "<") - .attr("testGt", ">") - ); - w.write(XmlEvent::end_element()); - w.write( - XmlEvent::start_element("hello") - .attr("testQuot", "\"") - .attr("testApos", "\'") - ); - w.write(XmlEvent::end_element()); - w.write( - XmlEvent::start_element("hello") - .attr("testAmp", "&") - ); - w.write(XmlEvent::end_element()); - w.write( - XmlEvent::start_element("hello") - .attr("testNl", "\n") - .attr("testCr", "\r") - ); - w.write(XmlEvent::end_element()); - w.write( - XmlEvent::start_element("hello") - .attr("testNl", "\\n") - .attr("testCr", "\\r") - ); - w.write(XmlEvent::end_element()) - } - } - assert_eq!( - str::from_utf8(&b).unwrap(), - "<hello testLt=\"<\" testGt=\">\" /> -<hello testQuot=\""\" testApos=\"'\" /> -<hello testAmp=\"&\" /> -<hello testNl=\"
\" testCr=\"
\" /> -<hello testNl=\"\\n\" testCr=\"\\r\" />" - ); -}
\ No newline at end of file diff --git a/tests/streaming.rs b/tests/streaming.rs deleted file mode 100644 index a577a00..0000000 --- a/tests/streaming.rs +++ /dev/null @@ -1,103 +0,0 @@ -#![forbid(unsafe_code)] - -extern crate xml; - -use std::io::{Cursor, Write}; - -use xml::EventReader; -use xml::reader::ParserConfig; -use xml::reader::XmlEvent; - -macro_rules! assert_match { - ($actual:expr, $expected:pat) => { - match $actual { - $expected => {}, - _ => panic!("assertion failed: `(left matches right)` \ - (left: `{:?}`, right: `{}`", $actual, stringify!($expected)) - } - }; - ($actual:expr, $expected:pat if $guard:expr) => { - match $actual { - $expected if $guard => {}, - _ => panic!("assertion failed: `(left matches right)` \ - (left: `{:?}`, right: `{} if {}`", - $actual, stringify!($expected), stringify!($guard)) - } - } -} - -fn write_and_reset_position<W>(c: &mut Cursor<W>, data: &[u8]) where Cursor<W>: Write { - let p = c.position(); - c.write_all(data).unwrap(); - c.set_position(p); -} - -#[test] -fn reading_streamed_content() { - let buf = Cursor::new(b"<root>".to_vec()); - let reader = EventReader::new(buf); - - let mut it = reader.into_iter(); - - assert_match!(it.next(), Some(Ok(XmlEvent::StartDocument { .. }))); - assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "root"); - - write_and_reset_position(it.source_mut(), b"<child-1>content</child-1>"); - assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-1"); - assert_match!(it.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content"); - assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-1"); - - write_and_reset_position(it.source_mut(), b"<child-2/>"); - assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-2"); - assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-2"); - - write_and_reset_position(it.source_mut(), b"<child-3/>"); - assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-3"); - assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-3"); - // doesn't seem to work because of how tags parsing is done -// write_and_reset_position(it.source_mut(), b"some text"); - // assert_match!(it.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "some text"); - - write_and_reset_position(it.source_mut(), b"</root>"); - assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "root"); - assert_match!(it.next(), Some(Ok(XmlEvent::EndDocument))); - assert_match!(it.next(), None); -} - -#[test] -fn reading_streamed_content2() { - let buf = Cursor::new(b"<root>".to_vec()); - let mut config = ParserConfig::new(); - config.ignore_end_of_stream = true; - let readerb = EventReader::new_with_config(buf, config); - - let mut reader = readerb.into_iter(); - - assert_match!(reader.next(), Some(Ok(XmlEvent::StartDocument { .. }))); - assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "root"); - - write_and_reset_position(reader.source_mut(), b"<child-1>content</child-1>"); - assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-1"); - assert_match!(reader.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content"); - assert_match!(reader.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-1"); - - write_and_reset_position(reader.source_mut(), b"<child-2>content</child-2>"); - - assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-2"); - assert_match!(reader.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content"); - assert_match!(reader.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-2"); - assert_match!(reader.next(), Some(Err(_))); - write_and_reset_position(reader.source_mut(), b"<child-3></child-3>"); - assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-3"); - write_and_reset_position(reader.source_mut(), b"<child-4 type='get'"); - match reader.next() { - None | - Some(Ok(_)) => { - panic!("At this point, parser must not detect something."); - }, - Some(Err(_)) => {} - }; - write_and_reset_position(reader.source_mut(), b" />"); - assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-4"); -} - |