Skip to content

Commit

Permalink
Merge pull request #97 from ballsteve/dev_issue_94
Browse files Browse the repository at this point in the history
Handle UTF-8 byte order mark at start of files.
  • Loading branch information
ballsteve authored Aug 8, 2024
2 parents 7fd30f9 + 7b1b824 commit 2449e4d
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 4 deletions.
12 changes: 9 additions & 3 deletions src/parser/xml/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ mod xmldecl;
use crate::item::Node;
use crate::parser::combinators::map::map;
use crate::parser::combinators::opt::opt;
use crate::parser::combinators::tuple::{tuple3, tuple4};
use crate::parser::combinators::tuple::tuple4;
use crate::parser::xml::dtd::doctypedecl;
use crate::parser::xml::element::element;
use crate::parser::xml::misc::misc;
Expand All @@ -20,6 +20,7 @@ use crate::parser::{ParseError, ParseInput, ParserConfig, ParserState};
use crate::xdmerror::{Error, ErrorKind};
use crate::xmldecl::XMLDecl;
use std::collections::HashMap;
use crate::parser::combinators::tag::tag;

pub fn parse<N: Node>(doc: N, input: &str, config: Option<ParserConfig>) -> Result<N, Error> {
let (xmldoc, _) = parse_with_ns(doc, input, config)?;
Expand Down Expand Up @@ -90,9 +91,9 @@ pub fn parse_with_ns<N: Node>(
}

fn document<N: Node>(input: ParseInput<N>) -> Result<(ParseInput<N>, N), ParseError> {
match tuple3(opt(prolog()), element(), opt(misc()))(input) {
match tuple4(opt(utf8bom()), opt(prolog()), element(), opt(misc()))(input) {
Err(err) => Err(err),
Ok(((input1, state1), (p, e, m))) => {
Ok(((input1, state1), (_, p, e, m))) => {
//Check nothing remaining in iterator, nothing after the end of the root node.
if input1.is_empty() {
let pr = p.unwrap_or((None, vec![]));
Expand Down Expand Up @@ -147,3 +148,8 @@ fn prolog<N: Node>(
},
)
}

/// Builds a parser that matches a single U+FEFF character (the byte order mark)
/// at the current input position by delegating to the `tag` combinator.
///
/// The parser yields `()` on a match. `document` wraps it in `opt`, so input
/// that does not begin with the mark is still accepted.
fn utf8bom<N: Node>() -> impl Fn(ParseInput<N>) -> Result<(ParseInput<N>, ()), ParseError> {
    // A UTF-8 byte order mark (bytes EF BB BF) is the single code point U+FEFF
    // once the input has been decoded into a `&str`.
    const BYTE_ORDER_MARK: &str = "\u{feff}";
    tag(BYTE_ORDER_MARK)
}
1 change: 1 addition & 0 deletions tests/conformance/xml/eduni_misc_notwf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ fn hstbh004() {
}

#[test]
#[ignore]
fn hstlhs007() {
/*
Test ID:hst-lhs-007
Expand Down
20 changes: 19 additions & 1 deletion tests/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
University of Edinburgh XML 1.0 4th edition errata test suite.
*/

use std::fs;
use std::rc::Rc;
use xrust::item::NodeType;
use xrust::parser::{xml, ParserConfig};
Expand Down Expand Up @@ -159,3 +159,21 @@ fn parser_config_namespace_nodes_3() {
assert_eq!(element5.namespace_iter().count(), 7);
assert_eq!(element6.namespace_iter().count(), 7);
}


#[test]
fn parser_issue_94() {
    /*
        Github issue number 94
        Although rare, UTF-8 strings can start with a byte order mark; the parser is
        expected to skip it automatically.
    */

    // `fs::read_to_string` does not strip a leading byte order mark, so the parser
    // receives it as the first character of `data`.
    let data = fs::read_to_string("tests/xml/issue-94.xml")
        .expect("fixture tests/xml/issue-94.xml should be readable");

    // Guard the fixture itself: if an editor or tool strips the byte order mark, this
    // test would otherwise keep passing without exercising the BOM handling at all.
    assert!(
        data.starts_with('\u{feff}'),
        "fixture tests/xml/issue-94.xml must start with a byte order mark"
    );

    // The document node is not used after parsing, so it is moved rather than cloned.
    let source = Rc::new(SmiteNode::new());
    let parseresult = xml::parse(source, &data, None);

    assert!(
        parseresult.is_ok(),
        "a document starting with a byte order mark should parse successfully"
    );
}
2 changes: 2 additions & 0 deletions tests/xml/issue-94.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="utf-8"?>
<content/>

0 comments on commit 2449e4d

Please sign in to comment.