tl-0.7.8/.cargo_vcs_info.json0000644000000001360000000000100114670ustar { "git": { "sha1": "604f644ada25d8aac78f423b4117932d766d6b79" }, "path_in_vcs": "" }tl-0.7.8/.gitignore000064400000000000000000000000361046102023000122460ustar 00000000000000/target Cargo.lock *.html *.jstl-0.7.8/CHANGELOG.md000064400000000000000000000126061046102023000120750ustar 00000000000000Changes annotated with `⚠` are breaking. # 0.7.8 - Fixes a build error if compiled with the `simd` feature flag. See [y21/tl#60] - Fixes MDN-related doc comments ([y21/tl#51]) # 0.7.7 - Fixes a bug in the query selector parser that made it fail to parse values containing `:`. See [y21/tl#46](https://github.com/y21/tl/issues/46) and [y21/tl#47] for more details. # 0.7.6 - Fixes a build error if compiled with the `simd` feature flag. See [y21/tl#41](https://github.com/y21/tl/issues/41) for more details. - ⚠ In prior versions, `innerHTML()` actually had the behavior of `Element#outerHTML`. This was changed and `innerHTML` now correctly only returns the markup of its subnodes, and not the markup of the own node. - `outerHTML()` was added to nodes, which moves the old behavior to another function. - Added `children_mut()`, which allows mutating the subnodes of an HTML Tag. # 0.7.5 - Fixed a bug that caused the parser to parse closing tags incorrectly. See [y21/tl#37](https://github.com/y21/tl/issues/37) and [y21/tl#38](https://github.com/y21/tl/pull/38) for more details. # 0.7.4 - Restructure internals (mainly SIMD functions) - Add fuzzing targets for internals - Optimize stable parser (adds stable alternatives when the `simd` feature isn't set) # 0.7.3 - Fixed `HTMLTag::raw()` returning one byte less than it should have. See [y21/tl#31](https://github.com/y21/tl/issues/31). # 0.7.2 - Add `Attributes::contains(key)` to check if an attribute exists. - Add `Attributes::remove(key)` to remove an attribute. - Add `Attributes::remove_value(key)` to delete the value of a given attribute key. 
# 0.7.1 - Version bump in README.md # 0.7.0 > **Warning: This release contains breaking changes** - ⚠ Function signature of `Attributes::insert` has changed: - It now takes two generic parameters `K, V` instead of just one. Prior to this version, this meant that the key and value type had to match. See [y21/tl#27](https://github.com/y21/tl/pull/26) for more details. - Added a `TryFrom<String> for Bytes` implementation for convenience to create owned `Bytes`. - Added `HTMLTag::boundaries` method for obtaining the start and end position of a tag in the source string. - Fixed a panic when source string abruptly ends with ``) is interpreted as `>` and causes the next `>` to be interpreted as a text node on its own. # 0.6.1 - Fixed an off-by-one error in the `QueryIterable` trait implementation for `HTMLTag` that caused query selectors on HTML tags to return one node less than they should. # 0.6.0 > **Warning: This release contains breaking changes** - ⚠ Removed deprecated method `VDom::find_node` - Alternative: use `VDom::nodes().iter().find(...)` instead - ⚠ `Attributes::get()` now returns a reference to `Bytes` instead of cloning. - Prior to this version, it wasn't necessary to return a reference as the `Bytes` type was just an immutable `&[u8]`. Now it can hold owned data. - ⚠ `HTMLTag::children()` no longer returns an iterator, and instead returns a wrapper struct around the children of the HTML tag. This wrapper struct makes it easy to obtain direct children of the tag (`Children::top()`), or all children (including their children, etc...) (`Children::all()`). - ⚠ `Node::children()` no longer returns an iterator (see above). - ⚠ `HTMLTag::name()` now returns a reference to `Bytes` instead of cloning (see above). 
- Ability to create/parse query selectors independent of any parser (`tl::parse_query_selector`) - Ability to reuse query selectors - Ability to apply query selectors on `HTMLTag`s (see [#18](https://github.com/y21/tl/issues/18)) - `queryselector` module is now public - `InnerNodeHandle` is now u32 - Remove unused `max_depth` parser option - Add convenience `PartialEq` and `PartialEq<[u8]>` impls for Bytes # 0.5.0 > **Warning: This release contains breaking changes** - Allow `Bytes` to store owned data through `Bytes::set()` - ⚠ The maximum length for `Bytes` is `u32::MAX` - ⚠ `tl::parse()` now returns `Result, ParseError>` - ⚠ `Attributes` fields are no longer public, instead use one of the provided methods - ⚠ `HTMLTag::inner_html()` now takes a `&Parser` and no longer directly returns the substring - Node mutations to the tag or any of its subnodes means `inner_html` needs to be recomputed - Consider using `HTMLTag::raw()` if you never mutate any nodes # 0.4.4 - Parse unquoted attribute values properly (``) [#12] - Parse valueless attributes properly (` "#; let dom = parse(input, ParserOptions::default()).unwrap(); let element = dom.get_element_by_id("u54423"); assert!(element.is_some()); } #[test] fn unquoted() { // https://github.com/y21/tl/issues/12 let input = r#" Hello World "#; let dom = parse(input, ParserOptions::default()).unwrap(); let parser = dom.parser(); let element = dom.get_element_by_id("u54423"); assert_eq!( element.and_then(|x| x.get(parser).map(|x| x.inner_text(parser))), Some("Hello World".into()) ); } mod query_selector { use super::*; #[test] fn query_selector_simple() { let input = "

hello

"; let dom = parse(input, ParserOptions::default()).unwrap(); let parser = dom.parser(); let mut selector = dom.query_selector(".hi").unwrap(); let el = force_as_tag(selector.next().and_then(|x| x.get(parser)).unwrap()); assert_eq!(dom.nodes().len(), 3); assert_eq!(el.inner_text(parser), "hello"); } #[test] fn tag_query_selector() { // empty let dom = parse("

", ParserOptions::default()).unwrap(); let parser = dom.parser(); let selector = dom.nodes()[0] .as_tag() .unwrap() .query_selector(parser, "div.z") .unwrap(); assert_eq!(selector.count(), 0); // one child let dom = parse( r#"

PASS

"#, ParserOptions::default(), ) .unwrap(); let parser = dom.parser(); let mut selector = dom.nodes()[0] .as_tag() .unwrap() .query_selector(parser, "div.z") .unwrap(); assert_eq!(selector.clone().count(), 1); assert_eq!( selector .next() .unwrap() .get(parser) .unwrap() .inner_text(parser), "PASS" ); // nested let dom = parse( r#"

PASS

"#, ParserOptions::default(), ) .unwrap(); let parser = dom.parser(); let mut selector = dom.nodes()[0] .as_tag() .unwrap() .query_selector(parser, "div.y") .unwrap(); assert_eq!(selector.clone().count(), 1); assert_eq!( selector .next() .unwrap() .get(parser) .unwrap() .inner_text(parser), "PASS" ); } #[test] fn query_selector_with_quote() { let input = r#"
"#; let dom = parse(input, ParserOptions::default()).unwrap(); let parser = dom.parser(); let node_option = dom .query_selector(r#"meta[property="og:title"]"#) .and_then(|mut iter| iter.next()); let value = if let Some(node) = node_option { Some( node.get(parser) .unwrap() .as_tag() .unwrap() .attributes() .get("content") .flatten() .unwrap() .try_as_utf8_str() .unwrap() .to_string(), ) } else { None }; assert_eq!(value, Some("hello".to_string())); } } #[test] fn nodes_order() { let input = r#"

test

test2
"# .trim(); let dom = parse(input, Default::default()).unwrap(); let nodes = dom.nodes(); // 5 nodes in total assert_eq!(nodes.len(), 5); // First node is

assert_eq!(&nodes[0].as_tag().unwrap()._name, "p"); // Second node is inner text of

: test assert_eq!(nodes[1].as_raw().unwrap().as_bytes(), b"test"); // Third node is

assert_eq!(&nodes[2].as_tag().unwrap()._name, "div"); // Fourth node is inner node assert_eq!(&nodes[3].as_tag().unwrap()._name, "span"); // Fifth node is inner text of : test2 assert_eq!(nodes[4].as_raw().unwrap().as_bytes(), b"test2"); } #[test] fn comment() { let dom = parse("", Default::default()).unwrap(); let nodes = dom.nodes(); assert_eq!(nodes.len(), 1); assert_eq!( nodes[0].as_comment().unwrap().as_utf8_str(), "" ); } #[test] fn tag_all_children() { fn assert_len(input: &str, len: usize) { let dom = parse(input, Default::default()).unwrap(); let el = dom.nodes()[0].as_tag().unwrap(); assert_eq!(el.children().all(dom.parser()).len(), len); } fn assert_last(input: &str, last: &str) { let dom = parse(input, Default::default()).unwrap(); let el = dom.nodes()[0].as_tag().unwrap(); assert_eq!( el.children() .all(dom.parser()) .last() .unwrap() .inner_text(dom.parser()), last ); } assert_len(r#"
"#, 0); assert_len(r#"
a
"#, 1); assert_len(r#"

"#, 1); assert_len(r#"

a

"#, 2); assert_len(r#"

"#, 2); assert_len(r#"

a

"#, 3); assert_last(r#"
a
"#, "a"); assert_last(r#"

a

"#, "a"); assert_last(r#"
b

a

"#, "a"); assert_last(r#"
b

a

"#, "a"); } #[test] fn assert_length() { fn assert_len(input: &str, selector: &str, len: usize) { let dom = parse(input, Default::default()).unwrap(); let el = dom.nodes()[0].as_tag().unwrap(); let query = el.query_selector(dom.parser(), selector).unwrap(); assert_eq!(query.count(), len); } assert_len("
", "a", 0); assert_len("
", "a", 1); assert_len("
", "a", 2); assert_len("
", "span", 1); } #[test] fn self_closing_no_child() { let dom = parse("

test

", Default::default()).unwrap(); let nodes = dom.nodes(); assert_eq!(nodes.len(), 3); assert_eq!(nodes[0].as_tag().unwrap()._children.len(), 0); assert_eq!(nodes[0].as_tag().unwrap().raw(), "
"); } #[test] fn insert_attribute_owned() { // https://github.com/y21/tl/issues/27 let mut attr = Attributes::new(); let style = "some style".to_string(); attr.insert("style", Some(Bytes::try_from(style).unwrap())); assert_eq!(attr.get("style"), Some(Some(&"some style".into()))); } #[test] fn boundaries() { // https://github.com/y21/tl/issues/25 let dom = parse("

haha

", Default::default()).unwrap(); let span = dom.nodes()[1].as_tag().unwrap(); let boundary = span.boundaries(dom.parser()); assert_eq!(boundary, (5, 15)); } #[test] fn attributes_remove_inner_html() { let mut dom = parse( "testing", Default::default(), ) .unwrap(); dom.nodes_mut()[0] .as_tag_mut() .unwrap() .attributes_mut() .remove_value("contenteditable"); assert_eq!(dom.outer_html(), "testing"); dom.nodes_mut()[0] .as_tag_mut() .unwrap() .attributes_mut() .remove("contenteditable"); assert_eq!(dom.outer_html(), "testing"); } #[test] fn tag_raw() { let input = "

abcd

"; let vdom = parse(input, Default::default()).unwrap(); let first_tag = vdom.children()[0] .get(vdom.parser()) .unwrap() .as_tag() .unwrap(); let from_raw = first_tag.raw().try_as_utf8_str().unwrap(); assert_eq!(from_raw, "

abcd

"); } #[test] fn tag_raw_abrupt_stop() { let input = "

abcdabcd bool { (b'0'..=b'9').contains(&c) || (b'A'..=b'Z').contains(&c) || (b'a'..=b'z').contains(&c) || c == b'-' || c == b'_' || c == b':' || c == b'+' || c == b'/' } #[inline(always)] pub fn to_lower(byte: u8) -> u8 { let is_upper = (byte >= b'A' && byte <= b'Z') as u8; let lower = is_upper * 0x20; byte + lower } tl-0.7.8/src/vdom.rs000064400000000000000000000204741046102023000123700ustar 00000000000000use crate::errors::ParseError; use crate::parser::HTMLVersion; use crate::parser::NodeHandle; use crate::queryselector; use crate::queryselector::QuerySelectorIterator; use crate::Bytes; use crate::InnerNodeHandle; use crate::ParserOptions; use crate::{Node, Parser}; use std::marker::PhantomData; /// VDom represents a [Document Object Model](https://developer.mozilla.org/en/docs/Web/API/Document_Object_Model) /// /// It is the result of parsing an HTML document. /// Internally it is only a wrapper around the [`Parser`] struct, in which all of the HTML tags are stored. /// Many functions of the public API take a reference to a [`Parser`] as a parameter to resolve [`NodeHandle`]s to [`Node`]s. #[derive(Debug)] pub struct VDom<'a> { /// Internal parser parser: Parser<'a>, } impl<'a> From> for VDom<'a> { fn from(parser: Parser<'a>) -> Self { Self { parser } } } impl<'a> VDom<'a> { /// Returns a reference to the underlying parser #[inline] pub fn parser(&self) -> &Parser<'a> { &self.parser } /// Returns a mutable reference to the underlying parser #[inline] pub fn parser_mut(&mut self) -> &mut Parser<'a> { &mut self.parser } /// Finds an element by its `id` attribute. 
pub fn get_element_by_id<'b, S>(&'b self, id: S) -> Option where S: Into>, { let bytes: Bytes = id.into(); let parser = self.parser(); if parser.options.is_tracking_ids() { parser.ids.get(&bytes).copied() } else { self.nodes() .iter() .enumerate() .find(|(_, node)| { node.as_tag().map_or(false, |tag| { tag._attributes.id.as_ref().map_or(false, |x| x.eq(&bytes)) }) }) .map(|(id, _)| NodeHandle::new(id as InnerNodeHandle)) } } /// Returns a list of elements that match a given class name. pub fn get_elements_by_class_name<'b>( &'b self, id: &'b str, ) -> Box + 'b> { let parser = self.parser(); if parser.options.is_tracking_classes() { parser .classes .get(&Bytes::from(id.as_bytes())) .map(|x| Box::new(x.iter().cloned()) as Box>) .unwrap_or_else(|| Box::new(std::iter::empty())) } else { let member = id; let iter = self .nodes() .iter() .enumerate() .filter_map(move |(id, node)| { node.as_tag().and_then(|tag| { tag._attributes .is_class_member(member) .then(|| NodeHandle::new(id as InnerNodeHandle)) }) }); Box::new(iter) } } /// Returns a slice of *all* the elements in the HTML document /// /// The difference between `children()` and `nodes()` is that children only returns the immediate children of the root node, /// while `nodes()` returns all nodes, including nested tags. /// /// # Order /// The order of the returned nodes is the same as the order of the nodes in the HTML document. pub fn nodes(&self) -> &[Node<'a>] { &self.parser.tags } /// Returns a mutable slice of *all* the elements in the HTML document /// /// The difference between `children()` and `nodes()` is that children only returns the immediate children of the root node, /// while `nodes()` returns all nodes, including nested tags. 
pub fn nodes_mut(&mut self) -> &mut [Node<'a>] { &mut self.parser.tags } /// Returns the topmost subnodes ("children") of this DOM pub fn children(&self) -> &[NodeHandle] { &self.parser.ast } /// Returns a mutable reference to the topmost subnodes ("children") of this DOM pub fn children_mut(&mut self) -> &mut [NodeHandle] { &mut self.parser.ast } /// Returns the HTML version. /// This is determined by the `` tag pub fn version(&self) -> Option { self.parser.version } /// Returns the contained markup of all of the elements in this DOM. /// /// Equivalent to [Element#outerHTML](https://developer.mozilla.org/en-US/docs/Web/API/Element/outerHTML) in browsers) /// /// # Example /// ``` /// let html = r#"

Hello world

"#; /// let mut dom = tl::parse(html, Default::default()).unwrap(); /// /// let element = dom.get_element_by_id("find-me") /// .unwrap() /// .get_mut(dom.parser_mut()) /// .unwrap() /// .as_tag_mut() /// .unwrap(); /// /// element.attributes_mut().get_mut("href").flatten().unwrap().set("/"); /// /// assert_eq!(dom.outer_html(), r#"

Hello world

"#); /// ``` pub fn outer_html(&self) -> String { let mut inner_html = String::with_capacity(self.parser.stream.len()); for node in self.children() { let node = node.get(&self.parser).unwrap(); inner_html.push_str(&node.outer_html(&self.parser)); } inner_html } /// Tries to parse the query selector and returns an iterator over elements that match the given query selector. /// /// # Example /// ``` /// let dom = tl::parse("

bar

", tl::ParserOptions::default()).unwrap(); /// let handle = dom.query_selector("p.foo").and_then(|mut iter| iter.next()).unwrap(); /// let node = handle.get(dom.parser()).unwrap(); /// assert_eq!(node.inner_text(dom.parser()), "bar"); /// ``` pub fn query_selector<'b>( &'b self, selector: &'b str, ) -> Option> { let selector = crate::parse_query_selector(selector)?; let iter = queryselector::QuerySelectorIterator::new(selector, self.parser(), self); Some(iter) } } /// A RAII guarded version of VDom /// /// The input string is freed once this struct goes out of scope. /// The only way to construct this is by calling `parse_owned()`. #[derive(Debug)] pub struct VDomGuard { /// Wrapped VDom instance dom: VDom<'static>, /// The leaked input string that is referenced by self.dom _s: RawString, /// PhantomData for self.dom _phantom: PhantomData<&'static str>, } unsafe impl Send for VDomGuard {} unsafe impl Sync for VDomGuard {} impl VDomGuard { /// Parses the input string pub(crate) fn parse(input: String, options: ParserOptions) -> Result { let input = RawString::new(input); let ptr = input.as_ptr(); let input_ref: &'static str = unsafe { &*ptr }; // Parsing will either: // a) succeed, and we return a VDom instance // that, when dropped, will free the input string // b) fail, and we return a ParseError // and `RawString`s destructor will run and deallocate the string properly let mut parser = Parser::new(input_ref, options); parser.parse()?; Ok(Self { _s: input, dom: VDom::from(parser), _phantom: PhantomData, }) } } impl VDomGuard { /// Returns a reference to the inner DOM. /// /// The lifetime of the returned `VDom` is bound to self so that elements cannot outlive this `VDomGuard` struct. pub fn get_ref<'a>(&'a self) -> &'a VDom<'a> { &self.dom } /// Returns a mutable reference to the inner DOM. /// /// The lifetime of the returned `VDom` is bound to self so that elements cannot outlive this `VDomGuard` struct. 
pub fn get_mut_ref<'a, 'b: 'a>(&'b mut self) -> &'b VDom<'a> {
        // NOTE: although this takes `&mut self`, the declared return type is a shared
        // reference, so the mutable borrow below is coerced to `&VDom`.
        &mut self.dom
    }
}

/// Owning raw pointer to a heap-allocated string slice.
///
/// The pointer is produced by `Box::into_raw` in [`RawString::new`] and the
/// allocation is released again when the value is dropped.
#[derive(Debug)]
struct RawString(*mut str);

impl RawString {
    /// Converts `s` into a boxed `str` and takes ownership of it as a raw pointer.
    pub fn new(s: String) -> Self {
        let boxed: Box<str> = s.into_boxed_str();
        Self(Box::into_raw(boxed))
    }

    /// Returns the stored raw pointer; ownership stays with `self`.
    pub fn as_ptr(&self) -> *mut str {
        let Self(raw) = self;
        *raw
    }
}

impl Drop for RawString {
    fn drop(&mut self) {
        // SAFETY: the pointer is always valid because `RawString` can only be constructed through `RawString::new()`
        let owned: Box<str> = unsafe { Box::from_raw(self.0) };
        drop(owned);
    }
}