html2text-0.4.4/.cargo_vcs_info.json0000644000000001360000000000100127740ustar { "git": { "sha1": "81f29012abca1f5b63ce6fcd246cd0f5bb03efdc" }, "path_in_vcs": "" }html2text-0.4.4/.circleci/config.yml000064400000000000000000000024341046102023000154120ustar 00000000000000version: 2.1 orbs: win: circleci/windows@2.2.0 jobs: build-stable: docker: - image: cimg/rust:1.59 steps: - checkout - run: cargo --version - run: cargo build - run: cargo test build-1-56: docker: - image: cimg/rust:1.56 steps: - checkout - run: cargo --version - run: cargo build - run: cargo test build-windows: executor: name: win/default size: medium shell: bash.exe environment: PATHk steps: - checkout - run: name: Install Rust command: | curl https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe --output rustup-init.exe ./rustup-init.exe -y - run: name: Update PATH and cargo config command: | echo "[net]" >> $USERPROFILE/.cargo/config echo "git-fetch-with-cli = true" >> $USERPROFILE/.cargo/config echo 'export PATH=$USERPROFILE/.cargo/bin:$PATH' >> $BASH_ENV - run: name: Build command: | cargo build - run: name: Tests command: | cargo test workflows: version: 2 build: jobs: - "build-stable" - "build-1-56" - "build-windows" html2text-0.4.4/.github/dependabot.yml000064400000000000000000000002251046102023000157530ustar 00000000000000version: 2 updates: - package-ecosystem: "cargo" directory: "/" schedule: interval: "weekly" day: "friday" rebase-strategy: "disabled" html2text-0.4.4/.gitignore000064400000000000000000000000221046102023000135460ustar 00000000000000target Cargo.lock html2text-0.4.4/CHANGELOG.md000064400000000000000000000044571046102023000134070ustar 00000000000000# Changelog Possible log types: - `[added]` for new features. - `[changed]` for changes in existing functionality. - `[deprecated]` for once-stable features removed in upcoming releases. - `[removed]` for deprecated features removed in this release. - `[fixed]` for any bug fixes. 
- `[security]` to invite users to upgrade in case of vulnerabilities. ### 0.4.4 - [fixed] Fix some panics when enumerated lists are in tables (thanks sfts). - [fixed] Impove table size estimation to include links. ### 0.4.3 - [changed] MSRV is now 1.56. - [fixed] Fix some panics when very large widths are used with tables. ### 0.4.2 - [changed] Moved the rcdom module directly into src/ ### 0.4.1 (unpublished) - [changed] rcdom now vendored as a module. ### 0.4.0 (unpublished) - [changed] Update html5ever to v0.26. - [changed] MSRV is now 1.49. ### 0.3.1 - [changed] Update the build badges to reflect the updated CI configuration. ### 0.3.0 - [added] New experimental `from_read_coloured()` (under `ansi_colours` feature). - [added] Add `into_tagged_strings` and `tagged_strings` methods to `TaggedLine` (thanks Robin Krahl) - [added] Add `width` method to `TaggedString` (thanks Robin Krahl) - [changed] Keep annotations in `TextRenderer::into_lines` (thanks Robin Krahl) - [fixed] Add colon to reference style link (thanks zakaluka) - [added] Allow text decorators to customise block prefix strings (thanks SardineFish) - [fixed] Fixed some problems rendering some complicated tables, including a panic and near-infinite loops. - [changed] Tables which are too wide to possibly render in the given width are now arranged vertically instead (with `///`) lines. - [changed] A number of small table rendering improvements. - [changed] MSRV is now 1.41. ### 0.2.1 - [added] New entry points - split HTML parsing from rendering the output, thanks Robin Krahl. - [fixed] Decorators weren't being used for preformatted text. ### 0.2.0 - [added] Support `` strikeout text. ### 0.1.14 (2020-08-07) - [fixed] A table with an `id` attribute on `` would be hidden. ### 0.1.13 (2020-07-21) - [changed] Run cargo fmt (thanks crunchyjesus) - [added] CHANGELOG.md - [fixed] Some text near a fragment start (`id="foo"` attribute) could be lost if it needed breaking across lines. 
- [added] Experimentally add dependabot configuration. html2text-0.4.4/Cargo.lock0000644000000331670000000000100107610ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "addr2line" version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9ecd88a8c8378ca913a680cd98f0f13ac67383d35993f86c90a70e3f137816b" dependencies = [ "gimli", ] [[package]] name = "adler" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "argparse" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f8ebf5827e4ac4fd5946560e6a99776ea73b596d80898f357007317a7141e47" [[package]] name = "autocfg" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "backtrace" version = "0.3.66" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cab84319d616cfb654d03394f38ab7e6f0919e181b1b57e1fd15e7fb4077d9a7" dependencies = [ "addr2line", "cc", "cfg-if", "libc", "miniz_oxide", "object", "rustc-demangle", ] [[package]] name = "bitflags" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "cc" version = "1.0.74" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "581f5dba903aac52ea3feb5ec4810848460ee833876f1f9b0fdeab1f19091574" [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "futf" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" dependencies = [ "mac", "new_debug_unreachable", ] [[package]] name = "getrandom" version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" dependencies = [ "cfg-if", "libc", "wasi", ] [[package]] name = "gimli" version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22030e2c5a68ec659fde1e949a745124b48e6fa8b045b7ed5bd1fe4ccc5c4e5d" [[package]] name = "html2text" version = "0.4.4" dependencies = [ "argparse", "backtrace", "html5ever", "markup5ever", "tendril", "termion", "unicode-width", "xml5ever", ] [[package]] name = "html5ever" version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" dependencies = [ "log", "mac", "markup5ever", "proc-macro2", "quote", "syn", ] [[package]] name = "libc" version = "0.2.137" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc7fcc620a3bff7cdd7a365be3376c97191aeaccc2a603e600951e452615bf89" [[package]] name = "lock_api" version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" dependencies = [ "autocfg", "scopeguard", ] [[package]] name = "log" version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ "cfg-if", ] [[package]] name = "mac" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "markup5ever" version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" dependencies = [ "log", "phf", "phf_codegen", "string_cache", "string_cache_codegen", "tendril", ] [[package]] name = "memchr" version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "miniz_oxide" version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96590ba8f175222643a85693f33d26e9c8a015f599c216509b1a6894af675d34" dependencies = [ "adler", ] [[package]] name = "new_debug_unreachable" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" [[package]] name = "numtoa" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8f8bdf33df195859076e54ab11ee78a1b208382d3a26ec40d142ffc1ecc49ef" [[package]] name = "object" version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21158b2c33aa6d4561f1c0a6ea283ca92bc54802a93b263e910746d679a7eb53" dependencies = [ "memchr", ] [[package]] name = "once_cell" version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86f0b0d4bf799edbc74508c1e8bf170ff5f41238e5f8225603ca7caaae2b7860" [[package]] name = "parking_lot" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", "parking_lot_core", ] [[package]] name = "parking_lot_core" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4dc9e0dc2adc1c69d09143aff38d3d30c5c3f0df0dad82e6d25547af174ebec0" dependencies = [ "cfg-if", "libc", "redox_syscall", "smallvec", "windows-sys", ] [[package]] name = "phf" version = "0.10.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" dependencies = [ "phf_shared", ] [[package]] name = "phf_codegen" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" dependencies = [ "phf_generator", "phf_shared", ] [[package]] name = "phf_generator" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" dependencies = [ "phf_shared", "rand", ] [[package]] name = "phf_shared" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" dependencies = [ "siphasher", ] [[package]] name = "ppv-lite86" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "precomputed-hash" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" [[package]] name = "proc-macro2" version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725" dependencies = [ "unicode-ident", ] [[package]] name = "quote" version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" dependencies = [ "proc-macro2", ] [[package]] name = "rand" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", "rand_chacha", "rand_core", ] [[package]] name = "rand_chacha" 
version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", "rand_core", ] [[package]] name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ "getrandom", ] [[package]] name = "redox_syscall" version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ "bitflags", ] [[package]] name = "redox_termios" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8440d8acb4fd3d277125b4bd01a6f38aee8d814b3b5fc09b3f2b825d37d3fe8f" dependencies = [ "redox_syscall", ] [[package]] name = "rustc-demangle" version = "0.1.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" [[package]] name = "scopeguard" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "serde" version = "1.0.147" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d193d69bae983fc11a79df82342761dfbf28a99fc8d203dca4c3c1b590948965" [[package]] name = "siphasher" version = "0.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" [[package]] name = "smallvec" version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] name = "string_cache" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"213494b7a2b503146286049378ce02b482200519accc31872ee8be91fa820a08" dependencies = [ "new_debug_unreachable", "once_cell", "parking_lot", "phf_shared", "precomputed-hash", "serde", ] [[package]] name = "string_cache_codegen" version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" dependencies = [ "phf_generator", "phf_shared", "proc-macro2", "quote", ] [[package]] name = "syn" version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a864042229133ada95abf3b54fdc62ef5ccabe9515b64717bcb9a1919e59445d" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "tendril" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" dependencies = [ "futf", "mac", "utf-8", ] [[package]] name = "termion" version = "1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "077185e2eac69c3f8379a4298e1e07cd36beb962290d4a51199acf0fdc10607e" dependencies = [ "libc", "numtoa", "redox_syscall", "redox_termios", ] [[package]] name = "unicode-ident" version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" [[package]] name = "unicode-width" version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" [[package]] name = "utf-8" version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" 
[[package]] name = "windows-sys" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", "windows_i686_gnu", "windows_i686_msvc", "windows_x86_64_gnu", "windows_x86_64_gnullvm", "windows_x86_64_msvc", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" [[package]] name = "windows_aarch64_msvc" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" [[package]] name = "windows_i686_gnu" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" [[package]] name = "windows_i686_msvc" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" [[package]] name = "windows_x86_64_gnu" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed" [[package]] name = "windows_x86_64_gnullvm" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" [[package]] name = "windows_x86_64_msvc" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5" [[package]] name = "xml5ever" version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650" 
dependencies = [ "log", "mac", "markup5ever", ] html2text-0.4.4/Cargo.toml0000644000000026370000000000100110020ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2018" rust-version = "1.56" name = "html2text" version = "0.4.4" authors = ["Chris Emerson "] description = "Render HTML as plain text." documentation = "https://docs.rs/html2text/" readme = "README.md" keywords = [ "html", "text", ] license = "MIT" repository = "https://github.com/jugglerchris/rust-html2text/" [[example]] name = "html2term" path = "examples/html2term.rs" [[example]] name = "html2text" path = "examples/html2text.rs" [dependencies.backtrace] version = "0.3" optional = true [dependencies.html5ever] version = "0.26.0" [dependencies.markup5ever] version = "0.11.0" [dependencies.tendril] version = "0.4" [dependencies.unicode-width] version = "0.1.5" [dependencies.xml5ever] version = "0.17" [dev-dependencies.argparse] version = "0.2.2" [features] ansi_colours = [] default = [] html_trace = [] html_trace_bt = ["backtrace"] [target."cfg(unix)".dev-dependencies.termion] version = "1.5" html2text-0.4.4/Cargo.toml.orig000064400000000000000000000014771046102023000144640ustar 00000000000000[package] name = "html2text" version = "0.4.4" authors = ["Chris Emerson "] description = "Render HTML as plain text." 
repository = "https://github.com/jugglerchris/rust-html2text/" readme = "README.md" documentation = "https://docs.rs/html2text/" edition = "2018" rust-version = "1.56" keywords = ["html", "text"] license = "MIT" [dependencies] html5ever = "0.26.0" markup5ever = "0.11.0" tendril = "0.4" xml5ever = "0.17" unicode-width = "0.1.5" backtrace = { version = "0.3", optional=true } [features] html_trace = [] html_trace_bt = ["backtrace"] default = [] ansi_colours = [] [[example]] name = "html2term" path = "examples/html2term.rs" [[example]] name = "html2text" path = "examples/html2text.rs" [dev-dependencies] argparse = "0.2.2" [target.'cfg(unix)'.dev-dependencies] termion = "1.5" html2text-0.4.4/LICENSE000064400000000000000000000020571046102023000125750ustar 00000000000000MIT License Copyright (c) 2016 Chris Emerson Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
html2text-0.4.4/README.md000064400000000000000000000027741046102023000130550ustar 00000000000000[![jugglerchris](https://circleci.com/gh/jugglerchris/rust-html2text.svg?branch=master&style=svg)](https://app.circleci.com/pipelines/github/jugglerchris/rust-html2text?filter=all) # html2text html2text is a [Rust](http://www.rust-lang.org/) crate which converts HTML to plain text. It makes use of the [Servo project](https://github.com/servo/servo)'s HTML parser, [html5ever](https://github.com/servo/html5ever/), using the DOM to generate text (which can optionally include annotations for some features such as hyperlinks). The project aims to do a reasonable job of rendering reasonable HTML in a terminal or other places where HTML needs to be converted to text (for example the text/plain fallback in HTML e-mails). ## Examples ```rust use html2text::from_read; let html = b"
  • Item one
  • Item two
  • Item three
"; assert_eq!(from_read(&html[..], 20), "\ * Item one * Item two * Item three "); ``` A couple of simple demonstration programs are included as examples: ### html2text The simplest example uses `from_read` to convert HTML on stdin into plain text: ```sh $ cargo run --example html2text < foo.html [...] ``` ### html2term A very simple example of using the rich interface (`from_read_rich`) for a slightly interactive console HTML viewer is provided as `html2term`. ```sh $ cargo run --example html2term foo.html [...] ``` Note that this example takes the HTML file as a parameter so that it can read keys from stdin. html2text-0.4.4/benches/tables.rs000064400000000000000000000052571046102023000150240ustar 00000000000000#![feature(test)] extern crate html2text; extern crate test; use ::test::Bencher; use html2text::from_read; fn make_html(content: &str) -> String { String::from("") + content + "" } fn make_tab(cell: &str, rows: usize, cols: usize) -> String { let mut result = String::from(""); for _ in 0..rows { result.push_str(""); for _ in 0..cols { result.push_str(""); } result.push_str(""); } result } #[bench] fn bench_empty(b: &mut Bencher) { b.iter(|| from_read(make_html("").as_bytes(), 80)); } #[bench] fn bench_tab_1_1(b: &mut Bencher) { b.iter(|| from_read(make_html(&make_tab("cell", 1, 1)).as_bytes(), 80)); } #[bench] fn bench_tab_2_2(b: &mut Bencher) { b.iter(|| from_read(make_html(&make_tab("cell", 2, 2)).as_bytes(), 80)); } #[bench] fn bench_tab_3_3(b: &mut Bencher) { b.iter(|| from_read(make_html(&make_tab("cell", 3, 3)).as_bytes(), 80)); } #[bench] fn bench_tab_4_4(b: &mut Bencher) { b.iter(|| from_read(make_html(&make_tab("cell", 4, 4)).as_bytes(), 80)); } #[bench] fn bench_tab_5_5(b: &mut Bencher) { b.iter(|| from_read(make_html(&make_tab("cell", 5, 5)).as_bytes(), 80)); } #[bench] fn bench_tab_6_6(b: &mut Bencher) { b.iter(|| from_read(make_html(&make_tab("cell", 6, 6)).as_bytes(), 80)); } // Try a table with `depth` nested tables each with `rows` rows and 
`cols` columns. fn bench_tab_depth(b: &mut Bencher, content: &str, depth: usize, rows: usize, cols: usize) { let mut t = String::from(content); for _ in 0..depth { t = make_tab(&t, rows, cols); } let html = make_html(&t); b.iter(|| from_read(html.as_bytes(), 80)); } #[bench] fn bench_tab_2_1_depth_2(b: &mut Bencher) { bench_tab_depth(b, "cell", 2, 2, 1); } #[bench] fn bench_tab_3_1_depth_2(b: &mut Bencher) { bench_tab_depth(b, "cell", 2, 3, 1); } #[bench] fn bench_tab_4_1_depth_2(b: &mut Bencher) { bench_tab_depth(b, "cell", 2, 4, 1); } #[bench] fn bench_tab_1_2_depth_2(b: &mut Bencher) { bench_tab_depth(b, "cell", 2, 1, 2); } #[bench] fn bench_tab_1_3_depth_2(b: &mut Bencher) { bench_tab_depth(b, "cell", 2, 1, 3); } #[bench] fn bench_tab_1_4_depth_2(b: &mut Bencher) { bench_tab_depth(b, "cell", 2, 1, 4); } #[bench] fn bench_tab_2_depth_2(b: &mut Bencher) { bench_tab_depth(b, "cell", 2, 2, 2); } /* #[bench] fn bench_tab_2_depth_3(b: &mut Bencher) { bench_tab_depth(b, "cell", 3, 2, 2); } #[bench] fn bench_tab_2_depth_4(b: &mut Bencher) { bench_tab_depth(b, "cell", 4, 2, 2); } #[bench] fn bench_tab_2_depth_5(b: &mut Bencher) { bench_tab_depth(b, "cell", 5, 2, 2); } */ html2text-0.4.4/examples/html2term.rs000064400000000000000000000237411046102023000156750ustar 00000000000000#[cfg(unix)] extern crate argparse; extern crate html2text; #[cfg(unix)] extern crate termion; #[cfg(unix)] extern crate unicode_width; #[cfg(unix)] mod top { use ::html2text; use ::std; use ::termion; use argparse::{ArgumentParser, Store}; use html2text::render::text_renderer::{RichAnnotation, TaggedLine, TaggedLineElement}; use std::collections::HashMap; use std::io::{self, Write}; use termion::cursor::Goto; use termion::event::Key; use termion::input::TermRead; use termion::raw::IntoRawMode; use termion::screen::AlternateScreen; use unicode_width::UnicodeWidthStr; fn to_style(tag: &Vec) -> String { let mut style = String::new(); for ann in tag { match *ann { RichAnnotation::Default => (), 
RichAnnotation::Link(_) => { style.push_str(&format!("{}", termion::style::Underline)); } RichAnnotation::Image => { style.push_str(&format!( "{}", termion::color::Fg(termion::color::LightBlue) )); } RichAnnotation::Emphasis => { style.push_str(&format!( "{}", termion::color::Fg(termion::color::LightGreen) )); } RichAnnotation::Strong => { style.push_str(&format!( "{}", termion::color::Fg(termion::color::LightGreen) )); } RichAnnotation::Strikeout => (), RichAnnotation::Code => { style.push_str(&format!( "{}", termion::color::Fg(termion::color::LightYellow) )); } RichAnnotation::Preformat(is_cont) => { if is_cont { style.push_str(&format!( "{}", termion::color::Fg(termion::color::LightMagenta) )); } else { style.push_str(&format!("{}", termion::color::Fg(termion::color::Magenta))); } } } } style } struct LinkMap { lines: Vec>>, // lines[y][x] => Some(URL) or None } impl LinkMap { pub fn link_at(&self, x: usize, y: usize) -> Option<&str> { if let Some(ref linevec) = self.lines.get(y) { if let Some(&Some(ref text)) = linevec.get(x) { return Some(&text); } } None } } fn link_from_tag(tag: &Vec) -> Option { let mut link = None; for annotation in tag { if let RichAnnotation::Link(ref text) = *annotation { link = Some(text.clone()); } } link } fn find_links(lines: &Vec>>) -> LinkMap { let mut map = Vec::new(); for line in lines { let mut linevec = Vec::new(); for ts in line.tagged_strings() { let link = link_from_tag(&ts.tag); for _ in 0..UnicodeWidthStr::width(ts.s.as_str()) { linevec.push(link.clone()); } } map.push(linevec); } LinkMap { lines: map } } struct FragMap { start_xy: HashMap, } fn find_frags(lines: &Vec>>) -> FragMap { use self::TaggedLineElement::*; let mut map = HashMap::new(); let mut y = 0; for line in lines { let mut x = 0; for tli in line.iter() { match tli { FragmentStart(fragname) => { map.insert(fragname.to_string(), (x, y)); } Str(ts) => { x += UnicodeWidthStr::width(ts.s.as_str()); } } } y += 1; } FragMap { start_xy: map } } pub fn main() { let 
mut filename = String::new(); { let mut ap = ArgumentParser::new(); ap.refer(&mut filename) .add_argument("filename", Store, "Set HTML filename"); ap.parse_args_or_exit(); } let (width, height) = termion::terminal_size().unwrap(); let (width, height) = (width as usize, height as usize); let mut file = std::fs::File::open(filename).expect("Tried to open file"); let annotated = html2text::from_read_rich(&mut file, width as usize); let link_map = find_links(&annotated); let frag_map = find_frags(&annotated); let mut keys = io::stdin().keys(); // max_y is the largest (0-based) index of a real document line. let max_y = annotated.len() - 1; // top_y is the (0-based) index of the document line shown at // the top of the visible screen. let mut top_y = 0; // doc_x and doc_y are the logical (0-based) x and y of the // cursor position within the document. let mut doc_x = 0; let mut doc_y = 0; let mut screen = AlternateScreen::from(io::stdout().into_raw_mode().unwrap()); loop { // Sanity-check the current screen position. max_y should // be small enough that no blank lines beyond the end of // the document are visible on screen (except when the // document is shorter than a screenful); large enough // that the cursor isn't off the bottom of the visible // screen; and small enough that the cursor isn't off the // top. 
if max_y >= height - 1 { top_y = std::cmp::min(top_y, max_y - (height - 1)); } if doc_y >= height - 1 { top_y = std::cmp::max(top_y, doc_y - (height - 1)); } top_y = std::cmp::min(top_y, doc_y); let opt_url = link_map.link_at(doc_x, doc_y); let vis_y_limit = std::cmp::min(top_y + height, max_y + 1); write!(screen, "{}", termion::clear::All).unwrap(); for (i, line) in annotated[top_y..vis_y_limit].iter().enumerate() { write!(screen, "{}", Goto(1, i as u16 + 1)).unwrap(); for ts in line.tagged_strings() { let style = to_style(&ts.tag); let link = link_from_tag(&ts.tag); match (opt_url, link) { (Some(ref t1), Some(ref t2)) if t1 == t2 => { write!(screen, "{}", termion::style::Invert).unwrap(); } _ => (), } write!(screen, "{}{}{}", style, ts.s, termion::style::Reset).unwrap(); } } // 1-based screen coordinates let cursor_x = (doc_x + 1) as u16; let cursor_y = (doc_y - top_y + 1) as u16; write!(screen, "{}", Goto(cursor_x, cursor_y)).unwrap(); screen.flush().unwrap(); if let Some(Ok(k)) = keys.next() { match k { Key::Char('q') => break, Key::Char('j') | Key::Down => { if doc_y < max_y { doc_y += 1; } } Key::Char('k') | Key::Up => { if doc_y > 0 { doc_y -= 1; } } Key::Char('h') | Key::Left => { if doc_x > 0 { doc_x -= 1; } } Key::Char('l') | Key::Right => { if doc_x + 1 < width { doc_x += 1; } } Key::Char(' ') | Key::PageDown => { // Ideally, move both the cursor and the top // visible line down by a whole page doc_y += height; top_y += height; // But bound the cursor within the document doc_y = std::cmp::min(doc_y, max_y); // And the standard bounds checking for top_y // will take care of the rest of the special // cases. } Key::PageUp => { // Ideally, move both the cursor and the top // visible line up by a whole page. But bound // both at zero. 
doc_y = std::cmp::max(doc_y, height) - height; top_y = std::cmp::max(top_y, height) - height; } Key::Home => { doc_y = 0; } Key::End => { doc_y = max_y; } Key::Char('\t') => {} Key::Char('\r') | Key::Char('\n') => { if let Some(url) = opt_url { if url.starts_with("#") { let start = frag_map.start_xy.get(&url[1..]); if let Some((x, y)) = start { doc_x = *x; doc_y = *y; } } } } _ => {} } } } } } #[cfg(not(unix))] mod top { pub fn main() {} } fn main() { top::main() } html2text-0.4.4/examples/html2text.rs000064400000000000000000000065531046102023000157140ustar 00000000000000extern crate argparse; extern crate html2text; use argparse::{ArgumentParser, Store, StoreOption, StoreTrue}; use std::io; use std::io::Write; #[cfg(feature = "ansi_colours")] use html2text::render::text_renderer::RichAnnotation; #[cfg(feature = "ansi_colours")] use termion; #[cfg(feature = "ansi_colours")] fn default_colour_map(annotation: &RichAnnotation) -> (String, String) { use termion::color::*; use RichAnnotation::*; match annotation { Default => ("".into(), "".into()), Link(_) => ( format!("{}", termion::style::Underline), format!("{}", termion::style::Reset), ), Image => (format!("{}", Fg(Blue)), format!("{}", Fg(Reset))), Emphasis => ( format!("{}", termion::style::Bold), format!("{}", termion::style::Reset), ), Strong => (format!("{}", Fg(LightYellow)), format!("{}", Fg(Reset))), Strikeout => (format!("{}", Fg(LightBlack)), format!("{}", Fg(Reset))), Code => (format!("{}", Fg(Blue)), format!("{}", Fg(Reset))), Preformat(_) => (format!("{}", Fg(Blue)), format!("{}", Fg(Reset))), } } fn translate(input: R, width: usize, literal: bool, _use_colour: bool) -> String where R: io::Read, { #[cfg(feature = "ansi_colours")] { if _use_colour { return html2text::from_read_coloured(input, width, default_colour_map).unwrap(); }; } if literal { let decorator = html2text::render::text_renderer::TrivialDecorator::new(); html2text::from_read_with_decorator(input, width, decorator) } else { 
html2text::from_read(input, width) } } fn main() { let mut infile: Option = None; let mut outfile: Option = None; let mut width: usize = 80; let mut literal: bool = false; #[allow(unused)] let mut use_colour = false; { let mut ap = ArgumentParser::new(); ap.refer(&mut infile).add_argument( "infile", StoreOption, "Input HTML file (default is standard input)", ); ap.refer(&mut width).add_option( &["-w", "--width"], Store, "Column width to format to (default is 80)", ); ap.refer(&mut outfile).add_option( &["-o", "--output"], StoreOption, "Output file (default is standard output)", ); ap.refer(&mut literal).add_option( &["-L", "--literal"], StoreTrue, "Output only literal text (no decorations)", ); #[cfg(feature = "ansi_colours")] ap.refer(&mut use_colour) .add_option(&["--colour"], StoreTrue, "Use ANSI terminal colours"); ap.parse_args_or_exit(); } let data = match infile { None => { let stdin = io::stdin(); let data = translate(&mut stdin.lock(), width, literal, use_colour); data } Some(name) => { let mut file = std::fs::File::open(name).expect("Tried to open file"); translate(&mut file, width, literal, use_colour) } }; match outfile { None => { println!("{}", data); } Some(name) => { let mut file = std::fs::File::create(name).expect("Tried to create file"); write!(file, "{}", data).unwrap(); } }; } html2text-0.4.4/rust.yml000064400000000000000000000004741046102023000133110ustar 00000000000000name: Rust on: push: branches: [ master ] pull_request: branches: [ master ] env: CARGO_TERM_COLOR: always jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Build run: cargo build --verbose - name: Run tests run: cargo test --verbose html2text-0.4.4/src/ansi_colours.rs000064400000000000000000000031501046102023000154200ustar 00000000000000//! Convenience helper for producing coloured terminal output. //! //! This optional helper applies terminal colours (or other effects which //! 
can be achieved using inline characters sent to the terminal such as //! underlining in some terminals). use crate::{parse, RichAnnotation, RichDecorator}; use std::fmt::Write; use std::io; /// Reads HTML from `input`, and returns text wrapped to `width` columns. /// The text is returned as a `Vec>`; the annotations are vectors /// of `RichAnnotation`. The "outer" annotation comes first in the `Vec`. /// /// The function `colour_map` is given a slice of `RichAnnotation` and should /// return a pair of static strings which should be inserted before/after a text /// span with that annotation; for example a string which sets text colour /// and a string which sets the colour back to the default. pub fn from_read_coloured( input: R, width: usize, colour_map: FMap, ) -> Result where R: io::Read, FMap: Fn(&RichAnnotation) -> (String, String), { let lines = parse(input) .render(width, RichDecorator::new()) .into_lines(); let mut result = String::new(); for line in lines { for ts in line.tagged_strings() { let mut start = String::new(); let mut finish = String::new(); for ann in &ts.tag { let (s, f) = colour_map(ann); start.push_str(&s); finish.push_str(&f); } write!(result, "{}{}{}", start, ts.s, finish)?; } result.push('\n'); } Ok(result) } html2text-0.4.4/src/lib.rs000064400000000000000000001565241046102023000135040ustar 00000000000000//! Convert HTML to text formats. //! //! This crate renders HTML into a text format, wrapped to a specified width. //! This can either be plain text or with extra annotations to (for example) //! show in a terminal which supports colours. //! //! # Examples //! //! ```rust //! # use html2text::from_read; //! let html = b" //!
//!       <ul>
//!         <li>Item one</li>
//!         <li>Item two</li>
//!         <li>Item three</li>
//!       </ul>
"; //! assert_eq!(from_read(&html[..], 20), //! "\ //! * Item one //! * Item two //! * Item three //! "); //! ``` //! A couple of simple demonstration programs are included as examples: //! //! ### html2text //! //! The simplest example uses `from_read` to convert HTML on stdin into plain //! text: //! //! ```sh //! $ cargo run --example html2text < foo.html //! [...] //! ``` //! //! ### html2term //! //! A very simple example of using the rich interface (`from_read_rich`) for a //! slightly interactive console HTML viewer is provided as `html2term`. //! //! ```sh //! $ cargo run --example html2term foo.html //! [...] //! ``` //! //! Note that this example takes the HTML file as a parameter so that it can //! read keys from stdin. //! #![cfg_attr(feature = "clippy", feature(plugin))] #![cfg_attr(feature = "clippy", plugin(clippy))] #![deny(missing_docs)] #[macro_use] extern crate html5ever; extern crate unicode_width; #[macro_use] mod macros; pub mod render; use render::text_renderer::{ PlainDecorator, RenderLine, RichAnnotation, RichDecorator, TaggedLine, TextDecorator, TextRenderer, }; use render::Renderer; use html5ever::driver::ParseOpts; use html5ever::parse_document; use html5ever::tendril::TendrilSink; use html5ever::tree_builder::TreeBuilderOpts; mod markup5ever_rcdom; use markup5ever_rcdom::{ Handle, NodeData::{Comment, Document, Element}, RcDom, }; use std::cell::Cell; use std::cmp::{max, min}; use std::io; use std::io::Write; use std::iter::{once, repeat}; use std::ops::{Deref, DerefMut}; /// A dummy writer which does nothing struct Discard {} impl Write for Discard { fn write(&mut self, bytes: &[u8]) -> std::result::Result { Ok(bytes.len()) } fn flush(&mut self) -> std::result::Result<(), io::Error> { Ok(()) } } const MIN_WIDTH: usize = 3; /// Size information/estimate #[derive(Debug, Copy, Clone)] pub struct SizeEstimate { size: usize, // Rough overall size min_width: usize, // The narrowest possible } impl Default for SizeEstimate { fn default() -> 
SizeEstimate { SizeEstimate { size: 0, min_width: 0, } } } impl SizeEstimate { /// Combine two estimates into one (add size and widest required) pub fn add(self, other: SizeEstimate) -> SizeEstimate { SizeEstimate { size: self.size + other.size, min_width: max(self.min_width, other.min_width), } } /// Combine two estimates into one (take max of each) pub fn max(self, other: SizeEstimate) -> SizeEstimate { SizeEstimate { size: max(self.size, other.size), min_width: max(self.min_width, other.min_width), } } } #[derive(Clone, Debug)] /// Render tree table cell pub struct RenderTableCell { colspan: usize, content: Vec, size_estimate: Cell>, col_width: Option, // Actual width to use } impl RenderTableCell { /// Render this cell to a builder. pub fn render(&mut self, _builder: &mut R, _err_out: &mut T) { unimplemented!() //render_tree_children_to_string(builder, &mut self.content, err_out) } /// Calculate or return the estimate size of the cell pub fn get_size_estimate(&self) -> SizeEstimate { if self.size_estimate.get().is_none() { let size = self .content .iter() .map(|node| node.get_size_estimate()) .fold(Default::default(), SizeEstimate::add); self.size_estimate.set(Some(size)); } self.size_estimate.get().unwrap() } } #[derive(Clone, Debug)] /// Render tree table row pub struct RenderTableRow { cells: Vec, col_sizes: Option>, } impl RenderTableRow { /// Return a mutable iterator over the cells. pub fn cells(&self) -> std::slice::Iter { self.cells.iter() } /// Return a mutable iterator over the cells. pub fn cells_mut(&mut self) -> std::slice::IterMut { self.cells.iter_mut() } /// Count the number of cells in the row. /// Takes into account colspan. pub fn num_cells(&self) -> usize { self.cells.iter().map(|cell| cell.colspan).sum() } /// Return an iterator over (column, &cell)s, which /// takes into account colspan. 
pub fn cell_columns(&mut self) -> Vec<(usize, &mut RenderTableCell)> { let mut result = Vec::new(); let mut colno = 0; for cell in &mut self.cells { let colspan = cell.colspan; result.push((colno, cell)); colno += colspan; } result } /// Return the contained cells as RenderNodes, annotated with their /// widths if available. Skips cells with no width allocated. pub fn into_cells(self, vertical: bool) -> Vec { let mut result = Vec::new(); let mut colno = 0; let col_sizes = self.col_sizes.unwrap(); for mut cell in self.cells { let colspan = cell.colspan; let col_width = if vertical { col_sizes[colno] } else { col_sizes[colno..colno + cell.colspan].iter().sum::() }; // Skip any zero-width columns if col_width > 0 { cell.col_width = Some(col_width + cell.colspan - 1); result.push(RenderNode::new(RenderNodeInfo::TableCell(cell))); } colno += colspan; } result } } #[derive(Clone, Debug)] /// A representation of a table render tree with metadata. pub struct RenderTable { rows: Vec, num_columns: usize, size_estimate: Cell>, } impl RenderTable { /// Create a new RenderTable with the given rows pub fn new(rows: Vec) -> RenderTable { let num_columns = rows.iter().map(|r| r.num_cells()).max().unwrap_or(0); RenderTable { rows, num_columns, size_estimate: Cell::new(None), } } /// Return an iterator over the rows. pub fn rows(&self) -> std::slice::Iter { self.rows.iter() } /// Return an iterator over the rows. pub fn rows_mut(&mut self) -> std::slice::IterMut { self.rows.iter_mut() } /// Consume this and return a Vec containing the children; /// the children know the column sizes required. 
pub fn into_rows(self, col_sizes: Vec, vert: bool) -> Vec { self.rows .into_iter() .map(|mut tr| { tr.col_sizes = Some(col_sizes.clone()); RenderNode::new(RenderNodeInfo::TableRow(tr, vert)) }) .collect() } fn calc_size_estimate(&self) { if self.num_columns == 0 { self.size_estimate.set(Some(SizeEstimate { size: 0, min_width: 0, })); return; } let mut sizes: Vec = vec![Default::default(); self.num_columns]; // For now, a simple estimate based on adding up sub-parts. for row in self.rows() { let mut colno = 0usize; for cell in row.cells() { let cellsize = cell.get_size_estimate(); for colnum in 0..cell.colspan { sizes[colno + colnum].size += cellsize.size / cell.colspan; sizes[colno + colnum].min_width = max( sizes[colno + colnum].min_width / cell.colspan, cellsize.min_width, ); } colno += cell.colspan; } } let size = sizes.iter().map(|s| s.size).sum(); // Include borders? let min_width = sizes.iter().map(|s| s.min_width).sum::() + self.num_columns - 1; self.size_estimate .set(Some(SizeEstimate { size, min_width })); } /// Calculate and store (or return stored value) of estimated size pub fn get_size_estimate(&self) -> SizeEstimate { if self.size_estimate.get().is_none() { self.calc_size_estimate(); } self.size_estimate.get().unwrap() } } /// The node-specific information distilled from the DOM. #[derive(Clone, Debug)] pub enum RenderNodeInfo { /// Some text. Text(String), /// A group of nodes collected together. Container(Vec), /// A link with contained nodes Link(String, Vec), /// An emphasised region Em(Vec), /// A strong region Strong(Vec), /// A struck out region Strikeout(Vec), /// A code region Code(Vec), /// An image (title) Img(String), /// A block element with children Block(Vec), /// A header (h1, h2, ...) with children Header(usize, Vec), /// A Div element with children Div(Vec), /// A preformatted region. 
Pre(Vec), /// A blockquote BlockQuote(Vec), /// An unordered list Ul(Vec), /// An ordered list Ol(i64, Vec), /// A description list (containing Dt or Dd) Dl(Vec), /// A term (from a
) Dt(Vec), /// A definition (from a
) Dd(Vec), /// A line break Break, /// A table Table(RenderTable), /// A set of table rows (from either
or TableBody(Vec), /// Table row (must only appear within a table body) /// If the boolean is true, then the cells are drawn vertically /// instead of horizontally (because of space). TableRow(RenderTableRow, bool), /// Table cell (must only appear within a table row) TableCell(RenderTableCell), /// Start of a named HTML fragment FragStart(String), } /// Common fields from a node. #[derive(Clone, Debug)] pub struct RenderNode { size_estimate: Cell>, info: RenderNodeInfo, } impl RenderNode { /// Create a node from the RenderNodeInfo. pub fn new(info: RenderNodeInfo) -> RenderNode { RenderNode { size_estimate: Cell::new(None), info, } } /// Get a size estimate pub fn get_size_estimate(&self) -> SizeEstimate { // If it's already calculated, then just return the answer. if let Some(s) = self.size_estimate.get() { return s; }; use RenderNodeInfo::*; // Otherwise, make an estimate. let estimate = match self.info { Text(ref t) | Img(ref t) => { use unicode_width::UnicodeWidthStr; let mut len = t.trim().width(); // Add one for preceding whitespace. 
if let Some(true) = t.chars().next().map(|c| c.is_whitespace()) { len += 1; } SizeEstimate { size: len, min_width: len.min(MIN_WIDTH), } } Container(ref v) | Em(ref v) | Strong(ref v) | Strikeout(ref v) | Code(ref v) | Block(ref v) | Div(ref v) | Pre(ref v) | BlockQuote(ref v) | Dl(ref v) | Dt(ref v) | Dd(ref v) => v .iter() .map(RenderNode::get_size_estimate) .fold(Default::default(), SizeEstimate::add), Link(ref target, ref v) => v .iter() .map(RenderNode::get_size_estimate) .fold(Default::default(), SizeEstimate::add) .add(SizeEstimate { size: target.len() + 4, min_width: 4, }), Ul(ref v) => v .iter() .map(RenderNode::get_size_estimate) .fold(Default::default(), SizeEstimate::add) .add(SizeEstimate { size: 2, min_width: 2, }), Ol(i, ref v) => v .iter() .map(RenderNode::get_size_estimate) .fold(Default::default(), SizeEstimate::add) .add(SizeEstimate { size: i.to_string().len() + 2, min_width: i.to_string().len() + 2, }), Header(level, ref v) => v .iter() .map(RenderNode::get_size_estimate) .fold(Default::default(), SizeEstimate::add) .add(SizeEstimate { size: 0, min_width: MIN_WIDTH + level + 2, }), Break => SizeEstimate { size: 1, min_width: 1, }, Table(ref t) => t.get_size_estimate(), TableRow(..) | TableBody(_) | TableCell(_) => unimplemented!(), FragStart(_) => Default::default(), }; self.size_estimate.set(Some(estimate)); estimate } /// Return true if this node is definitely empty. This is used to quickly /// remove e.g. links with no anchor text in most cases, but can't recurse /// and look more deeply. pub fn is_shallow_empty(&self) -> bool { use RenderNodeInfo::*; // Otherwise, make an estimate. 
match self.info { Text(ref t) | Img(ref t) => { let len = t.trim().len(); len == 0 } Container(ref v) | Link(_, ref v) | Em(ref v) | Strong(ref v) | Strikeout(ref v) | Code(ref v) | Block(ref v) | Div(ref v) | Pre(ref v) | BlockQuote(ref v) | Dl(ref v) | Dt(ref v) | Dd(ref v) | Ul(ref v) | Ol(_, ref v) => v.is_empty(), Header(_level, ref v) => v.is_empty(), Break => true, Table(ref _t) => false, TableRow(..) | TableBody(_) | TableCell(_) => false, FragStart(_) => true, } } } fn precalc_size_estimate<'a>(node: &'a RenderNode) -> TreeMapResult<(), &'a RenderNode, ()> { use RenderNodeInfo::*; if node.size_estimate.get().is_some() { return TreeMapResult::Nothing; } match node.info { Text(_) | Img(_) | Break | FragStart(_) => { let _ = node.get_size_estimate(); TreeMapResult::Nothing } Container(ref v) | Link(_, ref v) | Em(ref v) | Strong(ref v) | Strikeout(ref v) | Code(ref v) | Block(ref v) | Div(ref v) | Pre(ref v) | BlockQuote(ref v) | Ul(ref v) | Ol(_, ref v) | Dl(ref v) | Dt(ref v) | Dd(ref v) | Header(_, ref v) => TreeMapResult::PendingChildren { children: v.iter().collect(), cons: Box::new(move |_, _cs| { node.get_size_estimate(); None }), prefn: None, postfn: None, }, Table(ref t) => { /* Return all the indirect children which are RenderNodes. */ let mut children = Vec::new(); for row in &t.rows { for cell in &row.cells { children.extend(cell.content.iter()); } } TreeMapResult::PendingChildren { children, cons: Box::new(move |_, _cs| { node.get_size_estimate(); None }), prefn: None, postfn: None, } } TableRow(..) | TableBody(_) | TableCell(_) => unimplemented!(), } } /// Make a Vec of RenderNodes from the children of a node. fn children_to_render_nodes(handle: Handle, err_out: &mut T) -> Vec { /* process children, but don't add anything */ let children = handle .children .borrow() .iter() .flat_map(|ch| dom_to_render_tree(ch.clone(), err_out)) .collect(); children } /// Make a Vec of RenderNodes from the
  • children of a node. fn list_children_to_render_nodes(handle: Handle, err_out: &mut T) -> Vec { let mut children = Vec::new(); for child in handle.children.borrow().iter() { match child.data { Element { ref name, .. } => match name.expanded() { expanded_name!(html "li") => { let li_children = children_to_render_nodes(child.clone(), err_out); children.push(RenderNode::new(RenderNodeInfo::Block(li_children))); } _ => {} }, Comment { .. } => {} _ => { html_trace!("Unhandled in list: {:?}\n", child); } } } children } /// Make a Vec of DtElements from the
/// `<dt>` and `<dd>`
    children of a node. fn desc_list_children_to_render_nodes( handle: Handle, err_out: &mut T, ) -> Vec { let mut children = Vec::new(); for child in handle.children.borrow().iter() { match child.data { Element { ref name, .. } => match name.expanded() { expanded_name!(html "dt") => { let dt_children = children_to_render_nodes(child.clone(), err_out); children.push(RenderNode::new(RenderNodeInfo::Dt(dt_children))); } expanded_name!(html "dd") => { let dd_children = children_to_render_nodes(child.clone(), err_out); children.push(RenderNode::new(RenderNodeInfo::Dd(dd_children))); } _ => {} }, Comment { .. } => {} _ => { html_trace!("Unhandled in list: {:?}\n", child); } } } children } /// Convert a table into a RenderNode fn table_to_render_tree<'a, 'b, T: Write>( handle: Handle, _err_out: &'b mut T, ) -> TreeMapResult<'a, (), Handle, RenderNode> { pending(handle, |_, rowset| { let mut rows = vec![]; for bodynode in rowset { if let RenderNodeInfo::TableBody(body) = bodynode.info { rows.extend(body); } else { html_trace!("Found in table: {:?}", bodynode.info); } } Some(RenderNode::new(RenderNodeInfo::Table(RenderTable::new( rows, )))) }) } /// Add rows from a thead or tbody. 
///
/// The children of `handle` are processed first (see `pending`).  The
/// `RenderTableRow` is taken out of every child that produced a
/// `TableRow` node, anything else is traced and dropped, and the rows are
/// returned wrapped in a `TableBody` node.
fn tbody_to_render_tree<'a, 'b, T: Write>(
    handle: Handle,
    _err_out: &'b mut T,
) -> TreeMapResult<'a, (), Handle, RenderNode> {
    pending(handle, |_, rowchildren| {
        let rows = rowchildren
            .into_iter()
            .filter_map(|rownode| {
                if let RenderNodeInfo::TableRow(row, _) = rownode.info {
                    Some(row)
                } else {
                    html_trace!(" [[tbody child: {:?}]]", rownode);
                    None
                }
            })
            .collect();
        Some(RenderNode::new(RenderNodeInfo::TableBody(rows)))
    })
}

/// Convert a table row to a RenderTableRow
///
/// The children of `handle` are processed first (see `pending`).  The
/// `RenderTableCell` is taken out of every child that produced a
/// `TableCell` node; anything else is traced and dropped.  The resulting
/// `TableRow` node has no column sizes yet (`col_sizes: None`) and its
/// "draw vertically" flag is `false`.
fn tr_to_render_tree<'a, 'b, T: Write>(
    handle: Handle,
    _err_out: &'b mut T,
) -> TreeMapResult<'a, (), Handle, RenderNode> {
    pending(handle, |_, cellnodes| {
        let cells = cellnodes
            .into_iter()
            .filter_map(|cellnode| {
                if let RenderNodeInfo::TableCell(cell) = cellnode.info {
                    Some(cell)
                } else {
                    html_trace!(" [[tr child: {:?}]]", cellnode);
                    None
                }
            })
            .collect();
        let row = RenderTableRow {
            cells,
            col_sizes: None,
        };
        Some(RenderNode::new(RenderNodeInfo::TableRow(row, false)))
    })
}

/// Convert a single table cell to a render node.
///
/// Reads the `colspan` attribute of the element (default 1; a value that
/// does not parse as a number also gives 1), then defers to `pending` so
/// the cell's children become its `content`.  The size estimate and the
/// column width of the new cell start out unset.
fn td_to_render_tree<'a, 'b, T: Write>(
    handle: Handle,
    _err_out: &'b mut T,
) -> TreeMapResult<'a, (), Handle, RenderNode> {
    let mut colspan = 1;
    if let Element { ref attrs, .. } = handle.data {
        for attr in attrs.borrow().iter() {
            if &attr.name.local == "colspan" {
                // NOTE(review): a value that parses successfully (including 0)
                // is used as-is; only parse failures fall back to 1.
                let v: &str = &*attr.value;
                colspan = v.parse().unwrap_or(1);
            }
        }
    }
    pending(handle, move |_, children| {
        let cell = RenderTableCell {
            colspan,
            content: children,
            size_estimate: Cell::new(None),
            col_width: None,
        };
        Some(RenderNode::new(RenderNodeInfo::TableCell(cell)))
    })
}

/// A reducer which combines results from mapping children into
/// the result for the current node.  Takes a context and a
/// vector of results and returns a new result (or nothing).
type ResultReducer<'a, C, R> = dyn Fn(&mut C, Vec<R>) -> Option<R> + 'a;

/// A closure to call before processing a child node.
type ChildPreFn<C, N> = dyn Fn(&mut C, &N);

/// A closure to call after processing a child node,
/// before adding the result to the processed results
/// vector.
type ChildPostFn = dyn Fn(&mut C, &R); /// The result of trying to render one node. enum TreeMapResult<'a, C, N, R> { /// A completed result. Finished(R), /// Deferred completion - can be turned into a result /// once the vector of children are processed. PendingChildren { children: Vec, cons: Box>, prefn: Option>>, postfn: Option>>, }, /// Nothing (e.g. a comment or other ignored element). Nothing, } fn tree_map_reduce<'a, C, N, R, M>(context: &mut C, top: N, mut process_node: M) -> Option where M: for<'c> FnMut(&'c mut C, N) -> TreeMapResult<'a, C, N, R>, { /// A node partially decoded, waiting for its children to /// be processed. struct PendingNode<'a, C, R, N> { /// How to make the node once finished construct: Box>, /// Called before processing each child prefn: Option>>, /// Called after processing each child postfn: Option>>, /// Children already processed children: Vec, /// Iterator of child nodes not yet processed to_process: std::vec::IntoIter, } let mut pending_stack = vec![PendingNode { // We only expect one child, which we'll just return. construct: Box::new(|_, mut cs| cs.pop()), prefn: None, postfn: None, children: Vec::new(), to_process: vec![top].into_iter(), }]; loop { // Get the next child node to process let next_node = pending_stack.last_mut().unwrap().to_process.next(); if let Some(h) = next_node { pending_stack .last_mut() .unwrap() .prefn .as_ref() .map(|ref f| f(context, &h)); match process_node(context, h) { TreeMapResult::Finished(result) => { pending_stack .last_mut() .unwrap() .postfn .as_ref() .map(|ref f| f(context, &result)); pending_stack.last_mut().unwrap().children.push(result); } TreeMapResult::PendingChildren { children, cons, prefn, postfn, } => { pending_stack.push(PendingNode { construct: cons, prefn, postfn, children: Vec::new(), to_process: children.into_iter(), }); } TreeMapResult::Nothing => {} }; } else { // No more children, so finally construct the parent. 
let completed = pending_stack.pop().unwrap(); let reduced = (completed.construct)(context, completed.children); if let Some(node) = reduced { if let Some(parent) = pending_stack.last_mut() { parent.postfn.as_ref().map(|ref f| f(context, &node)); parent.children.push(node); } else { // Finished the whole stack! break Some(node); } } else { /* Finished the stack, and have nothing */ if pending_stack.is_empty() { break None; } } } } } /// Convert a DOM tree or subtree into a render tree. pub fn dom_to_render_tree(handle: Handle, err_out: &mut T) -> Option { html_trace!("### dom_to_render_tree: HTML: {:?}", handle); let result = tree_map_reduce(&mut (), handle, |_, handle| { process_dom_node(handle, err_out) }); html_trace!("### dom_to_render_tree: out= {:#?}", result); result } fn pending<'a, F>(handle: Handle, f: F) -> TreeMapResult<'a, (), Handle, RenderNode> where //for<'a> F: Fn(&'a mut C, Vec) -> Option+'static for<'r> F: Fn(&'r mut (), std::vec::Vec) -> Option + 'static, { TreeMapResult::PendingChildren { children: handle.children.borrow().clone(), cons: Box::new(f), prefn: None, postfn: None, } } /// Prepend a FragmentStart (or analogous) marker to an existing /// RenderNode. fn prepend_marker(prefix: RenderNode, mut orig: RenderNode) -> RenderNode { use RenderNodeInfo::*; html_trace!("prepend_marker({:?}, {:?})", prefix, orig); match orig.info { // For block elements such as Block and Div, we need to insert // the node at the front of their children array, otherwise // the renderer is liable to drop the fragment start marker // _before_ the new line indicating the end of the previous // paragraph. // // For Container, we do the same thing just to make the data // less pointlessly nested. Block(ref mut children) | Div(ref mut children) | Pre(ref mut children) | BlockQuote(ref mut children) | Container(ref mut children) | TableCell(RenderTableCell { content: ref mut children, .. 
}) => { children.insert(0, prefix); // Now return orig, but we do that outside the match so // that we've given back the borrowed ref 'children'. } // For table rows and tables, push down if there's any content. TableRow(ref mut rrow, _) => { // If the row is empty, then there isn't really anything // to attach the fragment start to. if rrow.cells.len() > 0 { rrow.cells[0].content.insert(0, prefix); } } TableBody(ref mut rows) | Table(RenderTable { ref mut rows, .. }) => { // If the row is empty, then there isn't really anything // to attach the fragment start to. if rows.len() > 0 { let rrow = &mut rows[0]; if rrow.cells.len() > 0 { rrow.cells[0].content.insert(0, prefix); } } } // For anything else, just make a new Container with the // prefix node and the original one. _ => { let result = RenderNode::new(Container(vec![prefix, orig])); html_trace!("prepend_marker() -> {:?}", result); return result; } } html_trace!("prepend_marker() -> {:?}", &orig); orig } fn process_dom_node<'a, 'b, T: Write>( handle: Handle, err_out: &'b mut T, ) -> TreeMapResult<'a, (), Handle, RenderNode> { use RenderNodeInfo::*; use TreeMapResult::*; match handle.clone().data { Document => pending(handle, |&mut (), cs| Some(RenderNode::new(Container(cs)))), Comment { .. } => Nothing, Element { ref name, ref attrs, .. 
} => { let mut frag_from_name_attr = false; let result = match name.expanded() { expanded_name!(html "html") | expanded_name!(html "span") | expanded_name!(html "body") => { /* process children, but don't add anything */ pending(handle, |_, cs| Some(RenderNode::new(Container(cs)))) } expanded_name!(html "link") | expanded_name!(html "meta") | expanded_name!(html "hr") | expanded_name!(html "script") | expanded_name!(html "style") | expanded_name!(html "head") => { /* Ignore the head and its children */ Nothing } expanded_name!(html "a") => { let borrowed = attrs.borrow(); let mut target = None; frag_from_name_attr = true; for attr in borrowed.iter() { if &attr.name.local == "href" { target = Some(&*attr.value); break; } } PendingChildren { children: handle.children.borrow().clone(), cons: if let Some(href) = target { // We need the closure to own the string it's going to use. // Unfortunately that means we ideally want FnOnce; but // that doesn't yet work in a Box. Box does, but // is unstable. So we'll just move a string in and clone // it on use. 
let href: String = href.into(); Box::new(move |_, cs: Vec| { if cs.iter().any(|c| !c.is_shallow_empty()) { Some(RenderNode::new(Link(href.clone(), cs))) } else { None } }) } else { Box::new(|_, cs| Some(RenderNode::new(Container(cs)))) }, prefn: None, postfn: None, } } expanded_name!(html "em") => pending(handle, |_, cs| Some(RenderNode::new(Em(cs)))), expanded_name!(html "strong") => { pending(handle, |_, cs| Some(RenderNode::new(Strong(cs)))) } expanded_name!(html "s") => { pending(handle, |_, cs| Some(RenderNode::new(Strikeout(cs)))) } expanded_name!(html "code") => { pending(handle, |_, cs| Some(RenderNode::new(Code(cs)))) } expanded_name!(html "img") => { let borrowed = attrs.borrow(); let mut title = None; for attr in borrowed.iter() { if &attr.name.local == "alt" && !attr.value.is_empty() { title = Some(&*attr.value); break; } } if let Some(title) = title { Finished(RenderNode::new(Img(title.into()))) } else { Nothing } } expanded_name!(html "h1") | expanded_name!(html "h2") | expanded_name!(html "h3") | expanded_name!(html "h4") => { let level: usize = name.local[1..].parse().unwrap(); pending(handle, move |_, cs| { Some(RenderNode::new(Header(level, cs))) }) } expanded_name!(html "p") => { pending(handle, |_, cs| Some(RenderNode::new(Block(cs)))) } expanded_name!(html "div") => { pending(handle, |_, cs| Some(RenderNode::new(Div(cs)))) } expanded_name!(html "pre") => { pending(handle, |_, cs| Some(RenderNode::new(Pre(cs)))) } expanded_name!(html "br") => Finished(RenderNode::new(Break)), expanded_name!(html "table") => table_to_render_tree(handle.clone(), err_out), expanded_name!(html "thead") | expanded_name!(html "tbody") => { tbody_to_render_tree(handle.clone(), err_out) } expanded_name!(html "tr") => tr_to_render_tree(handle.clone(), err_out), expanded_name!(html "th") | expanded_name!(html "td") => { td_to_render_tree(handle.clone(), err_out) } expanded_name!(html "blockquote") => { pending(handle, |_, cs| Some(RenderNode::new(BlockQuote(cs)))) } 
expanded_name!(html "ul") => Finished(RenderNode::new(Ul( list_children_to_render_nodes(handle.clone(), err_out), ))), expanded_name!(html "ol") => { let borrowed = attrs.borrow(); let mut start = 1; for attr in borrowed.iter() { if &attr.name.local == "start" { start = attr.value.parse().ok().unwrap_or(1); break; } } Finished(RenderNode::new(Ol( start, list_children_to_render_nodes(handle.clone(), err_out), ))) } expanded_name!(html "dl") => Finished(RenderNode::new(Dl( desc_list_children_to_render_nodes(handle.clone(), err_out), ))), _ => { html_trace!("Unhandled element: {:?}\n", name.local); pending(handle, |_, cs| Some(RenderNode::new(Container(cs)))) //None } }; let mut fragment = None; let borrowed = attrs.borrow(); for attr in borrowed.iter() { if &attr.name.local == "id" || (frag_from_name_attr && &attr.name.local == "name") { fragment = Some(attr.value.to_string()); break; } } if let Some(fragname) = fragment { match result { Finished(node) => { Finished(prepend_marker(RenderNode::new(FragStart(fragname)), node)) } Nothing => Finished(RenderNode::new(FragStart(fragname))), PendingChildren { children, cons, prefn, postfn, } => { let fragname: String = fragname.into(); PendingChildren { children, prefn, postfn, cons: Box::new(move |ctx, ch| { let fragnode = RenderNode::new(FragStart(fragname.clone())); match cons(ctx, ch) { None => Some(fragnode), Some(node) => Some(prepend_marker(fragnode, node)), } }), } } } } else { result } } markup5ever_rcdom::NodeData::Text { contents: ref tstr } => { Finished(RenderNode::new(Text((&*tstr.borrow()).into()))) } _ => { // NodeData doesn't have a Debug impl. write!(err_out, "Unhandled node type.\n").unwrap(); Nothing } } } /// Context to use during tree parsing. /// This mainly gives access to a Renderer, but needs to be able to push /// new ones on for nested structures. 
#[derive(Clone, Debug)] struct BuilderStack { builders: Vec, } impl BuilderStack { pub fn new(builder: R) -> BuilderStack { BuilderStack { builders: vec![builder], } } /// Push a new builder onto the stack pub fn push(&mut self, builder: R) { self.builders.push(builder); } /// Pop off the top builder and return it. /// Panics if empty pub fn pop(&mut self) -> R { self.builders.pop().unwrap() } /// Pop off the only builder and return it. /// panics if there aren't exactly 1 available. pub fn into_inner(mut self) -> R { assert_eq!(self.builders.len(), 1); self.builders.pop().unwrap() } } impl Deref for BuilderStack { type Target = R; fn deref(&self) -> &R { self.builders.last().expect("Underflow in BuilderStack") } } impl DerefMut for BuilderStack { fn deref_mut(&mut self) -> &mut R { self.builders.last_mut().expect("Underflow in BuilderStack") } } fn render_tree_to_string( builder: R, tree: RenderNode, err_out: &mut T, ) -> R { /* Phase 1: get size estimates. */ tree_map_reduce(&mut (), &tree, |_, node| precalc_size_estimate(&node)); /* Phase 2: actually render. 
*/ let mut bs = BuilderStack::new(builder); tree_map_reduce(&mut bs, tree, |builders, node| { do_render_node(builders, node, err_out) }); bs.into_inner() } fn pending2< 'a, R: Renderer, F: Fn(&mut BuilderStack, Vec>) -> Option> + 'static, >( children: Vec, f: F, ) -> TreeMapResult<'a, BuilderStack, RenderNode, Option> { TreeMapResult::PendingChildren { children, cons: Box::new(f), prefn: None, postfn: None, } } fn do_render_node<'a, 'b, T: Write, R: Renderer>( builder: &mut BuilderStack, tree: RenderNode, err_out: &'b mut T, ) -> TreeMapResult<'static, BuilderStack, RenderNode, Option> { html_trace!("do_render_node({:?}", tree); use RenderNodeInfo::*; use TreeMapResult::*; match tree.info { Text(ref tstr) => { builder.add_inline_text(tstr); Finished(None) } Container(children) => pending2(children, |_, _| Some(None)), Link(href, children) => { builder.start_link(&href); pending2(children, |builder: &mut BuilderStack, _| { builder.end_link(); Some(None) }) } Em(children) => { builder.start_emphasis(); pending2(children, |builder: &mut BuilderStack, _| { builder.end_emphasis(); Some(None) }) } Strong(children) => { builder.start_strong(); pending2(children, |builder: &mut BuilderStack, _| { builder.end_strong(); Some(None) }) } Strikeout(children) => { builder.start_strikeout(); pending2(children, |builder: &mut BuilderStack, _| { builder.end_strikeout(); Some(None) }) } Code(children) => { builder.start_code(); pending2(children, |builder: &mut BuilderStack, _| { builder.end_code(); Some(None) }) } Img(title) => { builder.add_image(&title); Finished(None) } Block(children) => { builder.start_block(); pending2(children, |builder: &mut BuilderStack, _| { builder.end_block(); Some(None) }) } Header(level, children) => { let prefix = builder.header_prefix(level); let min_width = max(builder.width(), 1 + prefix.len()); let sub_builder = builder.new_sub_renderer(min_width - prefix.len()); builder.push(sub_builder); pending2(children, move |builder: &mut BuilderStack, _| { 
let sub_builder = builder.pop(); builder.start_block(); builder.append_subrender(sub_builder, repeat(&prefix[..])); builder.end_block(); Some(None) }) } Div(children) => { builder.new_line(); pending2(children, |builder: &mut BuilderStack, _| { builder.new_line(); Some(None) }) } Pre(children) => { builder.new_line(); builder.start_pre(); pending2(children, |builder: &mut BuilderStack, _| { builder.new_line(); builder.end_pre(); Some(None) }) } BlockQuote(children) => { let prefix = builder.quote_prefix(); let sub_builder = builder.new_sub_renderer(builder.width() - prefix.len()); builder.push(sub_builder); pending2(children, move |builder: &mut BuilderStack, _| { let sub_builder = builder.pop(); builder.start_block(); builder.append_subrender(sub_builder, repeat(&prefix[..])); builder.end_block(); Some(None) }) } Ul(items) => { builder.start_block(); let prefix = builder.unordered_item_prefix(); let prefix_len = prefix.len(); TreeMapResult::PendingChildren { children: items, cons: Box::new(|_, _| Some(None)), prefn: Some(Box::new(move |builder: &mut BuilderStack, _| { let sub_builder = builder.new_sub_renderer(builder.width() - prefix_len); builder.push(sub_builder); })), postfn: Some(Box::new(move |builder: &mut BuilderStack, _| { let sub_builder = builder.pop(); let indent = " ".repeat(prefix.len()); builder.append_subrender( sub_builder, once(&prefix[..]).chain(repeat(&indent[..])), ); })), } } Ol(start, items) => { builder.start_block(); let num_items = items.len(); // The prefix width could be at either end if the start is negative. let min_number = start; // Assumption: num_items can't overflow isize. 
let max_number = start + (num_items as i64) - 1; let prefix_width_min = builder.ordered_item_prefix(min_number).len(); let prefix_width_max = builder.ordered_item_prefix(max_number).len(); let prefix_width = max(prefix_width_min, prefix_width_max); let prefixn = format!("{: = Cell::new(start); TreeMapResult::PendingChildren { children: items, cons: Box::new(|_, _| Some(None)), prefn: Some(Box::new(move |builder: &mut BuilderStack, _| { let sub_builder = builder.new_sub_renderer(builder.width() - prefix_width); builder.push(sub_builder); })), postfn: Some(Box::new(move |builder: &mut BuilderStack, _| { let sub_builder = builder.pop(); let prefix1 = builder.ordered_item_prefix(i.get()); let prefix1 = format!("{: { builder.start_block(); TreeMapResult::PendingChildren { children: items, cons: Box::new(|_, _| Some(None)), prefn: None, postfn: None, } } Dt(children) => { builder.new_line(); builder.start_emphasis(); pending2(children, |builder: &mut BuilderStack, _| { builder.end_emphasis(); Some(None) }) } Dd(children) => { let sub_builder = builder.new_sub_renderer(builder.width() - 2); builder.push(sub_builder); pending2(children, |builder: &mut BuilderStack, _| { let sub_builder = builder.pop(); builder.append_subrender(sub_builder, repeat(" ")); Some(None) }) } Break => { builder.new_line_hard(); Finished(None) } Table(tab) => render_table_tree(builder.deref_mut(), tab, err_out), TableRow(row, false) => render_table_row(builder.deref_mut(), row, err_out), TableRow(row, true) => render_table_row_vert(builder.deref_mut(), row, err_out), TableBody(_) => unimplemented!("Unexpected TableBody while rendering"), TableCell(cell) => render_table_cell(builder.deref_mut(), cell, err_out), FragStart(fragname) => { builder.record_frag_start(&fragname); Finished(None) } } } fn render_table_tree( builder: &mut R, table: RenderTable, _err_out: &mut T, ) -> TreeMapResult<'static, BuilderStack, RenderNode, Option> { /* Now lay out the table. 
*/ let num_columns = table.num_columns; /* Heuristic: scale the column widths according to how much content there is. */ let mut col_sizes: Vec = vec![Default::default(); num_columns]; for row in table.rows() { let mut colno = 0; for cell in row.cells() { // FIXME: get_size_estimate is still recursive. let mut estimate = cell.get_size_estimate(); // If the cell has a colspan>1, then spread its size between the // columns. estimate.size /= cell.colspan; estimate.min_width /= cell.colspan; for i in 0..cell.colspan { col_sizes[colno + i] = (col_sizes[colno + i]).max(estimate); } colno += cell.colspan; } } // TODO: remove empty columns let tot_size: usize = col_sizes.iter().map(|est| est.size).sum(); let min_size: usize = col_sizes.iter().map(|est| est.min_width).sum::() + col_sizes.len().saturating_sub(1); let width = builder.width(); let vert_row = min_size > width; let mut col_widths: Vec = if !vert_row { col_sizes .iter() .map(|sz| { if sz.size == 0 { 0 } else { min( sz.size, if usize::MAX / width <= sz.size { // The provided width is too large to multiply by width, // so do it the other way around. 
max((width / tot_size) * sz.size, sz.min_width) } else { max(sz.size * width / tot_size, sz.min_width) }, ) } }) .collect() } else { col_sizes.iter().map(|_| width).collect() }; if !vert_row { loop { let cur_width = col_widths.iter().cloned().sum::(); if cur_width <= width { break; } let (i, _) = col_widths .iter() .cloned() .enumerate() .max_by_key(|&(colno, width)| { ( width.saturating_sub(col_sizes[colno].min_width), width, usize::max_value() - colno, ) }) .unwrap(); col_widths[i] -= 1; } } builder.start_block(); let table_width = if vert_row { width } else { col_widths.iter().cloned().sum::() + col_widths .iter() .filter(|&w| w > &0) .count() .saturating_sub(1) }; builder.add_horizontal_border_width(table_width); TreeMapResult::PendingChildren { children: table.into_rows(col_widths, vert_row), cons: Box::new(|_, _| Some(None)), prefn: Some(Box::new(|_, _| {})), postfn: Some(Box::new(|_, _| {})), } } fn render_table_row( _builder: &mut R, row: RenderTableRow, _err_out: &mut T, ) -> TreeMapResult<'static, BuilderStack, RenderNode, Option> { TreeMapResult::PendingChildren { children: row.into_cells(false), cons: Box::new(|builders, children| { let children: Vec<_> = children.into_iter().map(Option::unwrap).collect(); if children.iter().any(|c| !c.empty()) { builders.append_columns_with_borders(children, true); } Some(None) }), prefn: Some(Box::new(|builder: &mut BuilderStack, node| { if let RenderNodeInfo::TableCell(ref cell) = node.info { let sub_builder = builder.new_sub_renderer(cell.col_width.unwrap()); builder.push(sub_builder); } else { panic!() } })), postfn: Some(Box::new(|_builder: &mut BuilderStack, _| {})), } } fn render_table_row_vert( _builder: &mut R, row: RenderTableRow, _err_out: &mut T, ) -> TreeMapResult<'static, BuilderStack, RenderNode, Option> { TreeMapResult::PendingChildren { children: row.into_cells(true), cons: Box::new(|builders, children| { let children: Vec<_> = children.into_iter().map(Option::unwrap).collect(); 
builders.append_vert_row(children); Some(None) }), prefn: Some(Box::new(|builder: &mut BuilderStack, node| { if let RenderNodeInfo::TableCell(ref cell) = node.info { let sub_builder = builder.new_sub_renderer(cell.col_width.unwrap()); builder.push(sub_builder); } else { panic!() } })), postfn: Some(Box::new(|_builder: &mut BuilderStack, _| {})), } } fn render_table_cell( _builder: &mut R, cell: RenderTableCell, _err_out: &mut T, ) -> TreeMapResult<'static, BuilderStack, RenderNode, Option> { pending2(cell.content, |builder: &mut BuilderStack, _| { let sub_builder = builder.pop(); Some(Some(sub_builder)) }) } /// The structure of an HTML document that can be rendered using a [`TextDecorator`][]. /// /// [`TextDecorator`]: render/text_renderer/trait.TextDecorator.html #[derive(Clone, Debug)] pub struct RenderTree(RenderNode); impl RenderTree { /// Render this document using the given `decorator` and wrap it to `width` columns. pub fn render( self, width: usize, decorator: D, ) -> RenderedText { let builder = TextRenderer::new(width, decorator); let builder = render_tree_to_string(builder, self.0, &mut Discard {}); RenderedText(builder) } /// Render this document as plain text using the [`PlainDecorator`][] and wrap it to `width` /// columns. /// /// [`PlainDecorator`]: render/text_renderer/struct.PlainDecorator.html pub fn render_plain(self, width: usize) -> RenderedText { self.render(width, PlainDecorator::new()) } /// Render this document as rich text using the [`RichDecorator`][] and wrap it to `width` /// columns. /// /// [`RichDecorator`]: render/text_renderer/struct.RichDecorator.html pub fn render_rich(self, width: usize) -> RenderedText { self.render(width, RichDecorator::new()) } } /// A rendered HTML document. pub struct RenderedText(TextRenderer); impl RenderedText { /// Convert the rendered HTML document to a string. 
pub fn into_string(self) -> String { self.0.into_string() } /// Convert the rendered HTML document to a vector of lines with the annotations created by the /// decorator. pub fn into_lines(self) -> Vec>> { self.0 .into_lines() .into_iter() .map(RenderLine::into_tagged_line) .collect() } } /// Reads and parses HTML from `input` and prepares a render tree. pub fn parse(mut input: impl io::Read) -> RenderTree { let opts = ParseOpts { tree_builder: TreeBuilderOpts { drop_doctype: true, ..Default::default() }, ..Default::default() }; let dom = parse_document(RcDom::default(), opts) .from_utf8() .read_from(&mut input) .unwrap(); let render_tree = dom_to_render_tree(dom.document.clone(), &mut Discard {}).unwrap(); RenderTree(render_tree) } /// Reads HTML from `input`, decorates it using `decorator`, and /// returns a `String` with text wrapped to `width` columns. pub fn from_read_with_decorator( input: R, width: usize, decorator: D, ) -> String where R: io::Read, D: TextDecorator, { parse(input).render(width, decorator).into_string() } /// Reads HTML from `input`, and returns a `String` with text wrapped to /// `width` columns. pub fn from_read(input: R, width: usize) -> String where R: io::Read, { let decorator = PlainDecorator::new(); from_read_with_decorator(input, width, decorator) } /// Reads HTML from `input`, and returns text wrapped to `width` columns. /// The text is returned as a `Vec>`; the annotations are vectors /// of `RichAnnotation`. The "outer" annotation comes first in the `Vec`. 
pub fn from_read_rich(input: R, width: usize) -> Vec>> where R: io::Read, { parse(input) .render(width, RichDecorator::new()) .into_lines() } #[cfg(feature = "ansi_colours")] mod ansi_colours; #[cfg(feature = "ansi_colours")] pub use ansi_colours::from_read_coloured; #[cfg(test)] mod tests; html2text-0.4.4/src/macros.rs000064400000000000000000000033201046102023000142030ustar 00000000000000#[cfg(feature = "html_trace_bt")] extern crate backtrace; /* This is to work around a false positive for the clippy warning * `match_on_same_arms`. * See https://github.com/Manishearth/rust-clippy/issues/1390 */ #[cfg(not(feature = "html_trace"))] #[inline(always)] pub fn nop() {} #[cfg(feature = "html_trace")] #[macro_export] #[doc(hidden)] macro_rules! html_trace { ($fmt:expr) => { #[cfg(feature = "html_trace_bt")] { let bt = ::backtrace::Backtrace::new(); eprintln!( concat!($fmt, " at {:?}"), bt ); } #[cfg(not(feature = "html_trace_bt"))] { eprintln!($fmt); } }; ($fmt:expr, $( $args:expr ),*) => { #[cfg(feature = "html_trace_bt")] { let bt = ::backtrace::Backtrace::new(); eprintln!( concat!($fmt, " at {:?}"), $( $args ),* , bt ); } #[cfg(not(feature = "html_trace_bt"))] { eprintln!($fmt, $( $args ),*); } }; } #[cfg(not(feature = "html_trace"))] #[macro_export] #[doc(hidden)] macro_rules! html_trace { ($fmt:expr) => { $crate::macros::nop(); }; ($fmt:expr, $( $args:expr ),*) => { $crate::macros::nop(); }; } #[cfg(feature = "html_trace")] #[macro_export] #[doc(hidden)] macro_rules! html_trace_quiet { ($fmt:expr) => { eprintln!( $fmt ); }; ($fmt:expr, $( $args:expr ),*) => { eprintln!( $fmt, $( $args ),* ); }; } #[cfg(not(feature = "html_trace"))] #[macro_export] #[doc(hidden)] macro_rules! html_trace_quiet { ($fmt:expr) => { $crate::macros::nop(); }; ($fmt:expr, $( $args:expr ),*) => { $crate::macros::nop(); }; } html2text-0.4.4/src/markup5ever_rcdom.rs000064400000000000000000000363051046102023000163620ustar 00000000000000// Copyright 2014-2017 The html5ever Project Developers. 
See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! A simple reference-counted DOM. //! //! This is sufficient as a static parse tree, but don't build a //! web browser using it. :) //! //! A DOM is a [tree structure] with ordered children that can be represented in an XML-like //! format. For example, the following graph //! //! ```text //! div //! +- "text node" //! +- span //! ``` //! in HTML would be serialized as //! //! ```html //!
    text node
    //! ``` //! //! See the [document object model article on wikipedia][dom wiki] for more information. //! //! This implementation stores the information associated with each node once, and then hands out //! refs to children. The nodes themselves are reference-counted to avoid copying - you can create //! a new ref and then a node will outlive the document. Nodes own their children, but only have //! weak references to their parents. //! //! [tree structure]: https://en.wikipedia.org/wiki/Tree_(data_structure) //! [dom wiki]: https://en.wikipedia.org/wiki/Document_Object_Model extern crate markup5ever; extern crate tendril; use std::borrow::Cow; use std::cell::{Cell, RefCell}; use std::collections::{HashSet, VecDeque}; use std::default::Default; use std::fmt; use std::io; use std::mem; use std::rc::{Rc, Weak}; use tendril::StrTendril; use markup5ever::interface::tree_builder; use markup5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; use markup5ever::serialize::TraversalScope; use markup5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode}; use markup5ever::serialize::{Serialize, Serializer}; use markup5ever::Attribute; use markup5ever::ExpandedName; use markup5ever::QualName; /// The different kinds of nodes in the DOM. #[derive(Debug)] pub enum NodeData { /// The `Document` itself - the root node of a HTML document. Document, /// A `DOCTYPE` with name, public id, and system id. See /// [document type declaration on wikipedia][dtd wiki]. /// /// [dtd wiki]: https://en.wikipedia.org/wiki/Document_type_declaration Doctype { name: StrTendril, public_id: StrTendril, system_id: StrTendril, }, /// A text node. Text { contents: RefCell }, /// A comment. Comment { contents: StrTendril }, /// An element with attributes. Element { name: QualName, attrs: RefCell>, /// For HTML \ elements, the [template contents]. 
/// /// [template contents]: https://html.spec.whatwg.org/multipage/#template-contents template_contents: RefCell>, /// Whether the node is a [HTML integration point]. /// /// [HTML integration point]: https://html.spec.whatwg.org/multipage/#html-integration-point mathml_annotation_xml_integration_point: bool, }, /// A Processing instruction. ProcessingInstruction { target: StrTendril, contents: StrTendril, }, } /// A DOM node. pub struct Node { /// Parent node. pub parent: Cell>, /// Child nodes of this node. pub children: RefCell>, /// Represents this node's data. pub data: NodeData, } impl Node { /// Create a new node from its contents pub fn new(data: NodeData) -> Rc { Rc::new(Node { data, parent: Cell::new(None), children: RefCell::new(Vec::new()), }) } } impl Drop for Node { fn drop(&mut self) { let mut nodes = mem::replace(&mut *self.children.borrow_mut(), vec![]); while let Some(node) = nodes.pop() { let children = mem::replace(&mut *node.children.borrow_mut(), vec![]); nodes.extend(children.into_iter()); if let NodeData::Element { ref template_contents, .. } = node.data { if let Some(template_contents) = template_contents.borrow_mut().take() { nodes.push(template_contents); } } } } } impl fmt::Debug for Node { fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { fmt.debug_struct("Node") .field("data", &self.data) .field("children", &self.children) .finish() } } /// Reference to a DOM node. pub type Handle = Rc; /// Weak reference to a DOM node, used for parent pointers. 
pub type WeakHandle = Weak; /// Append a parentless node to another nodes' children fn append(new_parent: &Handle, child: Handle) { let previous_parent = child.parent.replace(Some(Rc::downgrade(new_parent))); // Invariant: child cannot have existing parent assert!(previous_parent.is_none()); new_parent.children.borrow_mut().push(child); } /// If the node has a parent, get it and this node's position in its children fn get_parent_and_index(target: &Handle) -> Option<(Handle, usize)> { if let Some(weak) = target.parent.take() { let parent = weak.upgrade().expect("dangling weak pointer"); target.parent.set(Some(weak)); let i = match parent .children .borrow() .iter() .enumerate() .find(|&(_, child)| Rc::ptr_eq(&child, &target)) { Some((i, _)) => i, None => panic!("have parent but couldn't find in parent's children!"), }; Some((parent, i)) } else { None } } fn append_to_existing_text(prev: &Handle, text: &str) -> bool { match prev.data { NodeData::Text { ref contents } => { contents.borrow_mut().push_slice(text); true }, _ => false, } } fn remove_from_parent(target: &Handle) { if let Some((parent, i)) = get_parent_and_index(target) { parent.children.borrow_mut().remove(i); target.parent.set(None); } } /// The DOM itself; the result of parsing. pub struct RcDom { /// The `Document` itself. pub document: Handle, /// Errors that occurred during parsing. pub errors: Vec>, /// The document's quirks mode. pub quirks_mode: QuirksMode, } impl TreeSink for RcDom { type Output = Self; fn finish(self) -> Self { self } type Handle = Handle; fn parse_error(&mut self, msg: Cow<'static, str>) { self.errors.push(msg); } fn get_document(&mut self) -> Handle { self.document.clone() } fn get_template_contents(&mut self, target: &Handle) -> Handle { if let NodeData::Element { ref template_contents, .. 
} = target.data { template_contents.borrow().as_ref().expect("not a template element!").clone() } else { panic!("not a template element!") } } fn set_quirks_mode(&mut self, mode: QuirksMode) { self.quirks_mode = mode; } fn same_node(&self, x: &Handle, y: &Handle) -> bool { Rc::ptr_eq(x, y) } fn elem_name<'a>(&self, target: &'a Handle) -> ExpandedName<'a> { return match target.data { NodeData::Element { ref name, .. } => name.expanded(), _ => panic!("not an element!"), }; } fn create_element( &mut self, name: QualName, attrs: Vec, flags: ElementFlags, ) -> Handle { Node::new(NodeData::Element { name, attrs: RefCell::new(attrs), template_contents: RefCell::new(if flags.template { Some(Node::new(NodeData::Document)) } else { None }), mathml_annotation_xml_integration_point: flags.mathml_annotation_xml_integration_point, }) } fn create_comment(&mut self, text: StrTendril) -> Handle { Node::new(NodeData::Comment { contents: text }) } fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Handle { Node::new(NodeData::ProcessingInstruction { target, contents: data, }) } fn append(&mut self, parent: &Handle, child: NodeOrText) { // Append to an existing Text node if we have one. match child { NodeOrText::AppendText(ref text) => match parent.children.borrow().last() { Some(h) => { if append_to_existing_text(h, &text) { return; } }, _ => (), }, _ => (), } append( &parent, match child { NodeOrText::AppendText(text) => Node::new(NodeData::Text { contents: RefCell::new(text), }), NodeOrText::AppendNode(node) => node, }, ); } fn append_before_sibling(&mut self, sibling: &Handle, child: NodeOrText) { let (parent, i) = get_parent_and_index(&sibling) .expect("append_before_sibling called on node without parent"); let child = match (child, i) { // No previous node. (NodeOrText::AppendText(text), 0) => Node::new(NodeData::Text { contents: RefCell::new(text), }), // Look for a text node before the insertion point. 
(NodeOrText::AppendText(text), i) => { let children = parent.children.borrow(); let prev = &children[i - 1]; if append_to_existing_text(prev, &text) { return; } Node::new(NodeData::Text { contents: RefCell::new(text), }) }, // The tree builder promises we won't have a text node after // the insertion point. // Any other kind of node. (NodeOrText::AppendNode(node), _) => node, }; remove_from_parent(&child); child.parent.set(Some(Rc::downgrade(&parent))); parent.children.borrow_mut().insert(i, child); } fn append_based_on_parent_node( &mut self, element: &Self::Handle, prev_element: &Self::Handle, child: NodeOrText, ) { let parent = element.parent.take(); let has_parent = parent.is_some(); element.parent.set(parent); if has_parent { self.append_before_sibling(element, child); } else { self.append(prev_element, child); } } fn append_doctype_to_document( &mut self, name: StrTendril, public_id: StrTendril, system_id: StrTendril, ) { append( &self.document, Node::new(NodeData::Doctype { name, public_id, system_id, }), ); } fn add_attrs_if_missing(&mut self, target: &Handle, attrs: Vec) { let mut existing = if let NodeData::Element { ref attrs, .. 
} = target.data { attrs.borrow_mut() } else { panic!("not an element") }; let existing_names = existing .iter() .map(|e| e.name.clone()) .collect::>(); existing.extend( attrs .into_iter() .filter(|attr| !existing_names.contains(&attr.name)), ); } fn remove_from_parent(&mut self, target: &Handle) { remove_from_parent(&target); } fn reparent_children(&mut self, node: &Handle, new_parent: &Handle) { let mut children = node.children.borrow_mut(); let mut new_children = new_parent.children.borrow_mut(); for child in children.iter() { let previous_parent = child.parent.replace(Some(Rc::downgrade(&new_parent))); assert!(Rc::ptr_eq( &node, &previous_parent.unwrap().upgrade().expect("dangling weak") )) } new_children.extend(mem::replace(&mut *children, Vec::new())); } fn is_mathml_annotation_xml_integration_point(&self, target: &Handle) -> bool { if let NodeData::Element { mathml_annotation_xml_integration_point, .. } = target.data { mathml_annotation_xml_integration_point } else { panic!("not an element!") } } } impl Default for RcDom { fn default() -> RcDom { RcDom { document: Node::new(NodeData::Document), errors: vec![], quirks_mode: tree_builder::NoQuirks, } } } enum SerializeOp { Open(Handle), Close(QualName), } pub struct SerializableHandle(Handle); impl From for SerializableHandle { fn from(h: Handle) -> SerializableHandle { SerializableHandle(h) } } impl Serialize for SerializableHandle { fn serialize(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()> where S: Serializer, { let mut ops = VecDeque::new(); match traversal_scope { IncludeNode => ops.push_back(SerializeOp::Open(self.0.clone())), ChildrenOnly(_) => ops.extend(self .0 .children .borrow() .iter() .map(|h| SerializeOp::Open(h.clone()))) } while let Some(op) = ops.pop_front() { match op { SerializeOp::Open(handle) => match handle.data { NodeData::Element { ref name, ref attrs, .. 
} => { serializer.start_elem( name.clone(), attrs.borrow().iter().map(|at| (&at.name, &at.value[..])), )?; ops.reserve(1 + handle.children.borrow().len()); ops.push_front(SerializeOp::Close(name.clone())); for child in handle.children.borrow().iter().rev() { ops.push_front(SerializeOp::Open(child.clone())); } }, NodeData::Doctype { ref name, .. } => serializer.write_doctype(&name)?, NodeData::Text { ref contents } => { serializer.write_text(&contents.borrow())? }, NodeData::Comment { ref contents } => serializer.write_comment(&contents)?, NodeData::ProcessingInstruction { ref target, ref contents, } => serializer.write_processing_instruction(target, contents)?, NodeData::Document => panic!("Can't serialize Document node itself"), }, SerializeOp::Close(name) => { serializer.end_elem(name)?; }, } } Ok(()) } } html2text-0.4.4/src/render/mod.rs000064400000000000000000000076011046102023000147630ustar 00000000000000//! Module containing the `Renderer` interface for constructing a //! particular text output. pub mod text_renderer; /// A type which is a backend for HTML to text rendering. pub trait Renderer { /// Add an empty line to the output (ie between blocks). fn add_empty_line(&mut self); /// Create a sub-renderer for nested blocks. fn new_sub_renderer(&self, width: usize) -> Self; /// Start a new block. fn start_block(&mut self); /// Mark the end of a block. fn end_block(&mut self); /// Start a new line, if necessary (but don't add a new line). fn new_line(&mut self); /// Start a new line. fn new_line_hard(&mut self); /// Add a horizontal table border. fn add_horizontal_border(&mut self); /// Add a horizontal border which is not the full width fn add_horizontal_border_width(&mut self, #[allow(unused_variables)] width: usize) { self.add_horizontal_border(); } /// Begin a preformatted block. Until the corresponding end, /// whitespace will used verbatim. Pre regions can nest. fn start_pre(&mut self); /// Finish a preformatted block started with `start_pre`. 
fn end_pre(&mut self); /// Add some inline text (which should be wrapped at the /// appropriate width) to the current block. fn add_inline_text(&mut self, text: &str); /// Return the current width in character cells fn width(&self) -> usize; /// Add a line to the current block without starting a new one. fn add_block_line(&mut self, line: &str); /// Add a new block from a sub renderer, and prefix every line by the /// corresponding text from each iteration of prefixes. fn append_subrender<'a, I>(&mut self, other: Self, prefixes: I) where I: Iterator; /// Append a set of sub renderers joined left-to-right with a vertical line, /// and add a horizontal line below. /// If collapse is true, then merge top/bottom borders of the subrenderer /// with the surrounding one. fn append_columns_with_borders(&mut self, cols: I, collapse: bool) where I: IntoIterator, Self: Sized; /// Append a set of sub renderers joined vertically with lines, for tables /// which would otherwise be too wide for the screen. fn append_vert_row(&mut self, cols: I) where I: IntoIterator, Self: Sized; /// Returns true if this renderer has no content. fn empty(&self) -> bool; /// Return the length of the contained text. fn text_len(&self) -> usize; /// Start a hyperlink /// TODO: return sub-builder or similar to make misuse /// of start/link harder? fn start_link(&mut self, target: &str); /// Finish a hyperlink started earlier. fn end_link(&mut self); /// Start an emphasised region fn start_emphasis(&mut self); /// Finish emphasised text started earlier. fn end_emphasis(&mut self); /// Start a strong region fn start_strong(&mut self); /// Finish strong text started earlier. fn end_strong(&mut self); /// Start a strikeout region fn start_strikeout(&mut self); /// Finish strikeout text started earlier. 
fn end_strikeout(&mut self); /// Start a code region fn start_code(&mut self); /// End a code region fn end_code(&mut self); /// Add an image fn add_image(&mut self, title: &str); /// Get prefix string of header in specific level. fn header_prefix(&mut self, level: usize) -> String; /// Get prefix string of quoted block. fn quote_prefix(&mut self) -> String; /// Get prefix string of unordered list item. fn unordered_item_prefix(&mut self) -> String; /// Get prefix string of ith ordered list item. fn ordered_item_prefix(&mut self, i: i64) -> String; /// Record the start of a named HTML fragment fn record_frag_start(&mut self, fragname: &str); } html2text-0.4.4/src/render/text_renderer.rs000064400000000000000000001542351046102023000170640ustar 00000000000000//! Implementations of the `Renderer` trait. //! //! This module implements helpers and concrete types for rendering from HTML //! into different text formats. use super::Renderer; use std::mem; use std::ops::Deref; use std::vec; use std::{collections::LinkedList, fmt::Debug}; use unicode_width::{UnicodeWidthChar, UnicodeWidthStr}; /// A wrapper around a String with extra metadata. #[derive(Debug, Clone, PartialEq)] pub struct TaggedString { /// The wrapped text. pub s: String, /// The metadata. pub tag: T, } impl TaggedString { /// Returns the tagged string’s display width in columns. /// /// See [`unicode_width::UnicodeWidthStr::width`][] for more information. /// /// [`unicode_width::UnicodeWidthStr::width`]: https://docs.rs/unicode-width/latest/unicode_width/trait.UnicodeWidthStr.html pub fn width(&self) -> usize { self.s.width() } } /// An element of a line of tagged text: either a TaggedString or a /// marker appearing in between document characters. #[derive(Clone, Debug, PartialEq)] pub enum TaggedLineElement { /// A string with tag information attached. Str(TaggedString), /// A zero-width marker indicating the start of a named HTML fragment. 
FragmentStart(String), } /// A line of tagged text (composed of a set of `TaggedString`s). #[derive(Debug, Clone, PartialEq)] pub struct TaggedLine { v: Vec>, } impl TaggedLine { /// Create an empty `TaggedLine`. pub fn new() -> TaggedLine { TaggedLine { v: Vec::new() } } /// Create a new TaggedLine from a string and tag. pub fn from_string(s: String, tag: &T) -> TaggedLine { TaggedLine { v: vec![TaggedLineElement::Str(TaggedString { s: s, tag: tag.clone(), })], } } /// Join the line into a String, ignoring the tags and markers. pub fn into_string(self) -> String { let mut s = String::new(); for tle in self.v { if let TaggedLineElement::Str(ts) = tle { s.push_str(&ts.s); } } s } /// Return true if the line is non-empty pub fn is_empty(&self) -> bool { self.v.len() == 0 } /// Add a new tagged string fragment to the line pub fn push_str(&mut self, ts: TaggedString) { use self::TaggedLineElement::Str; if !self.v.is_empty() { if let Str(ref mut ts_prev) = self.v.last_mut().unwrap() { if ts_prev.tag == ts.tag { ts_prev.s.push_str(&ts.s); return; } } } self.v.push(Str(ts)); } /// Add a new general TaggedLineElement to the line pub fn push(&mut self, tle: TaggedLineElement) { use self::TaggedLineElement::Str; if let Str(ts) = tle { self.push_str(ts); } else { self.v.push(tle); } } /// Add a new fragment to the start of the line pub fn insert_front(&mut self, ts: TaggedString) { use self::TaggedLineElement::Str; self.v.insert(0, Str(ts)); } /// Add text with a particular tag to self pub fn push_char(&mut self, c: char, tag: &T) { use self::TaggedLineElement::Str; if !self.v.is_empty() { if let Str(ref mut ts_prev) = self.v.last_mut().unwrap() { if ts_prev.tag == *tag { ts_prev.s.push(c); return; } } } let mut s = String::new(); s.push(c); self.v.push(Str(TaggedString { s, tag: tag.clone(), })); } /// Drain tl and use to extend self. pub fn consume(&mut self, tl: &mut TaggedLine) { for ts in tl.v.drain(..) 
{ self.push(ts); } } /// Drain the contained items pub fn drain_all(&mut self) -> vec::Drain> { self.v.drain(..) } /// Iterator over the chars in this line. #[cfg_attr(feature = "clippy", allow(needless_lifetimes))] pub fn chars<'a>(&'a self) -> Box + 'a> { use self::TaggedLineElement::Str; Box::new(self.v.iter().flat_map(|tle| { if let Str(ts) = tle { ts.s.chars() } else { "".chars() } })) } #[cfg(feature = "html_trace")] /// Return a string contents for debugging. fn to_string(&self) -> String { self.chars().collect() } /// Iterator over TaggedLineElements pub fn iter<'a>(&'a self) -> Box> + 'a> { Box::new(self.v.iter()) } /// Iterator over the tagged strings in this line, ignoring any fragments. pub fn tagged_strings(&self) -> impl Iterator> { self.v.iter().filter_map(|tle| match tle { TaggedLineElement::Str(ts) => Some(ts), _ => None, }) } /// Converts the tagged line into an iterator over the tagged strings in this line, ignoring /// any fragments. pub fn into_tagged_strings(self) -> impl Iterator> { self.v.into_iter().filter_map(|tle| match tle { TaggedLineElement::Str(ts) => Some(ts), _ => None, }) } /// Return the width of the line in cells pub fn width(&self) -> usize { self.tagged_strings().map(TaggedString::width).sum() } /// Pad this line to width with spaces (or if already at least this wide, do /// nothing). pub fn pad_to(&mut self, width: usize) { use self::TaggedLineElement::Str; let my_width = self.width(); if width > my_width { self.v.push(Str(TaggedString { s: format!("{: { width: usize, text: Vec>, textlen: usize, line: TaggedLine, linelen: usize, spacetag: Option, // Tag for the whitespace before the current word word: TaggedLine, // The current word (with no whitespace). wordlen: usize, pre_wrapped: bool, // If true, we've been forced to wrap a
     line.
    }
    
    impl WrappedBlock {
        pub fn new(width: usize) -> WrappedBlock {
            WrappedBlock {
                width,
                text: Vec::new(),
                textlen: 0,
                line: TaggedLine::new(),
                linelen: 0,
                spacetag: None,
                word: TaggedLine::new(),
                wordlen: 0,
                pre_wrapped: false,
            }
        }
    
        /// Move the pending word (if any) onto the current line, wrapping
        /// first when it does not fit.
        ///
        /// * If the word (plus a separating space when the line already has
        ///   text) fits in the space left on the line, it is appended.
        /// * Otherwise the current line is finished.  The word then starts
        ///   the next line or, when it is wider than a whole line, is split
        ///   over as many lines as it needs.
        ///
        /// A character which is wider than the whole block can never fit; it
        /// is emitted on a line of its own (overflowing the width) so that
        /// splitting always makes progress.
        ///
        /// The word buffer is left empty and `wordlen` is reset to zero.
        fn flush_word(&mut self) {
            use self::TaggedLineElement::Str;

            /* Finish the word. */
            html_trace_quiet!("flush_word: word={:?}, linelen={}", self.word, self.linelen);
            if !self.word.is_empty() {
                self.pre_wrapped = false;
                // `linelen` can exceed `width` (preformatted text pushes a
                // character onto a fresh line even when it is wider than the
                // block), so the subtraction must not underflow.
                let space_in_line = self.width.saturating_sub(self.linelen);
                let space_needed = self.wordlen + if self.linelen > 0 { 1 } else { 0 }; // space
                if space_needed <= space_in_line {
                    html_trace!("Got enough space");
                    if self.linelen > 0 {
                        self.line.push(Str(TaggedString {
                            s: " ".into(),
                            tag: self.spacetag.take().unwrap_or_default(),
                        }));
                        self.linelen += 1;
                        html_trace!("linelen incremented to {}", self.linelen);
                    }
                    self.line.consume(&mut self.word);
                    self.linelen += self.wordlen;
                    html_trace!("linelen increased by wordlen to {}", self.linelen);
                } else {
                    html_trace!("Not enough space");
                    /* Start a new line */
                    self.flush_line();
                    if self.wordlen <= self.width {
                        html_trace!("wordlen <= width");
                        let mut new_word = TaggedLine::new();
                        mem::swap(&mut new_word, &mut self.word);
                        mem::swap(&mut self.line, &mut new_word);
                        self.linelen = self.wordlen;
                        html_trace!("linelen set to wordlen {}", self.linelen);
                    } else {
                        html_trace!("Splitting the word");
                        /* We need to split the word. */
                        // `wordbits` keeps `self.word` mutably borrowed, so
                        // the loop below only touches other fields directly
                        // instead of calling `&mut self` methods.
                        let mut wordbits = self.word.drain_all();
                        /* Note: there's always at least one piece */
                        let mut opt_elt = wordbits.next();
                        let mut lineleft = self.width;
                        while let Some(elt) = opt_elt.take() {
                            html_trace!("Take element {:?}", elt);
                            if let Str(piece) = elt {
                                let w = piece.width();
                                if w <= lineleft {
                                    self.line.push(Str(piece));
                                    lineleft -= w;
                                    self.linelen += w;
                                    html_trace!("linelen had w={} added to {}", w, self.linelen);
                                    opt_elt = wordbits.next();
                                } else {
                                    /* Split into two */
                                    // Byte offset of the first character which
                                    // no longer fits on the current line.
                                    let mut split_idx = piece.s.len();
                                    let mut used = 0;
                                    for (idx, c) in piece.s.char_indices() {
                                        let c_w = UnicodeWidthChar::width(c).unwrap_or(0);
                                        if used + c_w > lineleft {
                                            split_idx = idx;
                                            break;
                                        }
                                        used += c_w;
                                    }
                                    if split_idx == 0 && lineleft == self.width {
                                        // Not even the first character fits on
                                        // an unused line, so no amount of
                                        // wrapping will help.  Take it anyway
                                        // rather than retrying forever.
                                        split_idx =
                                            piece.s.chars().next().map_or(0, char::len_utf8);
                                    }
                                    self.line.push(Str(TaggedString {
                                        s: piece.s[..split_idx].into(),
                                        tag: piece.tag.clone(),
                                    }));
                                    {
                                        let mut tmp_line = TaggedLine::new();
                                        mem::swap(&mut tmp_line, &mut self.line);
                                        self.text.push(tmp_line);
                                    }
                                    lineleft = self.width;
                                    self.linelen = 0;
                                    html_trace!("linelen set to zero here");
                                    opt_elt = if split_idx < piece.s.len() {
                                        // Retry the remainder on the new line.
                                        Some(Str(TaggedString {
                                            s: piece.s[split_idx..].into(),
                                            tag: piece.tag,
                                        }))
                                    } else {
                                        // The whole piece was consumed.
                                        wordbits.next()
                                    };
                                }
                            } else {
                                // Zero-width markers just go along with the line.
                                self.line.push(elt);
                                opt_elt = wordbits.next();
                            }
                        }
                    }
                }
            }
            self.wordlen = 0;
        }
    
        /// Finish the line being built and append it to the completed text.
        ///
        /// A line with no elements is left alone (and `linelen` untouched).
        fn flush_line(&mut self) {
            if self.line.is_empty() {
                return;
            }
            let finished = mem::replace(&mut self.line, TaggedLine::new());
            self.text.push(finished);
            self.linelen = 0;
        }
    
        /// Flush the pending word onto the line, then the line into the
        /// completed text.
        fn flush(&mut self) {
            // Order matters: the word has to reach the line before the line
            // is finished.
            self.flush_word();
            self.flush_line();
        }
    
        // Disabled untagged variant, kept for reference: consume self and
        // return a vector of lines.
        /*
        pub fn into_untagged_lines(mut self) -> Vec {
            self.flush();
    
            let mut result = Vec::new();
            for line in self.text.into_iter() {
                let mut line_s = String::new();
                for TaggedString{ s, .. } in line.into_iter() {
                    line_s.push_str(&s);
                }
                result.push(line_s);
            }
            result
        }
        */
    
        /// Consume self and return vector of lines including annotations.
        ///
        /// Any pending word and partial line are flushed first, so they are
        /// part of the returned lines.
        pub fn into_lines(mut self) -> Vec> {
            self.flush();

            self.text
        }
    
        /// Add inline text tagged with `tag`, to be word-wrapped.
        ///
        /// Whitespace only terminates the current word; every other character
        /// which has a display width is appended to the pending word.
        /// Characters without a width are dropped.
        pub fn add_text(&mut self, text: &str, tag: &T) {
            html_trace!("WrappedBlock::add_text({}), {:?}", text, tag);
            for ch in text.chars() {
                if !ch.is_whitespace() {
                    /* Part of a word (if it can be displayed at all). */
                    if let Some(cells) = UnicodeWidthChar::width(ch) {
                        self.word.push_char(ch, tag);
                        self.wordlen += cells;
                    }
                } else {
                    /* Whitespace ends the word; remember its tag for the
                     * space which may be emitted before the next word. */
                    self.flush_word();
                    self.spacetag = Some(tag.clone());
                }
                html_trace_quiet!("  Added char {:?}, wordlen={}", ch, self.wordlen);
            }
        }
    
        /// Add preformatted text: whitespace is kept and lines are only
        /// broken at `\n` or when they reach the block width.
        ///
        /// Text is tagged with `tag_main`, except for the continuation of a
        /// line which had to be wrapped, which gets `tag_wrapped` until the
        /// next `\n`.  Tabs are expanded with spaces to the next multiple of
        /// eight columns (always at least one space).  Other characters
        /// without a display width are ignored.
        pub fn add_preformatted_text(&mut self, text: &str, tag_main: &T, tag_wrapped: &T) {
            html_trace!(
                "WrappedBlock::add_preformatted_text({}), {:?}/{:?}",
                text,
                tag_main,
                tag_wrapped
            );
            // Make sure that any previous word has been sent to the line, as we
            // bypass the word buffer.
            self.flush_word();

            for c in text.chars() {
                if let Some(charwidth) = UnicodeWidthChar::width(c) {
                    if self.linelen + charwidth > self.width {
                        self.flush_line();
                        self.pre_wrapped = true;
                    }
                    self.line.push_char(
                        c,
                        if self.pre_wrapped {
                            tag_wrapped
                        } else {
                            tag_main
                        },
                    );
                    self.linelen += charwidth;
                } else {
                    match c {
                        '\n' => {
                            self.flush_line();
                            self.pre_wrapped = false;
                        }
                        '\t' => {
                            let tab_stop = 8;
                            let mut at_least_one_space = false;
                            while self.linelen % tab_stop != 0 || !at_least_one_space {
                                if self.linelen >= self.width {
                                    self.flush_line();
                                    if self.linelen >= self.width {
                                        // Flushing freed no room (e.g. a zero
                                        // width block): give up on the tab
                                        // instead of looping forever.
                                        break;
                                    }
                                } else {
                                    self.line.push_char(
                                        ' ',
                                        if self.pre_wrapped {
                                            tag_wrapped
                                        } else {
                                            tag_main
                                        },
                                    );
                                    self.linelen += 1;
                                    at_least_one_space = true;
                                }
                            }
                        }
                        _ => {
                            // Other control characters are dropped; only
                            // report them through the tracing macro rather
                            // than writing to stderr from library code.
                            html_trace!("Got character: {:?}", c);
                        }
                    }
                }
                html_trace_quiet!("  Added char {:?}", c);
            }
        }
    
        /// Append an element to the pending word.  `wordlen` is not changed
        /// here.
        pub fn add_element(&mut self, elt: TaggedLineElement) {
            self.word.push(elt);
        }
    
        /// Return the sum of `textlen`, the width of the current line and the
        /// width of the pending word.
        // NOTE(review): none of the methods visible here update `textlen`
        // after construction, so finished lines may not be counted - confirm.
        pub fn text_len(&self) -> usize {
            self.textlen + self.linelen + self.wordlen
        }
    
        /// Return true if `text_len()` is zero.
        pub fn is_empty(&self) -> bool {
            self.text_len() == 0
        }
    }
    
    /// Allow decorating/styling text.
    ///
    /// Decorating refers to adding extra text around the rendered version
    /// of some elements, such as surrounding emphasised text with `*` like
    /// in markdown: `Some *bold* text`.  The decorations are formatted and
    /// wrapped along with the rest of the rendered text.
    ///
    /// In addition, instances of `TextDecorator` can also return annotations
    /// of an associated type `Annotation` which will be associated with spans of
    /// text.  This can be anything from `()` as for `PlainDecorator` or a more
    /// featured type such as `RichAnnotation`.  The annotated spans (`TaggedLine`)
    /// can be used by application code to add e.g. terminal colours or underlines.
    pub trait TextDecorator {
        /// An annotation which can be added to text, and which will
        /// be attached to spans of text.
        type Annotation: Eq + PartialEq + Debug + Clone + Default;

        /// Return an annotation and rendering prefix for a link.
        fn decorate_link_start(&mut self, url: &str) -> (String, Self::Annotation);

        /// Return a suffix for after a link.
        fn decorate_link_end(&mut self) -> String;

        /// Return an annotation and rendering prefix for em
        fn decorate_em_start(&mut self) -> (String, Self::Annotation);

        /// Return a suffix for after an em.
        fn decorate_em_end(&mut self) -> String;

        /// Return an annotation and rendering prefix for strong
        fn decorate_strong_start(&mut self) -> (String, Self::Annotation);

        /// Return a suffix for after a strong.
        fn decorate_strong_end(&mut self) -> String;

        /// Return an annotation and rendering prefix for strikeout
        fn decorate_strikeout_start(&mut self) -> (String, Self::Annotation);

        /// Return a suffix for after a strikeout.
        fn decorate_strikeout_end(&mut self) -> String;

        /// Return an annotation and rendering prefix for code
        fn decorate_code_start(&mut self) -> (String, Self::Annotation);

        /// Return a suffix for after a code region.
        fn decorate_code_end(&mut self) -> String;

        /// Return an annotation for the initial part of a preformatted line
        fn decorate_preformat_first(&mut self) -> Self::Annotation;

        /// Return an annotation for a continuation line when a preformatted
        /// line doesn't fit.
        fn decorate_preformat_cont(&mut self) -> Self::Annotation;

        /// Return an annotation and rendering string for an image.
        fn decorate_image(&mut self, title: &str) -> (String, Self::Annotation);

        /// Return prefix string of header in specific level.
        fn header_prefix(&mut self, level: usize) -> String;

        /// Return prefix string of quoted block.
        fn quote_prefix(&mut self) -> String;

        /// Return prefix string of unordered list item.
        fn unordered_item_prefix(&mut self) -> String;

        /// Return prefix string of ith ordered list item.
        fn ordered_item_prefix(&mut self, i: i64) -> String;

        /// Return a new decorator of the same type which can be used
        /// for sub blocks.
        fn make_subblock_decorator(&self) -> Self;

        /// Finish with a document, and return extra lines (eg footnotes)
        /// to add to the rendered text.
        fn finalise(self) -> Vec>;
    }
    
    /// A single cell of a horizontal border line.
    #[derive(Copy, Clone, Debug)]
    pub enum BorderSegHoriz {
        /// Pure horizontal line
        Straight,
        /// Joined with a line above
        JoinAbove,
        /// Joins with a line below
        JoinBelow,
        /// Joins both ways
        JoinCross,
        /// Horizontal line, but separating two table cells from a row
        /// which wouldn't fit next to each other.
        StraightVert,
    }

    /// A dividing line between table rows which tracks intersections
    /// with vertical lines.
    #[derive(Clone, Debug)]
    pub struct BorderHoriz {
        /// The segments for the line, one per cell from left to right.
        pub segments: Vec<BorderSegHoriz>,
    }

    impl BorderHoriz {
        /// Create a new blank border line of `width` straight segments.
        pub fn new(width: usize) -> BorderHoriz {
            BorderHoriz::new_type(width, BorderSegHoriz::Straight)
        }

        /// Create a new border line of `width` segments which are all `linetype`.
        pub fn new_type(width: usize, linetype: BorderSegHoriz) -> BorderHoriz {
            BorderHoriz {
                segments: vec![linetype; width],
            }
        }

        /// Stretch the line to at least the specified width by appending
        /// straight segments.  A line which is already wide enough is left
        /// unchanged (it is never shortened).
        pub fn stretch_to(&mut self, width: usize) {
            if width > self.segments.len() {
                self.segments.resize(width, BorderSegHoriz::Straight);
            }
        }

        /// Make a join to a line above at the xth cell, stretching the line
        /// first if it is too short.
        pub fn join_above(&mut self, x: usize) {
            use self::BorderSegHoriz::*;
            self.stretch_to(x + 1);
            let prev = self.segments[x];
            self.segments[x] = match prev {
                Straight | JoinAbove => JoinAbove,
                JoinBelow | JoinCross => JoinCross,
                // Separators for vertically arranged rows never get joins.
                StraightVert => StraightVert,
            }
        }

        /// Make a join to a line below at the xth cell, stretching the line
        /// first if it is too short.
        pub fn join_below(&mut self, x: usize) {
            use self::BorderSegHoriz::*;
            self.stretch_to(x + 1);
            let prev = self.segments[x];
            self.segments[x] = match prev {
                Straight | JoinBelow => JoinBelow,
                JoinAbove | JoinCross => JoinCross,
                // Separators for vertically arranged rows never get joins.
                StraightVert => StraightVert,
            }
        }

        /// Merge a (possibly partial) border line below into this one.
        ///
        /// `pos` is the cell in `self` where `other` starts.  Every join in
        /// `other`, whatever its direction, becomes a join below in `self`.
        pub fn merge_from_below(&mut self, other: &BorderHoriz, pos: usize) {
            use self::BorderSegHoriz::*;
            for (idx, seg) in other.segments.iter().enumerate() {
                match *seg {
                    Straight | StraightVert => (),
                    JoinAbove | JoinBelow | JoinCross => {
                        self.join_below(idx + pos);
                    }
                }
            }
        }

        /// Merge a (possibly partial) border line above into this one.
        ///
        /// `pos` is the cell in `self` where `other` starts.  Every join in
        /// `other`, whatever its direction, becomes a join above in `self`.
        pub fn merge_from_above(&mut self, other: &BorderHoriz, pos: usize) {
            use self::BorderSegHoriz::*;
            for (idx, seg) in other.segments.iter().enumerate() {
                match *seg {
                    Straight | StraightVert => (),
                    JoinAbove | JoinBelow | JoinCross => {
                        self.join_above(idx + pos);
                    }
                }
            }
        }

        /// Return a string of spaces and vertical lines which would match
        /// just above this line.
        pub fn to_vertical_lines_above(&self) -> String {
            use self::BorderSegHoriz::*;
            self.segments
                .iter()
                .map(|seg| match *seg {
                    Straight | JoinBelow | StraightVert => ' ',
                    JoinAbove | JoinCross => '│',
                })
                .collect()
        }

        /// The drawing character used for one segment.
        fn seg_char(seg: BorderSegHoriz) -> char {
            match seg {
                BorderSegHoriz::Straight => '─',
                BorderSegHoriz::StraightVert => '/',
                BorderSegHoriz::JoinAbove => '┴',
                BorderSegHoriz::JoinBelow => '┬',
                BorderSegHoriz::JoinCross => '┼',
            }
        }

        /// Turn into a string with drawing characters
        pub fn into_string(self) -> String {
            self.to_string()
        }

        /// Return a string without destroying self
        pub fn to_string(&self) -> String {
            // Render straight from the borrowed segments; no need to clone
            // the whole line first.
            self.segments
                .iter()
                .map(|&seg| Self::seg_char(seg))
                .collect()
        }
    }
    
    /// A rendered output line: either tagged text or a horizontal table
    /// border.
    #[derive(Clone, Debug)]
    pub enum RenderLine {
        /// Some rendered text
        Text(TaggedLine),
        /// A table border line
        Line(BorderHoriz),
    }
    
    impl RenderLine {
        /// Turn the rendered line into a String, using the `into_string`
        /// of whichever kind of line this is.
        pub fn into_string(self) -> String {
            match self {
                RenderLine::Text(tagged) => tagged.into_string(),
                RenderLine::Line(border) => border.into_string(),
            }
        }
    
        /// Convert into a `TaggedLine`.
        ///
        /// Text lines are returned unchanged; a border is drawn into a string
        /// which becomes the single piece of a new line, tagged with the
        /// default tag.
        pub fn into_tagged_line(self) -> TaggedLine {
            use self::TaggedLineElement::Str;

            let border = match self {
                RenderLine::Text(tagged) => return tagged,
                RenderLine::Line(border) => border,
            };
            let drawn = TaggedString {
                s: border.into_string(),
                tag: T::default(),
            };
            let mut squashed = TaggedLine::new();
            squashed.push(Str(drawn));
            squashed
        }
    
        #[cfg(feature = "html_trace")]
        /// For testing, return a simple string of the contents.
        ///
        /// Only compiled when the `html_trace` feature is enabled.
        fn to_string(&self) -> String {
            match self {
                RenderLine::Text(tagged) => tagged.to_string(),
                RenderLine::Line(border) => border.to_string(),
            }
        }
    }
    
    /// A renderer which just outputs plain text with
    /// annotations depending on a decorator.
    #[derive(Clone)]
    pub struct TextRenderer {
        width: usize,
        lines: LinkedList>>,
        /// True at the end of a block, meaning we should add
        /// a blank line if any other text is added.
        at_block_end: bool,
        wrapping: Option>>,
        decorator: Option,
        ann_stack: Vec,
        text_filter_stack: Vec Option>,
        /// The depth of `<pre>` block stacking.
        pre_depth: usize,
    }
    
    impl std::fmt::Debug for TextRenderer {
        /// Format the renderer for debugging.  Only the width, lines,
        /// decorator, annotation stack and pre depth are shown.
        fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
            let mut out = f.debug_struct("TextRenderer");
            out.field("width", &self.width);
            out.field("lines", &self.lines);
            out.field("decorator", &self.decorator);
            out.field("ann_stack", &self.ann_stack);
            out.field("pre_depth", &self.pre_depth);
            out.finish()
        }
    }
    
    impl TextRenderer {
        /// Construct a new empty TextRenderer.
        pub fn new(width: usize, decorator: D) -> TextRenderer {
            html_trace!("new({})", width);
            TextRenderer {
                width,
                lines: LinkedList::new(),
                at_block_end: false,
                wrapping: None,
                decorator: Some(decorator),
                ann_stack: Vec::new(),
                pre_depth: 0,
                text_filter_stack: Vec::new(),
            }
        }
    
        /// Make sure there is a current wrapping block, creating an empty one
        /// at the renderer's width if needed.
        fn ensure_wrapping_exists(&mut self) {
            let width = self.width;
            self.wrapping
                .get_or_insert_with(|| WrappedBlock::new(width));
        }
    
        /// Get the current line wrapping context (and create if
        /// needed).
        fn current_text(&mut self) -> &mut WrappedBlock> {
            self.ensure_wrapping_exists();
            // Cannot fail: the call above has just made sure it is `Some`.
            self.wrapping.as_mut().unwrap()
        }
    
        /// Add a prerendered (multiline) string with the current annotations.
        pub fn add_subblock(&mut self, s: &str) {
            use self::TaggedLineElement::Str;
    
            html_trace!("add_subblock({}, {})", self.width, s);
            let tag = self.ann_stack.clone();
            self.lines.extend(s.lines().map(|l| {
                let mut line = TaggedLine::new();
                line.push(Str(TaggedString {
                    s: l.into(),
                    tag: tag.clone(),
                }));
                RenderLine::Text(line)
            }));
        }
    
        /// Flushes the current wrapped block into the lines, leaving no
        /// current block.  Does nothing if there isn't one.
        fn flush_wrapping(&mut self) {
            let block = match self.wrapping.take() {
                Some(block) => block,
                None => return,
            };
            for line in block.into_lines() {
                self.lines.push_back(RenderLine::Text(line));
            }
        }
    
        /// Flush everything which is pending into the output lines.
        ///
        /// As written this only flushes the current wrapped text.
        fn flush_all(&mut self) {
            self.flush_wrapping();
        }
    
        /// Consumes this renderer and returns a multiline `String` with the
        /// result; every line is followed by a `\n`.
        pub fn into_string(self) -> String {
            // Only needed for the trace below, and `self` is about to be
            // consumed.
            #[cfg(feature = "html_trace")]
            let width: usize = self.width;
            let result = self
                .into_lines()
                .into_iter()
                .fold(String::new(), |mut text, line| {
                    text.push_str(&line.into_string());
                    text.push('\n');
                    text
                });
            html_trace!("into_string({}, {:?})", width, result);
            result
        }
    
        #[cfg(feature = "html_trace")]
        /// Returns a string of the current builder contents (for testing).
        ///
        /// Only lines which have already been flushed are included.
        fn to_string(&self) -> String {
            self.lines
                .iter()
                .map(|line| line.to_string() + "\n")
                .collect()
        }
    
        /// Consumes this renderer and returns the list of rendered lines.
        ///
        /// Pending wrapped text is flushed first.  If the decorator's
        /// `finalise()` returns any trailer lines, they are added as a new
        /// block after the text, hard-wrapped to the renderer's width.
        pub fn into_lines(mut self) -> LinkedList>> {
            self.flush_wrapping();
            // And add the links
            let mut trailer = self.decorator.take().unwrap().finalise();
            if !trailer.is_empty() {
                self.start_block();
                for line in trailer.drain(0..) {
                    /* Hard wrap */
                    // `pos` is the width already used on the output line
                    // being built.
                    let mut pos = 0;
                    let mut wrapped_line = TaggedLine::new();
                    for ts in line.into_tagged_strings() {
                        // FIXME: should we percent-escape?  This is probably
                        // an invalid URL to start with.
                        let s = ts.s.replace('\n', " ");
                        let tag = vec![ts.tag];

                        let width = s.width();
                        if pos + width > self.width {
                            // split the string and start a new line
                            let mut buf = String::new();
                            for c in s.chars() {
                                let c_width = UnicodeWidthChar::width(c).unwrap_or(0);
                                if pos + c_width > self.width {
                                    // This character would overflow: emit what
                                    // has been collected and continue on a
                                    // fresh line.
                                    if !buf.is_empty() {
                                        wrapped_line.push_str(TaggedString {
                                            s: buf,
                                            tag: tag.clone(),
                                        });
                                        buf = String::new();
                                    }

                                    self.lines.push_back(RenderLine::Text(wrapped_line));
                                    wrapped_line = TaggedLine::new();
                                    pos = 0;
                                }
                                pos += c_width;
                                buf.push(c);
                            }
                            wrapped_line.push_str(TaggedString { s: buf, tag });
                        } else {
                            wrapped_line.push_str(TaggedString {
                                s: s.to_owned(),
                                tag,
                            });
                            pos += width;
                        }
                    }
                    self.lines.push_back(RenderLine::Text(wrapped_line));
                }
            }
            self.lines
        }
    
        /// Append a border line to the output, after flushing any pending
        /// wrapped text so that it stays above the border.
        fn add_horizontal_line(&mut self, line: BorderHoriz) {
            self.flush_wrapping();
            self.lines.push_back(RenderLine::Line(line));
        }
    }
    
    /// Return a copy of `s` in which every character with a non-zero display
    /// width is followed by U+0336 (combining long stroke overlay).
    ///
    /// Always returns `Some`.
    fn filter_text_strikeout(s: &str) -> Option {
        let mut struck = String::new();
        for ch in s.chars() {
            struck.push(ch);
            // Zero-width characters (combining marks etc.) and characters
            // without a width are not struck out themselves.
            match UnicodeWidthChar::width(ch) {
                Some(cells) if cells > 0 => struck.push('\u{336}'),
                _ => {}
            }
        }
        Some(struck)
    }
    
    impl Renderer for TextRenderer {
        /// Emit a blank line.
        ///
        /// Flushes pending content with `flush_all()`, pushes an empty
        /// `TaggedLine` onto the output and clears `at_block_end`.
        fn add_empty_line(&mut self) {
            html_trace!("add_empty_line()");
            self.flush_all();
            self.lines.push_back(RenderLine::Text(TaggedLine::new()));
            html_trace_quiet!("add_empty_line: at_block_end <- false");
            self.at_block_end = false;
            html_trace_quiet!("add_empty_line: new lines: {:?}", self.lines);
        }
    
        /// Build a new renderer of the given `width` whose decorator is
        /// obtained from this renderer's decorator via
        /// `make_subblock_decorator()`.
        ///
        /// # Panics
        ///
        /// Panics if this renderer currently has no decorator.
        fn new_sub_renderer(&self, width: usize) -> Self {
            let sub_decorator = self
                .decorator
                .as_ref()
                .unwrap()
                .make_subblock_decorator();
            TextRenderer::new(width, sub_decorator)
        }
    
        /// Begin a new block.
        ///
        /// Flushes pending content with `flush_all()`; if any lines have
        /// already been emitted, a blank line is added via `add_empty_line()`.
        /// `at_block_end` is cleared before returning.
        fn start_block(&mut self) {
            html_trace!("start_block({})", self.width);
            self.flush_all();
            if !self.lines.is_empty() {
                self.add_empty_line();
            }
            html_trace_quiet!("start_block; at_block_end <- false");
            self.at_block_end = false;
        }
    
        /// Start a new line by flushing everything pending with `flush_all()`.
        fn new_line(&mut self) {
            self.flush_all();
        }
    
        /// Handle a hard line break.
        ///
        /// When there is no wrapping block, or the wrapping block has both
        /// `linelen` and `wordlen` equal to zero, an empty line is added.
        /// Otherwise the pending content is flushed with `flush_all()`.
        fn new_line_hard(&mut self) {
            match self.wrapping {
                None
                | Some(WrappedBlock {
                    linelen: 0,
                    wordlen: 0,
                    ..
                }) => self.add_empty_line(),
                Some(_) => self.flush_all(),
            }
        }
    
        /// Append a horizontal border spanning this renderer's full width.
        fn add_horizontal_border(&mut self) {
            let border = BorderHoriz::new(self.width);
            // `add_horizontal_line` flushes the wrapping buffer and then
            // pushes the border as a `RenderLine::Line`.
            self.add_horizontal_line(border);
        }
    
        /// Append a horizontal border of exactly `width` columns.
        fn add_horizontal_border_width(&mut self, width: usize) {
            let border = BorderHoriz::new(width);
            // `add_horizontal_line` flushes the wrapping buffer and then
            // pushes the border as a `RenderLine::Line`.
            self.add_horizontal_line(border);
        }
    
        /// Enter a preformatted section by incrementing `pre_depth`.
        fn start_pre(&mut self) {
            self.pre_depth += 1;
        }
    
        /// Leave a preformatted section by decrementing `pre_depth`.
        ///
        /// # Panics
        ///
        /// Panics if `pre_depth` is already zero, i.e. there is no open
        /// preformatted section to close.
        fn end_pre(&mut self) {
            if self.pre_depth == 0 {
                panic!("Attempt to end a preformatted block which wasn't opened.");
            }
            self.pre_depth -= 1;
        }
    
        /// Mark the current block as finished by setting `at_block_end`.
        ///
        /// `add_inline_text` consults this flag to skip whitespace between
        /// blocks and to start a new block when more text arrives.
        fn end_block(&mut self) {
            self.at_block_end = true;
        }
    
        /// Add a run of inline text to the block being built.
        ///
        /// * Whitespace-only text that arrives right after a block has ended
        ///   (while `pre_depth` is zero) is ignored.
        /// * If a block has just ended, a new one is started first.
        /// * The text is passed through each filter on `text_filter_stack` in
        ///   order; a filter returning `None` leaves the text as it was.
        /// * With `pre_depth == 0` the result is handed to `add_text` together
        ///   with the current annotation stack.  Otherwise it is handed to
        ///   `add_preformatted_text` with two copies of the annotation stack,
        ///   extended by the decorator's "first" and "continuation" preformat
        ///   annotations respectively.
        ///
        /// # Panics
        ///
        /// Panics if `self.wrapping` is still `None` after `current_text()`
        /// has been called, or (in preformatted mode) if there is no decorator.
        fn add_inline_text(&mut self, text: &str) {
            html_trace!("add_inline_text({}, {})", self.width, text);
            if self.pre_depth == 0 && self.at_block_end && text.chars().all(char::is_whitespace) {
                // Ignore whitespace between blocks.
                return;
            }
            if self.at_block_end {
                self.start_block();
            }
            // ensure wrapping is set
            let _ = self.current_text();

            // Run the text through the filter stack.  `filtered` stays `None`
            // until some filter actually produces replacement text, so the
            // unfiltered case never allocates.
            let mut filtered: Option<String> = None;
            for filter in &self.text_filter_stack {
                let srctext = filtered.as_deref().unwrap_or(text);
                if let Some(replacement) = filter(srctext) {
                    filtered = Some(replacement);
                }
            }
            let filtered_text = filtered.as_deref().unwrap_or(text);

            if self.pre_depth == 0 {
                self.wrapping
                    .as_mut()
                    .unwrap()
                    .add_text(filtered_text, &self.ann_stack);
            } else {
                let mut tag_first = self.ann_stack.clone();
                let mut tag_cont = self.ann_stack.clone();
                tag_first.push(self.decorator.as_mut().unwrap().decorate_preformat_first());
                tag_cont.push(self.decorator.as_mut().unwrap().decorate_preformat_cont());
                self.wrapping.as_mut().unwrap().add_preformatted_text(
                    filtered_text,
                    &tag_first,
                    &tag_cont,
                );
            }
        }
    
        /// Return the width this renderer was configured with.
        fn width(&self) -> usize {
            self.width
        }
    
        /// Add `line` as a block line; this simply forwards to `add_subblock`.
        fn add_block_line(&mut self, line: &str) {
            self.add_subblock(line);
        }
    
        /// Append the output of another renderer, prefixing each of its lines.
        ///
        /// The lines of `other` are paired with `prefixes` one-to-one; pairing
        /// stops as soon as either side runs out, so lines without a matching
        /// prefix are not appended.
        ///
        /// * Text lines get the prefix inserted at the front (skipped when the
        ///   prefix is empty).
        /// * Border lines are converted to text: the prefix followed by the
        ///   border's string form.
        ///
        /// Inserted prefixes and converted borders are tagged with a copy of
        /// this renderer's current annotation stack.
        fn append_subrender<'a, I>(&mut self, other: Self, prefixes: I)
        where
            I: Iterator<Item = &'a str>,
        {
            use self::TaggedLineElement::Str;

            self.flush_wrapping();
            let tag = self.ann_stack.clone();
            let prefixed_lines = other
                .into_lines()
                .into_iter()
                .zip(prefixes)
                .map(|(line, prefix)| match line {
                    RenderLine::Text(mut tline) => {
                        if !prefix.is_empty() {
                            tline.insert_front(TaggedString {
                                s: prefix.to_string(),
                                tag: tag.clone(),
                            });
                        }
                        RenderLine::Text(tline)
                    }
                    RenderLine::Line(l) => {
                        let mut tline = TaggedLine::new();
                        tline.push(Str(TaggedString {
                            s: prefix.to_string(),
                            tag: tag.clone(),
                        }));
                        tline.push(Str(TaggedString {
                            s: l.into_string(),
                            tag: tag.clone(),
                        }));
                        RenderLine::Text(tline)
                    }
                });
            self.lines.extend(prefixed_lines);
        }
    
        fn append_columns_with_borders(&mut self, cols: I, collapse: bool)
        where
            I: IntoIterator,
            Self: Sized,
        {
            use self::TaggedLineElement::Str;
            html_trace!("append_columns_with_borders(collapse={})", collapse);
            html_trace!("self=\n{}", self.to_string());
    
            self.flush_wrapping();
    
            let mut tot_width = 0;
    
            let mut line_sets = cols
                .into_iter()
                .map(|sub_r| {
                    let width = sub_r.width;
                    tot_width += width;
                    html_trace!("Adding column:\n{}", sub_r.to_string());
                    (
                        width,
                        sub_r
                            .into_lines()
                            .into_iter()
                            .map(|mut line| {
                                match line {
                                    RenderLine::Text(ref mut tline) => {
                                        tline.pad_to(width);
                                    }
                                    RenderLine::Line(ref mut border) => {
                                        border.stretch_to(width);
                                    }
                                }
                                line
                            })
                            .collect(),
                    )
                })
                .collect::>)>>();
    
            tot_width += line_sets.len().saturating_sub(1);
    
            let mut next_border = BorderHoriz::new(tot_width);
    
            // Join the vertical lines to all the borders
            {
                let mut pos = 0;
                if let &mut RenderLine::Line(ref mut prev_border) = self.lines.back_mut().unwrap() {
                    html_trace!("Merging with last line:\n{}", prev_border.to_string());
                    for &(w, _) in &line_sets[..line_sets.len() - 1] {
                        html_trace!("pos={}, w={}", pos, w);
                        prev_border.join_below(pos + w);
                        next_border.join_above(pos + w);
                        pos += w + 1;
                    }
                } else {
                    panic!("Expected a border line");
                }
            }
    
            // If we're collapsing bottom borders, then the bottom border of a
            // nested table is being merged into the bottom border of the
            // containing cell.  If that cell happens not to be the tallest
            // cell in the row, then we need to extend any vertical lines
            // to the bottom.  We'll remember what to do when we update the
            // containing border.
            let mut column_padding = vec![None; line_sets.len()];
    
            // If we're collapsing borders, do so.
            if collapse {
                html_trace!("Collapsing borders.");
                /* Collapse any top border */
                let mut pos = 0;
                for &mut (w, ref mut sublines) in &mut line_sets {
                    let starts_border = if sublines.len() > 0 {
                        if let RenderLine::Line(_) = sublines[0] {
                            true
                        } else {
                            false
                        }
                    } else {
                        false
                    };
                    if starts_border {
                        html_trace!("Starts border");
                        if let &mut RenderLine::Line(ref mut prev_border) =
                            self.lines.back_mut().expect("No previous line")
                        {
                            if let RenderLine::Line(line) = sublines.remove(0) {
                                html_trace!(
                                    "prev border:\n{}\n, pos={}, line:\n{}",
                                    prev_border.to_string(),
                                    pos,
                                    line.to_string()
                                );
                                prev_border.merge_from_below(&line, pos);
                            }
                        } else {
                            unreachable!();
                        }
                    }
                    pos += w + 1;
                }
    
                /* Collapse any bottom border */
                let mut pos = 0;
                for (col_no, &mut (w, ref mut sublines)) in line_sets.iter_mut().enumerate() {
                    let ends_border = if sublines.len() > 0 {
                        if let Some(&RenderLine::Line(_)) = sublines.last() {
                            true
                        } else {
                            false
                        }
                    } else {
                        false
                    };
                    if ends_border {
                        html_trace!("Ends border");
                        if let RenderLine::Line(line) = sublines.pop().unwrap() {
                            next_border.merge_from_above(&line, pos);
                            column_padding[col_no] = Some(line.to_vertical_lines_above())
                        }
                    }
                    pos += w + 1;
                }
            }
    
            let cell_height = line_sets
                .iter()
                .map(|&(_, ref v)| v.len())
                .max()
                .unwrap_or(0);
            let spaces: String = (0..tot_width).map(|_| ' ').collect();
            let last_cellno = line_sets.len() - 1;
            for i in 0..cell_height {
                let mut line = TaggedLine::new();
                for (cellno, &mut (width, ref mut ls)) in line_sets.iter_mut().enumerate() {
                    if let Some(piece) = ls.get_mut(i) {
                        match piece {
                            &mut RenderLine::Text(ref mut tline) => {
                                line.consume(tline);
                            }
                            &mut RenderLine::Line(ref bord) => {
                                line.push(Str(TaggedString {
                                    s: bord.to_string(),
                                    tag: self.ann_stack.clone(),
                                }));
                            }
                        };
                    } else {
                        line.push(Str(TaggedString {
                            s: column_padding[cellno]
                                .as_ref()
                                .map(|s| s.clone())
                                .unwrap_or_else(|| spaces[0..width].to_string()),
    
                            tag: self.ann_stack.clone(),
                        }));
                    }
                    if cellno != last_cellno {
                        line.push_char('│', &self.ann_stack);
                    }
                }
                self.lines.push_back(RenderLine::Text(line));
            }
            self.lines.push_back(RenderLine::Line(next_border));
        }
    
        /// Append a row whose cells are stacked vertically rather than laid
        /// out side by side.
        ///
        /// Each item of `cols` is appended with `append_subrender` using empty
        /// prefixes.  A `BorderSegHoriz::StraightVert` border of this
        /// renderer's width is inserted between consecutive cells, and a
        /// normal horizontal border closes the row.
        fn append_vert_row<I>(&mut self, cols: I)
        where
            I: IntoIterator<Item = Self>,
            Self: Sized,
        {
            html_trace!("append_vert_row()");
            html_trace!("self=\n{}", self.to_string());

            self.flush_wrapping();

            let width = self.width();

            for (idx, col) in cols.into_iter().enumerate() {
                // Separator before every cell except the first.
                if idx > 0 {
                    let border = BorderHoriz::new_type(width, BorderSegHoriz::StraightVert);
                    self.add_horizontal_line(border);
                }
                self.append_subrender(col, std::iter::repeat(""));
            }
            self.add_horizontal_border();
        }
    
        /// Return true when no lines have been emitted and the wrapping block
        /// (if there is one) reports itself empty.
        fn empty(&self) -> bool {
            self.lines.is_empty()
                && self
                    .wrapping
                    .as_ref()
                    .map_or(true, |wrapping| wrapping.is_empty())
        }
    
        fn text_len(&self) -> usize {
            let mut result = 0;
            for line in &self.lines {
                result += match *line {
                    RenderLine::Text(ref tline) => tline.width(),
                    RenderLine::Line(_) => 0, // FIXME: should borders count?
                };
            }
            if let Some(ref w) = self.wrapping {
                result += w.text_len();
            }
            result
        }
    
        /// Begin a link to `target`: push the decorator's link annotation and
        /// add its opening text.  Does nothing when there is no decorator.
        fn start_link(&mut self, target: &str) {
            if let Some(decorator) = self.decorator.as_mut() {
                let (opening, annotation) = decorator.decorate_link_start(target);
                self.ann_stack.push(annotation);
                self.add_inline_text(&opening);
            }
        }
        /// Finish a link: add the decorator's closing text, then pop the
        /// annotation.  Does nothing when there is no decorator.
        fn end_link(&mut self) {
            if let Some(decorator) = self.decorator.as_mut() {
                let closing = decorator.decorate_link_end();
                self.add_inline_text(&closing);
                self.ann_stack.pop();
            }
        }
        /// Begin emphasised text: push the decorator's emphasis annotation and
        /// add its opening text.  Does nothing when there is no decorator.
        fn start_emphasis(&mut self) {
            if let Some(decorator) = self.decorator.as_mut() {
                let (opening, annotation) = decorator.decorate_em_start();
                self.ann_stack.push(annotation);
                self.add_inline_text(&opening);
            }
        }
        /// Finish emphasised text: add the decorator's closing text, then pop
        /// the annotation.  Does nothing when there is no decorator.
        fn end_emphasis(&mut self) {
            if let Some(decorator) = self.decorator.as_mut() {
                let closing = decorator.decorate_em_end();
                self.add_inline_text(&closing);
                self.ann_stack.pop();
            }
        }
        /// Begin strong text: push the decorator's strong annotation and add
        /// its opening text.  Does nothing when there is no decorator.
        fn start_strong(&mut self) {
            if let Some(decorator) = self.decorator.as_mut() {
                let (opening, annotation) = decorator.decorate_strong_start();
                self.ann_stack.push(annotation);
                self.add_inline_text(&opening);
            }
        }
        /// Finish strong text: add the decorator's closing text, then pop the
        /// annotation.  Does nothing when there is no decorator.
        fn end_strong(&mut self) {
            if let Some(decorator) = self.decorator.as_mut() {
                let closing = decorator.decorate_strong_end();
                self.add_inline_text(&closing);
                self.ann_stack.pop();
            }
        }
        /// Begin strikeout text.
        ///
        /// If there is a decorator, its strikeout annotation is pushed and its
        /// opening text added.  In every case `filter_text_strikeout` is then
        /// pushed onto the text filter stack, so the opening text itself is
        /// not passed through that filter.
        fn start_strikeout(&mut self) {
            if let Some(decorator) = self.decorator.as_mut() {
                let (opening, annotation) = decorator.decorate_strikeout_start();
                self.ann_stack.push(annotation);
                self.add_inline_text(&opening);
            }
            self.text_filter_stack.push(filter_text_strikeout);
        }
        /// Finish strikeout text.
        ///
        /// The top text filter is popped first, so the decorator's closing
        /// text is added without it; the annotation is popped afterwards.
        ///
        /// # Panics
        ///
        /// Panics if the text filter stack is empty.
        fn end_strikeout(&mut self) {
            self.text_filter_stack.pop().unwrap();
            if let Some(decorator) = self.decorator.as_mut() {
                let closing = decorator.decorate_strikeout_end();
                self.add_inline_text(&closing);
                self.ann_stack.pop();
            }
        }
        /// Begin inline code: push the decorator's code annotation and add its
        /// opening text.  Does nothing when there is no decorator.
        fn start_code(&mut self) {
            if let Some(decorator) = self.decorator.as_mut() {
                let (opening, annotation) = decorator.decorate_code_start();
                self.ann_stack.push(annotation);
                self.add_inline_text(&opening);
            }
        }
        /// Finish inline code: add the decorator's closing text, then pop the
        /// annotation.  Does nothing when there is no decorator.
        fn end_code(&mut self) {
            if let Some(decorator) = self.decorator.as_mut() {
                let closing = decorator.decorate_code_end();
                self.add_inline_text(&closing);
                self.ann_stack.pop();
            }
        }
        /// Add an image placeholder built by the decorator from `title`.
        ///
        /// The decorator's annotation is on the stack only while the image
        /// text is added.  Does nothing when there is no decorator.
        fn add_image(&mut self, title: &str) {
            if let Some(decorator) = self.decorator.as_mut() {
                let (text, tag) = decorator.decorate_image(title);
                self.ann_stack.push(tag);
                self.add_inline_text(&text);
                self.ann_stack.pop();
            }
        }
    
        /// Return the decorator's prefix for a header of the given `level`,
        /// or an empty string when there is no decorator.
        fn header_prefix(&mut self, level: usize) -> String {
            self.decorator
                .as_mut()
                .map_or_else(String::new, |d| d.header_prefix(level))
        }
    
        /// Return the decorator's prefix for quoted lines, or an empty string
        /// when there is no decorator.
        fn quote_prefix(&mut self) -> String {
            self.decorator
                .as_mut()
                .map_or_else(String::new, |d| d.quote_prefix())
        }
    
        /// Return the decorator's prefix for an unordered list item, or an
        /// empty string when there is no decorator.
        fn unordered_item_prefix(&mut self) -> String {
            self.decorator
                .as_mut()
                .map_or_else(String::new, |d| d.unordered_item_prefix())
        }
    
        /// Return the decorator's prefix for ordered list item number `i`, or
        /// an empty string when there is no decorator.
        fn ordered_item_prefix(&mut self, i: i64) -> String {
            self.decorator
                .as_mut()
                .map_or_else(String::new, |d| d.ordered_item_prefix(i))
        }
    
        /// Record the start of the fragment named `fragname` by adding a
        /// `FragmentStart` element to the wrapping block, which is created
        /// first if necessary.
        fn record_frag_start(&mut self, fragname: &str) {
            use self::TaggedLineElement::FragmentStart;

            let marker = FragmentStart(fragname.to_string());
            self.ensure_wrapping_exists();
            self.wrapping.as_mut().unwrap().add_element(marker);
        }
    }
    
    /// A decorator for use with `TextRenderer` which outputs plain UTF-8 text
    /// with no annotations.  Markup is rendered as text characters or footnotes.
    #[derive(Clone, Debug)]
    pub struct PlainDecorator {
        /// Link targets in the order they were encountered; each becomes a
        /// numbered footnote when the decorator is finalised.
        links: Vec<String>,
    }

    impl PlainDecorator {
        /// Create a new `PlainDecorator` with no recorded links.
        #[cfg_attr(feature = "clippy", allow(new_without_default_derive))]
        pub fn new() -> PlainDecorator {
            PlainDecorator { links: Vec::new() }
        }
    }
    
    impl TextDecorator for PlainDecorator {
        type Annotation = ();
    
        fn decorate_link_start(&mut self, url: &str) -> (String, Self::Annotation) {
            self.links.push(url.to_string());
            ("[".to_string(), ())
        }
    
        fn decorate_link_end(&mut self) -> String {
            format!("][{}]", self.links.len())
        }
    
        fn decorate_em_start(&mut self) -> (String, Self::Annotation) {
            ("*".to_string(), ())
        }
    
        fn decorate_em_end(&mut self) -> String {
            "*".to_string()
        }
    
        fn decorate_strong_start(&mut self) -> (String, Self::Annotation) {
            ("**".to_string(), ())
        }
    
        fn decorate_strong_end(&mut self) -> String {
            "**".to_string()
        }
    
        fn decorate_strikeout_start(&mut self) -> (String, Self::Annotation) {
            ("".to_string(), ())
        }
    
        fn decorate_strikeout_end(&mut self) -> String {
            "".to_string()
        }
    
        fn decorate_code_start(&mut self) -> (String, Self::Annotation) {
            ("`".to_string(), ())
        }
    
        fn decorate_code_end(&mut self) -> String {
            "`".to_string()
        }
    
        fn decorate_preformat_first(&mut self) -> Self::Annotation {
            ()
        }
        fn decorate_preformat_cont(&mut self) -> Self::Annotation {
            ()
        }
    
        fn decorate_image(&mut self, title: &str) -> (String, Self::Annotation) {
            (format!("[{}]", title), ())
        }
    
        fn header_prefix(&mut self, level: usize) -> String {
            "#".repeat(level) + " "
        }
    
        fn quote_prefix(&mut self) -> String {
            "> ".to_string()
        }
    
        fn unordered_item_prefix(&mut self) -> String {
            "* ".to_string()
        }
    
        fn ordered_item_prefix(&mut self, i: i64) -> String {
            format!("{}. ", i)
        }
    
        fn finalise(self) -> Vec> {
            self.links
                .into_iter()
                .enumerate()
                .map(|(idx, s)| TaggedLine::from_string(format!("[{}]: {}", idx + 1, s), &()))
                .collect()
        }
    
        fn make_subblock_decorator(&self) -> Self {
            PlainDecorator::new()
        }
    }
    
    /// A decorator for use with `TextRenderer` which emits only the literal
    /// text as plain UTF-8: no annotations and no markup characters.
    #[derive(Clone, Debug)]
    pub struct TrivialDecorator {}

    impl TrivialDecorator {
        /// Create a new `TrivialDecorator`.
        #[cfg_attr(feature = "clippy", allow(new_without_default_derive))]
        pub fn new() -> TrivialDecorator {
            Self {}
        }
    }
    
    impl TextDecorator for TrivialDecorator {
        type Annotation = ();
    
        fn decorate_link_start(&mut self, _url: &str) -> (String, Self::Annotation) {
            ("".to_string(), ())
        }
    
        fn decorate_link_end(&mut self) -> String {
            "".to_string()
        }
    
        fn decorate_em_start(&mut self) -> (String, Self::Annotation) {
            ("".to_string(), ())
        }
    
        fn decorate_em_end(&mut self) -> String {
            "".to_string()
        }
    
        fn decorate_strong_start(&mut self) -> (String, Self::Annotation) {
            ("".to_string(), ())
        }
    
        fn decorate_strong_end(&mut self) -> String {
            "".to_string()
        }
    
        fn decorate_strikeout_start(&mut self) -> (String, Self::Annotation) {
            ("".to_string(), ())
        }
    
        fn decorate_strikeout_end(&mut self) -> String {
            "".to_string()
        }
    
        fn decorate_code_start(&mut self) -> (String, Self::Annotation) {
            ("".to_string(), ())
        }
    
        fn decorate_code_end(&mut self) -> String {
            "".to_string()
        }
    
        fn decorate_preformat_first(&mut self) -> Self::Annotation {
            ()
        }
        fn decorate_preformat_cont(&mut self) -> Self::Annotation {
            ()
        }
    
        fn decorate_image(&mut self, title: &str) -> (String, Self::Annotation) {
            // FIXME: this should surely be the alt text, not the title text
            (title.to_string(), ())
        }
    
        fn header_prefix(&mut self, _level: usize) -> String {
            "".to_string()
        }
    
        fn quote_prefix(&mut self) -> String {
            "".to_string()
        }
    
        fn unordered_item_prefix(&mut self) -> String {
            "".to_string()
        }
    
        fn ordered_item_prefix(&mut self, _i: i64) -> String {
            "".to_string()
        }
    
        fn finalise(self) -> Vec> {
            Vec::new()
        }
    
        fn make_subblock_decorator(&self) -> Self {
            TrivialDecorator::new()
        }
    }
    
    /// A decorator to generate rich text (styled) rather than
    /// pure text output.
    ///
    /// The decorator itself holds no state; styling information is carried
    /// by the `RichAnnotation` values it attaches to the text.
    #[derive(Clone, Debug)]
    pub struct RichDecorator {}
    
    /// Annotation type for "rich" text.  Each piece of text is associated
    /// with a set of these.
    #[derive(PartialEq, Eq, Clone, Debug)]
    pub enum RichAnnotation {
        /// Normal text.
        Default,
        /// A link, together with its target.
        Link(String),
        /// An image (attached to the title text).
        Image,
        /// Emphasised text, which might be rendered in bold or another colour.
        Emphasis,
        /// Strong text, which might be rendered in bold or another colour.
        Strong,
        /// Strikeout text.
        Strikeout,
        /// Code.
        Code,
        /// Preformatted; true if a continuation line for an overly-long line.
        Preformat(bool),
    }

    impl Default for RichAnnotation {
        /// The default annotation is `RichAnnotation::Default` (normal text).
        fn default() -> Self {
            Self::Default
        }
    }
    
    impl RichDecorator {
        /// Create a new `RichDecorator`.
        #[cfg_attr(feature = "clippy", allow(new_without_default_derive))]
        pub fn new() -> RichDecorator {
            RichDecorator {}
        }
    }
    
    impl TextDecorator for RichDecorator {
        type Annotation = RichAnnotation;
    
        fn decorate_link_start(&mut self, url: &str) -> (String, Self::Annotation) {
            ("".to_string(), RichAnnotation::Link(url.to_string()))
        }
    
        fn decorate_link_end(&mut self) -> String {
            "".to_string()
        }
    
        fn decorate_em_start(&mut self) -> (String, Self::Annotation) {
            ("".to_string(), RichAnnotation::Emphasis)
        }
    
        fn decorate_em_end(&mut self) -> String {
            "".to_string()
        }
    
        fn decorate_strong_start(&mut self) -> (String, Self::Annotation) {
            ("*".to_string(), RichAnnotation::Strong)
        }
    
        fn decorate_strong_end(&mut self) -> String {
            "*".to_string()
        }
    
        fn decorate_strikeout_start(&mut self) -> (String, Self::Annotation) {
            ("".to_string(), RichAnnotation::Strikeout)
        }
    
        fn decorate_strikeout_end(&mut self) -> String {
            "".to_string()
        }
    
        fn decorate_code_start(&mut self) -> (String, Self::Annotation) {
            ("`".to_string(), RichAnnotation::Code)
        }
    
        fn decorate_code_end(&mut self) -> String {
            "`".to_string()
        }
    
        fn decorate_preformat_first(&mut self) -> Self::Annotation {
            RichAnnotation::Preformat(false)
        }
    
        fn decorate_preformat_cont(&mut self) -> Self::Annotation {
            RichAnnotation::Preformat(true)
        }
    
        fn decorate_image(&mut self, title: &str) -> (String, Self::Annotation) {
            (title.to_string(), RichAnnotation::Image)
        }
    
        fn header_prefix(&mut self, level: usize) -> String {
            "#".repeat(level) + " "
        }
    
        fn quote_prefix(&mut self) -> String {
            "> ".to_string()
        }
    
        fn unordered_item_prefix(&mut self) -> String {
            "* ".to_string()
        }
    
        fn ordered_item_prefix(&mut self, i: i64) -> String {
            format!("{}. ", i)
        }
    
        fn finalise(self) -> Vec> {
            Vec::new()
        }
    
        fn make_subblock_decorator(&self) -> Self {
            RichDecorator::new()
        }
    }
    html2text-0.4.4/src/tests.rs000064400000000000000000000577371046102023000141060ustar  00000000000000use super::render::text_renderer::{RichAnnotation, TaggedLine, TrivialDecorator};
    use super::{from_read, from_read_with_decorator, parse, TextDecorator};
    
    /// Like assert_eq!(), but prints out the results normally as well
    ///
    /// Each argument expression is evaluated exactly once.  When the two
    /// values differ they are first printed with `{}` (so multi-line strings
    /// stay readable) and then handed to `assert_eq!`, which fails the test.
    macro_rules! assert_eq_str {
        ($a:expr, $b:expr) => {
            // Borrow both operands once so that neither the comparison, the
            // printing nor the final assertion re-evaluates the expressions.
            match (&$a, &$b) {
                (left, right) => {
                    if *left != *right {
                        println!("<<<\n{}===\n{}>>>", left, right);
                        assert_eq!(*left, *right);
                    }
                }
            }
        };
    }
    fn test_html(input: &[u8], expected: &str, width: usize) {
        assert_eq_str!(from_read(input, width), expected);
    }
    
    /// Render `input` with `from_read_with_decorator` using `decorator` at
    /// the given `width` and check that the result equals `expected`.
    fn test_html_decorator<D>(input: &[u8], expected: &str, width: usize, decorator: D)
    where
        D: TextDecorator,
    {
        let output = from_read_with_decorator(input, width, decorator);
        assert_eq_str!(output, expected);
    }
    
    #[test]
    fn test_table() {
        test_html(
            br##"
       
  • "); result.push_str(cell); result.push_str("
    1 2 3
    "##, r#"─┬─┬─ 1│2│3 ─┴─┴─ "#, 12, ); } #[test] fn test_table2() { test_html( br##"
    1 2 3
    4 5 6
    "##, r#"─┬─┬─ 1│2│3 ─┼─┼─ 4│5│6 ─┴─┴─ "#, 12, ); } #[test] fn test_thead() { test_html( br##"
    Col1 Col2 Col3
    1 2 3
    "##, r#"────┬────┬──── Col1│Col2│Col3 ────┼────┼──── 1 │2 │3 ────┴────┴──── "#, 15, ); } #[test] fn test_colspan() { test_html( br##"
    1 2 3
    12 3
    1 23
    "##, r#"─┬─┬─ 1│2│3 ─┴─┼─ 12 │3 ─┬─┴─ 1│23 ─┴─── "#, 12, ); } #[test] fn test_para() { assert_eq_str!(from_read(&b"

    Hello

    "[..], 10), "Hello\n"); } #[test] fn test_para2() { assert_eq_str!( from_read(&b"

    Hello, world!

    "[..], 20), "Hello, world!\n" ); } #[test] fn test_blockquote() { assert_eq_str!( from_read( &br#"

    Hello

    One, two, three

    foo

    "#[..], 12 ), r#"Hello > One, two, > three foo "# ); } #[test] fn test_ul() { test_html( br#"
    • Item one
    • Item two
    • Item three
    "#, r#"* Item one * Item two * Item three "#, 10, ); } #[test] fn test_ol1() { test_html( br#"
    1. Item one
    2. Item two
    3. Item three
    "#, r#"1. Item one 2. Item two 3. Item three "#, 11, ); } #[test] fn test_ol2() { test_html( br#"
    1. Item one
    2. Item two
    3. Item three
    4. Item four
    5. Item five
    6. Item six
    7. Item seven
    8. Item eight
    9. Item nine
    10. Item ten
    "#, r#"1. Item one 2. Item two 3. Item three 4. Item four 5. Item five 6. Item six 7. Item seven 8. Item eight 9. Item nine 10. Item ten "#, 20, ); } #[test] fn test_ol_start() { test_html( br#"
    1. Item three
    2. Item four
    "#, r#"3. Item three 4. Item four "#, 20, ); } #[test] fn test_ol_start_9() { test_html( br#"
    1. Item nine
    2. Item ten
    "#, r#"9. Item nine 10. Item ten "#, 20, ); } #[test] fn test_ol_start_neg() { test_html( br#"
    1. Item minus one
    2. Item zero
    3. Item one
    "#, r#"-1. Item minus one 0. Item zero 1. Item one "#, 20, ); } #[test] fn test_strip_nl() { test_html( br#"

    One Two Three

    "#, "One Two Three\n", 40, ); } #[test] fn test_strip_nl2() { test_html( br#"

    One Two Three

    "#, "One Two Three\n", 40, ); } #[test] fn test_strip_nl_tbl() { test_html( br#"
    One Two Three
    "#, r"────────────── One Two Three ────────────── ", 20, ); } #[test] fn test_unknown_element() { test_html( br#"
    One Two Three
    "#, r"────────────── One Two Three ────────────── ", 20, ); } #[test] fn test_strip_nl_tbl_p() { test_html( br#"

    One Two Three

    "#, r"────────────── One Two Three ────────────── ", 20, ); } #[test] fn test_pre() { test_html( br#"
    foo
    bar
    wib   asdf;
    

    Hello

    "#, r"foo bar wib asdf; Hello ", 20, ); } #[test] fn test_link() { test_html( br#"

    Hello, world

    "#, r"Hello, [world][1] [1]: http://www.example.com/ ", 80, ); } #[test] fn test_link2() { test_html( br#"

    Hello, world!

    "#, r"Hello, [world][1]! [1]: http://www.example.com/ ", 80, ); } #[test] fn test_link3() { test_html( br#"

    Hello, world

    "#, r"Hello, [w][1]orld [1]: http://www.example.com/ ", 80, ); } #[test] fn test_link_wrap() { test_html( br#" Hello"#, r"[Hello][1] [1]: http: //www.exam ple.com/ ", 10, ); } #[test] fn test_wrap() { test_html( br"

    Hello, world. Superlongwordreally

    ", r#"Hello, world. Superlon gwordrea lly "#, 8, ); } /* Word-wraps a full sentence at width 20. */ #[test] fn test_wrap2() { test_html( br"

    Hello, world. This is a long sentence with a few words, which we want to be wrapped correctly.

    ", r#"Hello, world. This is a long sentence with a few words, which we want to be wrapped correctly. "#, 20, ); } /* Width 25 with link text that is itself a URL: the expected output places the closing ][1] after a break following the URL. */ #[test] fn test_wrap3() { test_html( br#"

    http://example.org/blah/ one two three"#, r#"[http://example.org/blah/ ][1] one two three [1]: dest "#, 25, ); } /* Two cases: Hello followed by Div, then Hello followed by Div and Div2 (both width 20). */ #[test] fn test_div() { test_html( br"

    Hello

    Div
    ", r#"Hello Div "#, 20, ); test_html( br"

    Hello

    Div
    Div2
    ", r#"Hello Div Div2 "#, 20, ); } /* An image is expected to appear as bracketed text: Hello [world]. NOTE(review): presumably the bracketed word is the alt text - confirm. */ #[test] fn test_img_alt() { test_html( br"

    Hello world

    ", "Hello [world]\n", 80, ); } /* Expects Hello and World on consecutive lines (Hello\nWorld\n). */ #[test] fn test_br() { test_html(br"

    Hello
    World

    ", "Hello\nWorld\n", 20); } /* Expects an empty line between Hello and World (Hello\n\nWorld\n). */ #[test] fn test_br2() { test_html(br"

    Hello

    World

    ", "Hello\n\nWorld\n", 20); } /* Same expected output as test_br2. */ #[test] fn test_br3() { test_html(br"

    Hello

    World

    ", "Hello\n\nWorld\n", 20); } /* Expects the link reference [1] to be listed after the bullet items, which render with a * prefix (width 80). */ #[test] fn test_subblock() { test_html( br#"
    Here's a link.
    • Bullet
    • Bullet
    • Bullet
    "#, r"Here's a [link][1]. * Bullet * Bullet * Bullet [1]: https://example.com/ ", 80, ); } /* U+0080 is expected to vanish from the output and, per the width-4 cases, not to count when wrapping (FooB then ar). */ #[test] fn test_controlchar() { test_html("Foo\u{0080}Bar".as_bytes(), "FooBar\n", 80); test_html("Foo\u{0080}Bar".as_bytes(), "FooB\nar\n", 4); test_html("FooBa\u{0080}r".as_bytes(), "FooB\nar\n", 4); } /* Expects a flat grid of 9 columns by 3 rows with box-drawing borders (width 21). NOTE(review): by its name the input nests tables - confirm. */ #[test] fn test_nested_table_1() { test_html( br##"
    123
    456
    789
    123
    456
    789
    123
    456
    789
    "##, r#"─┬─┬─┬─┬─┬─┬─┬─┬─ 1│2│3│4│5│6│7│8│9 ─┼─┼─┼─┼─┼─┼─┼─┼─ 1│2│3│4│5│6│7│8│9 ─┼─┼─┼─┼─┼─┼─┼─┼─ 1│2│3│4│5│6│7│8│9 ─┴─┴─┴─┴─┴─┴─┴─┴─ "#, 21, ); } /* Width 11. Expected columns: 1 and 2 in the first, a and b in the second, and one to five stacked in the third. */ #[test] fn test_nested_table_2() { test_html( br##"
    1a
    2b
    one
    two
    three
    four
    five
    
    "##, r#"─┬─┬──────── 1│a│one ─┼─│two 2│b│three │ │four │ │five ─┴─┴──────── "#, 11, ); } /* A level-1 heading is expected to be prefixed with a single # (# Hi), followed by foo (width 21). */ #[test] fn test_h1() { test_html( br##"

    Hi

    foo

    "##, r#"# Hi foo "#, 21, ); } /* A level-3 heading is expected to be prefixed with ### (width 21). */ #[test] fn test_h3() { test_html( br##"

    Hi

    foo

    "##, r#"### Hi foo "#, 21, ); } /* The input line Hello  sp contains two consecutive spaces; per the note that follows, spacing is expected to survive rendering (width 21). */ // General test that spacing is preserved #[test] fn test_pre2() { test_html( br##"
    Hello  sp
    world
    "##, r#"Hello sp world "#, 21, ); } // Check that spans work correctly inside <pre>
    #[test]
    fn test_pre_span() {
        test_html(
            br##"
    
    Hello $sp
    Hi $foo
    Hi foo, bar
    
    "##, r#"Hello $sp Hi $foo Hi foo, bar "#, 21, ); } // Check tab behaviour #[test] fn test_pre_tab() { test_html(b"
    \tworld
    ", " world\n", 40); test_html(b"
    H\tworld
    ", "H world\n", 40); test_html(b"
    He\tworld
    ", "He world\n", 40); test_html(b"
    Hel\tworld
    ", "Hel world\n", 40); test_html(b"
    Hell\tworld
    ", "Hell world\n", 40); test_html(b"
    Hello\tworld
    ", "Hello world\n", 40); test_html(b"
    Helloo\tworld
    ", "Helloo world\n", 40); test_html(b"
    Hellooo\tworld
    ", "Hellooo world\n", 40); test_html(b"
    Helloooo\tworld
    ", "Helloooo world\n", 40); } /* Emphasis is expected as *em* and strong text as **strong** (width 21). */ #[test] fn test_em_strong() { test_html( br##"

    Hi em strong

    "##, r#"Hi *em* **strong** "#, 21, ); } /* Ignored (marked not yet fixed): the input lines Indented and Indented again carry extra leading characters. NOTE(review): by the test name these are non-breaking spaces - confirm. */ #[test] #[ignore] // Not yet fixed! fn test_nbsp_indent() { test_html( br##"
    Top
     Indented
      Indented again
    "##, r#"Top Indented Indented again "#, 21, ); } /* Builds the input by repeating one fragment 1000 times and expects empty output at width 10. NOTE(review): by its name this guards against trouble with very deep nesting - confirm. */ #[test] fn test_deeply_nested() { use ::std::iter::repeat; let html = repeat("").take(1000).collect::>().concat(); test_html(html.as_bytes(), "", 10); } /* Builds input from rpt = 1000 repetitions of an opening fragment containing hi plus 1000 of a closing fragment; the expected text is assembled from rpt - 3 stacked hi blocks, one two-column piece, and rpt - 3 closing rules (width 10). */ #[test] fn test_deeply_nested_table() { use ::std::iter::repeat; let rpt = 1000; let html = repeat("
    hi") .take(rpt) .collect::>() .concat() + &repeat("
    ") .take(rpt) .collect::>() .concat(); let result = repeat( r#"────────── hi ////////// "#, ) .take(rpt - 3) .collect::>() .concat() + &r#"──┬──── hi│hi │//// │── │hi │── ──┴──── "# + &repeat("──────────\n").take(rpt - 3).collect::(); test_html(html.as_bytes(), &result, 10); } /* Baseline for the id tests that follow: expects hi, world between two rules at width 10. */ #[test] fn test_table_no_id() { let html = r#"
    hi, world
    "#; test_html( html.as_bytes(), r#"───────── hi, world ───────── "#, 10, ); } /* Same expected output as test_table_no_id. NOTE(review): by its name the id attribute sits on the cell - confirm. */ #[test] fn test_table_cell_id() { let html = r#"
    hi, world
    "#; test_html( html.as_bytes(), r#"───────── hi, world ───────── "#, 10, ); } /* Same expected output as test_table_no_id. NOTE(review): by its name the id attribute sits on the row - confirm. */ #[test] fn test_table_row_id() { let html = r#"
    hi, world
    "#; test_html( html.as_bytes(), r#"───────── hi, world ───────── "#, 10, ); } /* Same expected output as test_table_no_id. NOTE(review): by its name the id attribute sits on the table element - confirm. */ #[test] fn test_table_table_id() { let html = r#"
    hi, world
    "#; test_html( html.as_bytes(), r#"───────── hi, world ───────── "#, 10, ); } /* Same expected output as test_table_no_id. NOTE(review): by its name the id attribute sits on the table body element - confirm. */ #[test] fn test_table_tbody_id() { let html = r#"
    hi, world
    "#; test_html( html.as_bytes(), r#"───────── hi, world ───────── "#, 10, ); } /* Headings at very small widths (7, then 5): the expected output puts one letter of Anything per line. */ #[test] fn test_header_width() { //0 size test_html( br##"

    Anything

    "##, r#"## ### A ## ### n ## ### y ## ### t ## ### h ## ### i ## ### n ## ### g ## ## "#, 7, ); //Underflow test_html( br##"

    Anything

    "##, r#"## ### A ## ### n ## ### y ## ### t ## ### h ## ### i ## ### n ## ### g ## ## "#, 5, ); } /* Renders through test_html_decorator with TrivialDecorator::new(): the expected text has no link reference and no bullet prefixes (width 80). */ #[test] fn test_trivial_decorator() { test_html_decorator( br#"
    Here's a link.
    • Bullet
    • Bullet
    • Bullet
    "#, r"Here's a link. Bullet Bullet Bullet ", 80, TrivialDecorator::new(), ); } /* Expects empty output at width 10. NOTE(review): presumably a regression test for the issue in the name - confirm. */ #[test] fn test_issue_16() { test_html(b"
    ", "", 10); } /* Expects Foo then Bar (width 10). NOTE(review): by its name a line break inside preformatted text - confirm. */ #[test] fn test_pre_br() { test_html( b"
    Foo
    Bar
    ", r#"Foo Bar "#, 10, ); } /* Expects X followed by a space and a newline (width 10). */ #[test] fn test_pre_emptyline() { test_html(br#"
    X 
    "#, "X \n", 10); } /* Width 10: the linked text quitelongline is longer than the width; expects it split as [quitelong and line][1], with reference [1]: foo. */ #[test] fn test_link_id_longline() { test_html( br#"quitelongline"#, r#"[quitelong line][1] [1]: foo "#, 10, ); } /* Expects the term as *Foo* followed by Definition of foo (width 40). */ #[test] fn test_dl() { test_html( br#"
    Foo
    Definition of foo
    "#, r#"*Foo* Definition of foo "#, 40, ); } /* Struck-out text: expects U+0336 after each of y, o and u. */ #[test] fn test_s() { test_html( br#"Hi youthee!"#, "Hi y\u{336}o\u{336}u\u{336}thee!\n", 40, ); } /* Parses once, then renders clones of the tree with render_plain at widths 80, 70 and 50, checking the wrapping each time. */ #[test] fn test_multi_parse() { let html: &[u8] = b"one two three four five six seven eight nine ten eleven twelve thirteen \ fourteen fifteen sixteen seventeen"; let tree = parse(html); assert_eq!( "one two three four five six seven eight nine ten eleven twelve thirteen fourteen\n\ fifteen sixteen seventeen\n", tree.clone().render_plain(80).into_string() ); assert_eq!( "one two three four five six seven eight nine ten eleven twelve\n\ thirteen fourteen fifteen sixteen seventeen\n", tree.clone().render_plain(70).into_string() ); assert_eq!( "one two three four five six seven eight nine ten\n\ eleven twelve thirteen fourteen fifteen sixteen\n\ seventeen\n", tree.clone().render_plain(50).into_string() ); } /* render_rich(80).into_lines() is expected to yield one TaggedLine *bold* tagged RichAnnotation::Strong. */ #[test] fn test_read_rich() { let html: &[u8] = b"bold"; let lines = parse(html).render_rich(80).into_lines(); let tag = vec![RichAnnotation::Strong]; let line = TaggedLine::from_string("*bold*".to_owned(), &tag); assert_eq!(vec![line], lines); } /* render(80, TrivialDecorator::new()) is expected to yield one TaggedLine bold tagged with the unit value. */ #[test] fn test_read_custom() { let html: &[u8] = b"bold"; let lines = parse(html).render(80, TrivialDecorator::new()).into_lines(); let tag = vec![()]; let line = TaggedLine::from_string("bold".to_owned(), &tag); assert_eq!(vec![line], lines); } /* At width 100 the text is one line tagged Preformat(false); at width 4 testlong splits into test tagged Preformat(false) and long tagged Preformat(true). */ #[test] fn test_pre_rich() { use RichAnnotation::*; assert_eq!( crate::parse("
    test
    ".as_bytes()) .render_rich(100) .into_lines(), [TaggedLine::from_string( "test".into(), &vec![Preformat(false)] )] ); assert_eq!( crate::parse("
    testlong
    ".as_bytes()) .render_rich(4) .into_lines(), [ TaggedLine::from_string("test".into(), &vec![Preformat(false)]), TaggedLine::from_string("long".into(), &vec![Preformat(true)]) ] ); } /* Defines a local TestDecorator whose finalise returns one empty line annotated true, and expects the rendered lines to be test, an empty line, then that annotated line. */ #[test] fn test_finalise() { use crate::render::text_renderer::{TaggedLine, TextDecorator}; #[derive(Clone, Debug)] struct TestDecorator; impl TextDecorator for TestDecorator { type Annotation = bool; fn decorate_link_start(&mut self, _url: &str) -> (String, Self::Annotation) { Default::default() } fn decorate_link_end(&mut self) -> String { Default::default() } fn decorate_em_start(&mut self) -> (String, Self::Annotation) { Default::default() } fn decorate_em_end(&mut self) -> String { Default::default() } fn decorate_strong_start(&mut self) -> (String, Self::Annotation) { Default::default() } fn decorate_strong_end(&mut self) -> String { Default::default() } fn decorate_strikeout_start(&mut self) -> (String, Self::Annotation) { Default::default() } fn decorate_strikeout_end(&mut self) -> String { Default::default() } fn decorate_code_start(&mut self) -> (String, Self::Annotation) { Default::default() } fn decorate_code_end(&mut self) -> String { Default::default() } fn decorate_preformat_first(&mut self) -> Self::Annotation { Default::default() } fn decorate_preformat_cont(&mut self) -> Self::Annotation { Default::default() } fn decorate_image(&mut self, _title: &str) -> (String, Self::Annotation) { Default::default() } fn header_prefix(&mut self, level: usize) -> String { "#".repeat(level) + " " } fn quote_prefix(&mut self) -> String { "> ".to_string() } fn unordered_item_prefix(&mut self) -> String { "* ".to_string() } fn ordered_item_prefix(&mut self, i: i64) -> String { format!("{}. 
", i) } fn finalise(self) -> Vec> { vec![TaggedLine::from_string(String::new(), &true)] } fn make_subblock_decorator(&self) -> Self { TestDecorator } } assert_eq!( crate::parse("test".as_bytes()) .render(80, TestDecorator) .into_lines(), vec![ TaggedLine::from_string("test".to_owned(), &Vec::new()), TaggedLine::new(), TaggedLine::from_string("".to_owned(), &vec![true]), ] ); } /* Expects a grid of 3 columns by 2 rows (width 12). NOTE(review): by its name the input also contains empty rows - confirm. */ #[test] fn test_empty_rows() { test_html( br##"
    1 2 3
    4 5 6
    "##, r#"─┬─┬─ 1│2│3 ─┼─┼─ 4│5│6 ─┴─┴─ "#, 12, ); } /* Expects a grid of 2 columns by 3 rows (width 12). NOTE(review): by its name the input also contains empty columns - confirm. */ #[test] fn test_empty_cols() { test_html( br##"
    1 2
    3 4
    5 6
    "##, r#"─┬─ 1│2 ─┼─ 3│4 ─┼─ 5│6 ─┴─ "#, 12, ); } /* Width 10: expects Blah, blah, blah stacked in the middle column of a bordered table. NOTE(review): presumably a regression test for the issue in the name - confirm. */ #[test] fn test_issue_54_oob() { test_html( br##"
     
    Blah blah blah
     
    "##, r#"─┬────────┬ │Blah │ │blah │ │blah │ ─┴────────┴ "#, 10, ); } /* Width 5: the cells wid, kin and der are expected one above the other, separated by lines of slashes. */ #[test] fn test_table_vertical_rows() { test_html( br##"
    wid kin der
    "##, "───── wid ///// kin ///// der ───── ", 5, ); } /* Width 15: two cells of non-ASCII text and one of aaa are expected side by side in one row. */ #[test] fn test_unicode() { test_html( "
    နတမစ နတမစ aaa
    " .as_bytes(), "────┬────┬─── နတမစ│နတမစ│aaa ────┴────┴─── ", 15, ); } /* Width 6: a numbered list from 1. to 11. is expected between two rules. */ #[test] fn test_list_in_table() { test_html( b"
    1. 0
    2. 1
    3. 2
    4. 3
    5. 4
    6. 5
    7. 6
    8. 7
    9. 8
    10. 9
    11. 10
    ", "────── 1. 0 2. 1 3. 2 4. 3 5. 4 6. 5 7. 6 8. 7 9. 8 10. 9 11. 10 ────── ", 6, ); } /* Renders with width usize::MAX through from_read_with_decorator and a PlainDecorator; the result is only printed, so the test passes as long as nothing panics. */ #[test] fn test_max_width() { let html = r#"

    3,266

    "#; let decorator = crate::render::text_renderer::PlainDecorator::new(); let text = from_read_with_decorator(html.as_bytes(), usize::MAX, decorator.clone()); println!("{}", text); }